genattrtab.c (write_header): Include hash-set.h...
[gcc.git] / gcc / tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "hash-set.h"
28 #include "machmode.h"
29 #include "vec.h"
30 #include "double-int.h"
31 #include "input.h"
32 #include "alias.h"
33 #include "symtab.h"
34 #include "wide-int.h"
35 #include "inchash.h"
36 #include "tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tm_p.h"
40 #include "target.h"
41 #include "predict.h"
42 #include "hard-reg-set.h"
43 #include "input.h"
44 #include "function.h"
45 #include "dominance.h"
46 #include "cfg.h"
47 #include "basic-block.h"
48 #include "gimple-pretty-print.h"
49 #include "tree-ssa-alias.h"
50 #include "internal-fn.h"
51 #include "tree-eh.h"
52 #include "gimple-expr.h"
53 #include "is-a.h"
54 #include "gimple.h"
55 #include "gimplify.h"
56 #include "gimple-iterator.h"
57 #include "gimplify-me.h"
58 #include "gimple-ssa.h"
59 #include "tree-phinodes.h"
60 #include "ssa-iterators.h"
61 #include "stringpool.h"
62 #include "tree-ssanames.h"
63 #include "tree-ssa-loop-ivopts.h"
64 #include "tree-ssa-loop-manip.h"
65 #include "tree-ssa-loop.h"
66 #include "dumpfile.h"
67 #include "cfgloop.h"
68 #include "tree-chrec.h"
69 #include "tree-scalar-evolution.h"
70 #include "tree-vectorizer.h"
71 #include "diagnostic-core.h"
72 #include "hash-map.h"
73 #include "plugin-api.h"
74 #include "ipa-ref.h"
75 #include "cgraph.h"
76 /* Need to include rtl.h, expr.h, etc. for optabs. */
77 #include "expr.h"
78 #include "insn-codes.h"
79 #include "optabs.h"
80 #include "builtins.h"
81 #include "varasm.h"
82
83 /* Return true if load- or store-lanes optab OPTAB is implemented for
84 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
85
86 static bool
87 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
88 tree vectype, unsigned HOST_WIDE_INT count)
89 {
90 machine_mode mode, array_mode;
91 bool limit_p;
92
93 mode = TYPE_MODE (vectype);
94 limit_p = !targetm.array_mode_supported_p (mode, count);
95 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
96 MODE_INT, limit_p);
97
98 if (array_mode == BLKmode)
99 {
100 if (dump_enabled_p ())
101 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
102 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
103 GET_MODE_NAME (mode), count);
104 return false;
105 }
106
107 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
108 {
109 if (dump_enabled_p ())
110 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
111 "cannot use %s<%s><%s>\n", name,
112 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
113 return false;
114 }
115
116 if (dump_enabled_p ())
117 dump_printf_loc (MSG_NOTE, vect_location,
118 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
119 GET_MODE_NAME (mode));
120
121 return true;
122 }
123
124
125 /* Return the smallest scalar part of STMT.
126 This is used to determine the vectype of the stmt. We generally set the
127 vectype according to the type of the result (lhs). For stmts whose
128 result-type is different than the type of the arguments (e.g., demotion,
129 promotion), vectype will be reset appropriately (later). Note that we have
130 to visit the smallest datatype in this function, because that determines the
131 VF. If the smallest datatype in the loop is present only as the rhs of a
132 promotion operation - we'd miss it.
133 Such a case, where a variable of this datatype does not appear in the lhs
134 anywhere in the loop, can only occur if it's an invariant: e.g.:
135 'int_x = (int) short_inv', which we'd expect to have been optimized away by
136 invariant motion. However, we cannot rely on invariant motion to always
137 take invariants out of the loop, and so in the case of promotion we also
138 have to check the rhs.
139 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
140 types. */
141
142 tree
143 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
144 HOST_WIDE_INT *rhs_size_unit)
145 {
146 tree scalar_type = gimple_expr_type (stmt);
147 HOST_WIDE_INT lhs, rhs;
148
149 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
150
151 if (is_gimple_assign (stmt)
152 && (gimple_assign_cast_p (stmt)
153 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
154 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
155 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
156 {
157 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
158
159 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
160 if (rhs < lhs)
161 scalar_type = rhs_type;
162 }
163
164 *lhs_size_unit = lhs;
165 *rhs_size_unit = rhs;
166 return scalar_type;
167 }
168
169
170 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
171 tested at run-time. Return TRUE if DDR was successfully inserted.
172 Return false if versioning is not supported. */
173
174 static bool
175 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
176 {
177 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
178
179 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
180 return false;
181
182 if (dump_enabled_p ())
183 {
184 dump_printf_loc (MSG_NOTE, vect_location,
185 "mark for run-time aliasing test between ");
186 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
187 dump_printf (MSG_NOTE, " and ");
188 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
189 dump_printf (MSG_NOTE, "\n");
190 }
191
192 if (optimize_loop_nest_for_size_p (loop))
193 {
194 if (dump_enabled_p ())
195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
196 "versioning not supported when optimizing"
197 " for size.\n");
198 return false;
199 }
200
201 /* FORNOW: We don't support versioning with outer-loop vectorization. */
202 if (loop->inner)
203 {
204 if (dump_enabled_p ())
205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
206 "versioning not yet supported for outer-loops.\n");
207 return false;
208 }
209
210 /* FORNOW: We don't support creating runtime alias tests for non-constant
211 step. */
212 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
213 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
214 {
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
217 "versioning not yet supported for non-constant "
218 "step\n");
219 return false;
220 }
221
222 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
223 return true;
224 }
225
226
227 /* Function vect_analyze_data_ref_dependence.
228
229 Return TRUE if there (might) exist a dependence between a memory-reference
230 DRA and a memory-reference DRB. When versioning for alias may check a
231 dependence at run-time, return FALSE. Adjust *MAX_VF according to
232 the data dependence. */
233
234 static bool
235 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
236 loop_vec_info loop_vinfo, int *max_vf)
237 {
238 unsigned int i;
239 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
240 struct data_reference *dra = DDR_A (ddr);
241 struct data_reference *drb = DDR_B (ddr);
242 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
243 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
244 lambda_vector dist_v;
245 unsigned int loop_depth;
246
247 /* In loop analysis all data references should be vectorizable. */
248 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
249 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
250 gcc_unreachable ();
251
252 /* Independent data accesses. */
253 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
254 return false;
255
256 if (dra == drb
257 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
258 return false;
259
260 /* Even if we have an anti-dependence then, as the vectorized loop covers at
261 least two scalar iterations, there is always also a true dependence.
262 As the vectorizer does not re-order loads and stores we can ignore
263 the anti-dependence if TBAA can disambiguate both DRs similar to the
264 case with known negative distance anti-dependences (positive
265 distance anti-dependences would violate TBAA constraints). */
266 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
267 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
268 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
269 get_alias_set (DR_REF (drb))))
270 return false;
271
272 /* Unknown data dependence. */
273 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
274 {
275 /* If user asserted safelen consecutive iterations can be
276 executed concurrently, assume independence. */
277 if (loop->safelen >= 2)
278 {
279 if (loop->safelen < *max_vf)
280 *max_vf = loop->safelen;
281 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
282 return false;
283 }
284
285 if (STMT_VINFO_GATHER_P (stmtinfo_a)
286 || STMT_VINFO_GATHER_P (stmtinfo_b))
287 {
288 if (dump_enabled_p ())
289 {
290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
291 "versioning for alias not supported for: "
292 "can't determine dependence between ");
293 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
294 DR_REF (dra));
295 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
296 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
297 DR_REF (drb));
298 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
299 }
300 return true;
301 }
302
303 if (dump_enabled_p ())
304 {
305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
306 "versioning for alias required: "
307 "can't determine dependence between ");
308 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
309 DR_REF (dra));
310 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
312 DR_REF (drb));
313 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
314 }
315
316 /* Add to list of ddrs that need to be tested at run-time. */
317 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
318 }
319
320 /* Known data dependence. */
321 if (DDR_NUM_DIST_VECTS (ddr) == 0)
322 {
323 /* If user asserted safelen consecutive iterations can be
324 executed concurrently, assume independence. */
325 if (loop->safelen >= 2)
326 {
327 if (loop->safelen < *max_vf)
328 *max_vf = loop->safelen;
329 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
330 return false;
331 }
332
333 if (STMT_VINFO_GATHER_P (stmtinfo_a)
334 || STMT_VINFO_GATHER_P (stmtinfo_b))
335 {
336 if (dump_enabled_p ())
337 {
338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
339 "versioning for alias not supported for: "
340 "bad dist vector for ");
341 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
342 DR_REF (dra));
343 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
344 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
345 DR_REF (drb));
346 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
347 }
348 return true;
349 }
350
351 if (dump_enabled_p ())
352 {
353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
354 "versioning for alias required: "
355 "bad dist vector for ");
356 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
357 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
358 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
359 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
360 }
361 /* Add to list of ddrs that need to be tested at run-time. */
362 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
363 }
364
365 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
366 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
367 {
368 int dist = dist_v[loop_depth];
369
370 if (dump_enabled_p ())
371 dump_printf_loc (MSG_NOTE, vect_location,
372 "dependence distance = %d.\n", dist);
373
374 if (dist == 0)
375 {
376 if (dump_enabled_p ())
377 {
378 dump_printf_loc (MSG_NOTE, vect_location,
379 "dependence distance == 0 between ");
380 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
381 dump_printf (MSG_NOTE, " and ");
382 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
383 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
384 }
385
386 /* When we perform grouped accesses and perform implicit CSE
387 by detecting equal accesses and doing disambiguation with
388 runtime alias tests like for
389 .. = a[i];
390 .. = a[i+1];
391 a[i] = ..;
392 a[i+1] = ..;
393 *p = ..;
394 .. = a[i];
395 .. = a[i+1];
396 where we will end up loading { a[i], a[i+1] } once, make
397 sure that inserting group loads before the first load and
398 stores after the last store will do the right thing.
399 Similar for groups like
400 a[i] = ...;
401 ... = a[i];
402 a[i+1] = ...;
403 where loads from the group interleave with the store. */
404 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
405 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
406 {
407 gimple earlier_stmt;
408 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
409 if (DR_IS_WRITE
410 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
411 {
412 if (dump_enabled_p ())
413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
414 "READ_WRITE dependence in interleaving."
415 "\n");
416 return true;
417 }
418 }
419
420 continue;
421 }
422
423 if (dist > 0 && DDR_REVERSED_P (ddr))
424 {
425 /* If DDR_REVERSED_P the order of the data-refs in DDR was
426 reversed (to make distance vector positive), and the actual
427 distance is negative. */
428 if (dump_enabled_p ())
429 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
430 "dependence distance negative.\n");
431 /* Record a negative dependence distance to later limit the
432 amount of stmt copying / unrolling we can perform.
433 Only need to handle read-after-write dependence. */
434 if (DR_IS_READ (drb)
435 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
436 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
437 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
438 continue;
439 }
440
441 if (abs (dist) >= 2
442 && abs (dist) < *max_vf)
443 {
444 /* The dependence distance requires reduction of the maximal
445 vectorization factor. */
446 *max_vf = abs (dist);
447 if (dump_enabled_p ())
448 dump_printf_loc (MSG_NOTE, vect_location,
449 "adjusting maximal vectorization factor to %i\n",
450 *max_vf);
451 }
452
453 if (abs (dist) >= *max_vf)
454 {
455 /* Dependence distance does not create dependence, as far as
456 vectorization is concerned, in this case. */
457 if (dump_enabled_p ())
458 dump_printf_loc (MSG_NOTE, vect_location,
459 "dependence distance >= VF.\n");
460 continue;
461 }
462
463 if (dump_enabled_p ())
464 {
465 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
466 "not vectorized, possible dependence "
467 "between data-refs ");
468 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
469 dump_printf (MSG_NOTE, " and ");
470 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
471 dump_printf (MSG_NOTE, "\n");
472 }
473
474 return true;
475 }
476
477 return false;
478 }
479
480 /* Function vect_analyze_data_ref_dependences.
481
482 Examine all the data references in the loop, and make sure there do not
483 exist any data dependences between them. Set *MAX_VF according to
484 the maximum vectorization factor the data dependences allow. */
485
486 bool
487 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
488 {
489 unsigned int i;
490 struct data_dependence_relation *ddr;
491
492 if (dump_enabled_p ())
493 dump_printf_loc (MSG_NOTE, vect_location,
494 "=== vect_analyze_data_ref_dependences ===\n");
495
496 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
497 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
498 &LOOP_VINFO_DDRS (loop_vinfo),
499 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
500 return false;
501
502 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
503 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
504 return false;
505
506 return true;
507 }
508
509
510 /* Function vect_slp_analyze_data_ref_dependence.
511
512 Return TRUE if there (might) exist a dependence between a memory-reference
513 DRA and a memory-reference DRB. When versioning for alias may check a
514 dependence at run-time, return FALSE. Adjust *MAX_VF according to
515 the data dependence. */
516
517 static bool
518 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
519 {
520 struct data_reference *dra = DDR_A (ddr);
521 struct data_reference *drb = DDR_B (ddr);
522
523 /* We need to check dependences of statements marked as unvectorizable
524 as well, they still can prohibit vectorization. */
525
526 /* Independent data accesses. */
527 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
528 return false;
529
530 if (dra == drb)
531 return false;
532
533 /* Read-read is OK. */
534 if (DR_IS_READ (dra) && DR_IS_READ (drb))
535 return false;
536
537 /* If dra and drb are part of the same interleaving chain consider
538 them independent. */
539 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
540 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
541 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
542 return false;
543
544 /* Unknown data dependence. */
545 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
546 {
547 if (dump_enabled_p ())
548 {
549 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
550 "can't determine dependence between ");
551 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
552 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
553 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
554 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
555 }
556 }
557 else if (dump_enabled_p ())
558 {
559 dump_printf_loc (MSG_NOTE, vect_location,
560 "determined dependence between ");
561 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
562 dump_printf (MSG_NOTE, " and ");
563 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
564 dump_printf (MSG_NOTE, "\n");
565 }
566
567 /* We do not vectorize basic blocks with write-write dependencies. */
568 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
569 return true;
570
571 /* If we have a read-write dependence check that the load is before the store.
572 When we vectorize basic blocks, vector load can be only before
573 corresponding scalar load, and vector store can be only after its
574 corresponding scalar store. So the order of the acceses is preserved in
575 case the load is before the store. */
576 gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
577 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
578 {
579 /* That only holds for load-store pairs taking part in vectorization. */
580 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
581 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
582 return false;
583 }
584
585 return true;
586 }
587
588
589 /* Function vect_analyze_data_ref_dependences.
590
591 Examine all the data references in the basic-block, and make sure there
592 do not exist any data dependences between them. Set *MAX_VF according to
593 the maximum vectorization factor the data dependences allow. */
594
595 bool
596 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
597 {
598 struct data_dependence_relation *ddr;
599 unsigned int i;
600
601 if (dump_enabled_p ())
602 dump_printf_loc (MSG_NOTE, vect_location,
603 "=== vect_slp_analyze_data_ref_dependences ===\n");
604
605 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
606 &BB_VINFO_DDRS (bb_vinfo),
607 vNULL, true))
608 return false;
609
610 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
611 if (vect_slp_analyze_data_ref_dependence (ddr))
612 return false;
613
614 return true;
615 }
616
617
618 /* Function vect_compute_data_ref_alignment
619
620 Compute the misalignment of the data reference DR.
621
622 Output:
623 1. If during the misalignment computation it is found that the data reference
624 cannot be vectorized then false is returned.
625 2. DR_MISALIGNMENT (DR) is defined.
626
627 FOR NOW: No analysis is actually performed. Misalignment is calculated
628 only for trivial cases. TODO. */
629
630 static bool
631 vect_compute_data_ref_alignment (struct data_reference *dr)
632 {
633 gimple stmt = DR_STMT (dr);
634 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
635 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
636 struct loop *loop = NULL;
637 tree ref = DR_REF (dr);
638 tree vectype;
639 tree base, base_addr;
640 bool base_aligned;
641 tree misalign;
642 tree aligned_to, alignment;
643
644 if (dump_enabled_p ())
645 dump_printf_loc (MSG_NOTE, vect_location,
646 "vect_compute_data_ref_alignment:\n");
647
648 if (loop_vinfo)
649 loop = LOOP_VINFO_LOOP (loop_vinfo);
650
651 /* Initialize misalignment to unknown. */
652 SET_DR_MISALIGNMENT (dr, -1);
653
654 /* Strided loads perform only component accesses, misalignment information
655 is irrelevant for them. */
656 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
657 return true;
658
659 misalign = DR_INIT (dr);
660 aligned_to = DR_ALIGNED_TO (dr);
661 base_addr = DR_BASE_ADDRESS (dr);
662 vectype = STMT_VINFO_VECTYPE (stmt_info);
663
664 /* In case the dataref is in an inner-loop of the loop that is being
665 vectorized (LOOP), we use the base and misalignment information
666 relative to the outer-loop (LOOP). This is ok only if the misalignment
667 stays the same throughout the execution of the inner-loop, which is why
668 we have to check that the stride of the dataref in the inner-loop evenly
669 divides by the vector size. */
670 if (loop && nested_in_vect_loop_p (loop, stmt))
671 {
672 tree step = DR_STEP (dr);
673 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
674
675 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
676 {
677 if (dump_enabled_p ())
678 dump_printf_loc (MSG_NOTE, vect_location,
679 "inner step divides the vector-size.\n");
680 misalign = STMT_VINFO_DR_INIT (stmt_info);
681 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
682 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
683 }
684 else
685 {
686 if (dump_enabled_p ())
687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
688 "inner step doesn't divide the vector-size.\n");
689 misalign = NULL_TREE;
690 }
691 }
692
693 /* Similarly, if we're doing basic-block vectorization, we can only use
694 base and misalignment information relative to an innermost loop if the
695 misalignment stays the same throughout the execution of the loop.
696 As above, this is the case if the stride of the dataref evenly divides
697 by the vector size. */
698 if (!loop)
699 {
700 tree step = DR_STEP (dr);
701 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
702
703 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
704 {
705 if (dump_enabled_p ())
706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
707 "SLP: step doesn't divide the vector-size.\n");
708 misalign = NULL_TREE;
709 }
710 }
711
712 base = build_fold_indirect_ref (base_addr);
713 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
714
715 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
716 || !misalign)
717 {
718 if (dump_enabled_p ())
719 {
720 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
721 "Unknown alignment for access: ");
722 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
723 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
724 }
725 return true;
726 }
727
728 if ((DECL_P (base)
729 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
730 alignment) >= 0)
731 || (TREE_CODE (base_addr) == SSA_NAME
732 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
733 TREE_TYPE (base_addr)))),
734 alignment) >= 0)
735 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
736 base_aligned = true;
737 else
738 base_aligned = false;
739
740 if (!base_aligned)
741 {
742 /* Do not change the alignment of global variables here if
743 flag_section_anchors is enabled as we already generated
744 RTL for other functions. Most global variables should
745 have been aligned during the IPA increase_alignment pass. */
746 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
747 || (TREE_STATIC (base) && flag_section_anchors))
748 {
749 if (dump_enabled_p ())
750 {
751 dump_printf_loc (MSG_NOTE, vect_location,
752 "can't force alignment of ref: ");
753 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
754 dump_printf (MSG_NOTE, "\n");
755 }
756 return true;
757 }
758
759 /* Force the alignment of the decl.
760 NOTE: This is the only change to the code we make during
761 the analysis phase, before deciding to vectorize the loop. */
762 if (dump_enabled_p ())
763 {
764 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
765 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
766 dump_printf (MSG_NOTE, "\n");
767 }
768
769 ((dataref_aux *)dr->aux)->base_decl = base;
770 ((dataref_aux *)dr->aux)->base_misaligned = true;
771 }
772
773 /* If this is a backward running DR then first access in the larger
774 vectype actually is N-1 elements before the address in the DR.
775 Adjust misalign accordingly. */
776 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
777 {
778 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
779 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
780 otherwise we wouldn't be here. */
781 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
782 /* PLUS because DR_STEP was negative. */
783 misalign = size_binop (PLUS_EXPR, misalign, offset);
784 }
785
786 /* Modulo alignment. */
787 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
788
789 if (!tree_fits_uhwi_p (misalign))
790 {
791 /* Negative or overflowed misalignment value. */
792 if (dump_enabled_p ())
793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
794 "unexpected misalign value\n");
795 return false;
796 }
797
798 SET_DR_MISALIGNMENT (dr, tree_to_uhwi (misalign));
799
800 if (dump_enabled_p ())
801 {
802 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
803 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
804 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
805 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
806 }
807
808 return true;
809 }
810
811
812 /* Function vect_compute_data_refs_alignment
813
814 Compute the misalignment of data references in the loop.
815 Return FALSE if a data reference is found that cannot be vectorized. */
816
817 static bool
818 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
819 bb_vec_info bb_vinfo)
820 {
821 vec<data_reference_p> datarefs;
822 struct data_reference *dr;
823 unsigned int i;
824
825 if (loop_vinfo)
826 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
827 else
828 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
829
830 FOR_EACH_VEC_ELT (datarefs, i, dr)
831 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
832 && !vect_compute_data_ref_alignment (dr))
833 {
834 if (bb_vinfo)
835 {
836 /* Mark unsupported statement as unvectorizable. */
837 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
838 continue;
839 }
840 else
841 return false;
842 }
843
844 return true;
845 }
846
847
848 /* Function vect_update_misalignment_for_peel
849
850 DR - the data reference whose misalignment is to be adjusted.
851 DR_PEEL - the data reference whose misalignment is being made
852 zero in the vector loop by the peel.
853 NPEEL - the number of iterations in the peel loop if the misalignment
854 of DR_PEEL is known at compile time. */
855
856 static void
857 vect_update_misalignment_for_peel (struct data_reference *dr,
858 struct data_reference *dr_peel, int npeel)
859 {
860 unsigned int i;
861 vec<dr_p> same_align_drs;
862 struct data_reference *current_dr;
863 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
864 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
865 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
866 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
867
868 /* For interleaved data accesses the step in the loop must be multiplied by
869 the size of the interleaving group. */
870 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
871 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
872 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
873 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
874
875 /* It can be assumed that the data refs with the same alignment as dr_peel
876 are aligned in the vector loop. */
877 same_align_drs
878 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
879 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
880 {
881 if (current_dr != dr)
882 continue;
883 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
884 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
885 SET_DR_MISALIGNMENT (dr, 0);
886 return;
887 }
888
889 if (known_alignment_for_access_p (dr)
890 && known_alignment_for_access_p (dr_peel))
891 {
892 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
893 int misal = DR_MISALIGNMENT (dr);
894 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
895 misal += negative ? -npeel * dr_size : npeel * dr_size;
896 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
897 SET_DR_MISALIGNMENT (dr, misal);
898 return;
899 }
900
901 if (dump_enabled_p ())
902 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
903 SET_DR_MISALIGNMENT (dr, -1);
904 }
905
906
907 /* Function vect_verify_datarefs_alignment
908
909 Return TRUE if all data references in the loop can be
910 handled with respect to alignment. */
911
912 bool
913 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
914 {
915 vec<data_reference_p> datarefs;
916 struct data_reference *dr;
917 enum dr_alignment_support supportable_dr_alignment;
918 unsigned int i;
919
920 if (loop_vinfo)
921 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
922 else
923 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
924
925 FOR_EACH_VEC_ELT (datarefs, i, dr)
926 {
927 gimple stmt = DR_STMT (dr);
928 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
929
930 if (!STMT_VINFO_RELEVANT_P (stmt_info))
931 continue;
932
933 /* For interleaving, only the alignment of the first access matters.
934 Skip statements marked as not vectorizable. */
935 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
936 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
937 || !STMT_VINFO_VECTORIZABLE (stmt_info))
938 continue;
939
940 /* Strided loads perform only component accesses, alignment is
941 irrelevant for them. */
942 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
943 continue;
944
945 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
946 if (!supportable_dr_alignment)
947 {
948 if (dump_enabled_p ())
949 {
950 if (DR_IS_READ (dr))
951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
952 "not vectorized: unsupported unaligned load.");
953 else
954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
955 "not vectorized: unsupported unaligned "
956 "store.");
957
958 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
959 DR_REF (dr));
960 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
961 }
962 return false;
963 }
964 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
965 dump_printf_loc (MSG_NOTE, vect_location,
966 "Vectorizing an unaligned access.\n");
967 }
968 return true;
969 }
970
971 /* Given an memory reference EXP return whether its alignment is less
972 than its size. */
973
974 static bool
975 not_size_aligned (tree exp)
976 {
977 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
978 return true;
979
980 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
981 > get_object_alignment (exp));
982 }
983
984 /* Function vector_alignment_reachable_p
985
986 Return true if vector alignment for DR is reachable by peeling
987 a few loop iterations. Return false otherwise. */
988
989 static bool
990 vector_alignment_reachable_p (struct data_reference *dr)
991 {
992 gimple stmt = DR_STMT (dr);
993 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
994 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
995
996 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
997 {
998 /* For interleaved access we peel only if number of iterations in
999 the prolog loop ({VF - misalignment}), is a multiple of the
1000 number of the interleaved accesses. */
1001 int elem_size, mis_in_elements;
1002 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1003
1004 /* FORNOW: handle only known alignment. */
1005 if (!known_alignment_for_access_p (dr))
1006 return false;
1007
1008 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1009 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1010
1011 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1012 return false;
1013 }
1014
1015 /* If misalignment is known at the compile time then allow peeling
1016 only if natural alignment is reachable through peeling. */
1017 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1018 {
1019 HOST_WIDE_INT elmsize =
1020 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1021 if (dump_enabled_p ())
1022 {
1023 dump_printf_loc (MSG_NOTE, vect_location,
1024 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1025 dump_printf (MSG_NOTE,
1026 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1027 }
1028 if (DR_MISALIGNMENT (dr) % elmsize)
1029 {
1030 if (dump_enabled_p ())
1031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1032 "data size does not divide the misalignment.\n");
1033 return false;
1034 }
1035 }
1036
1037 if (!known_alignment_for_access_p (dr))
1038 {
1039 tree type = TREE_TYPE (DR_REF (dr));
1040 bool is_packed = not_size_aligned (DR_REF (dr));
1041 if (dump_enabled_p ())
1042 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1043 "Unknown misalignment, is_packed = %d\n",is_packed);
1044 if ((TYPE_USER_ALIGN (type) && !is_packed)
1045 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1046 return true;
1047 else
1048 return false;
1049 }
1050
1051 return true;
1052 }
1053
1054
1055 /* Calculate the cost of the memory access represented by DR. */
1056
1057 static void
1058 vect_get_data_access_cost (struct data_reference *dr,
1059 unsigned int *inside_cost,
1060 unsigned int *outside_cost,
1061 stmt_vector_for_cost *body_cost_vec)
1062 {
1063 gimple stmt = DR_STMT (dr);
1064 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1065 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1066 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1067 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1068 int ncopies = vf / nunits;
1069
1070 if (DR_IS_READ (dr))
1071 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1072 NULL, body_cost_vec, false);
1073 else
1074 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1075
1076 if (dump_enabled_p ())
1077 dump_printf_loc (MSG_NOTE, vect_location,
1078 "vect_get_data_access_cost: inside_cost = %d, "
1079 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1080 }
1081
1082
1083 /* Insert DR into peeling hash table with NPEEL as key. */
1084
1085 static void
1086 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1087 int npeel)
1088 {
1089 struct _vect_peel_info elem, *slot;
1090 _vect_peel_info **new_slot;
1091 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1092
1093 elem.npeel = npeel;
1094 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1095 if (slot)
1096 slot->count++;
1097 else
1098 {
1099 slot = XNEW (struct _vect_peel_info);
1100 slot->npeel = npeel;
1101 slot->dr = dr;
1102 slot->count = 1;
1103 new_slot
1104 = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1105 *new_slot = slot;
1106 }
1107
1108 if (!supportable_dr_alignment
1109 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1110 slot->count += VECT_MAX_COST;
1111 }
1112
1113
1114 /* Traverse peeling hash table to find peeling option that aligns maximum
1115 number of data accesses. */
1116
1117 int
1118 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1119 _vect_peel_extended_info *max)
1120 {
1121 vect_peel_info elem = *slot;
1122
1123 if (elem->count > max->peel_info.count
1124 || (elem->count == max->peel_info.count
1125 && max->peel_info.npeel > elem->npeel))
1126 {
1127 max->peel_info.npeel = elem->npeel;
1128 max->peel_info.count = elem->count;
1129 max->peel_info.dr = elem->dr;
1130 }
1131
1132 return 1;
1133 }
1134
1135
1136 /* Traverse peeling hash table and calculate cost for each peeling option.
1137 Find the one with the lowest cost. */
1138
1139 int
1140 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1141 _vect_peel_extended_info *min)
1142 {
1143 vect_peel_info elem = *slot;
1144 int save_misalignment, dummy;
1145 unsigned int inside_cost = 0, outside_cost = 0, i;
1146 gimple stmt = DR_STMT (elem->dr);
1147 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1148 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1149 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1150 struct data_reference *dr;
1151 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1152 int single_iter_cost;
1153
1154 prologue_cost_vec.create (2);
1155 body_cost_vec.create (2);
1156 epilogue_cost_vec.create (2);
1157
1158 FOR_EACH_VEC_ELT (datarefs, i, dr)
1159 {
1160 stmt = DR_STMT (dr);
1161 stmt_info = vinfo_for_stmt (stmt);
1162 /* For interleaving, only the alignment of the first access
1163 matters. */
1164 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1165 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1166 continue;
1167
1168 save_misalignment = DR_MISALIGNMENT (dr);
1169 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1170 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1171 &body_cost_vec);
1172 SET_DR_MISALIGNMENT (dr, save_misalignment);
1173 }
1174
1175 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1176 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1177 &dummy, single_iter_cost,
1178 &prologue_cost_vec,
1179 &epilogue_cost_vec);
1180
1181 /* Prologue and epilogue costs are added to the target model later.
1182 These costs depend only on the scalar iteration cost, the
1183 number of peeling iterations finally chosen, and the number of
1184 misaligned statements. So discard the information found here. */
1185 prologue_cost_vec.release ();
1186 epilogue_cost_vec.release ();
1187
1188 if (inside_cost < min->inside_cost
1189 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1190 {
1191 min->inside_cost = inside_cost;
1192 min->outside_cost = outside_cost;
1193 min->body_cost_vec.release ();
1194 min->body_cost_vec = body_cost_vec;
1195 min->peel_info.dr = elem->dr;
1196 min->peel_info.npeel = elem->npeel;
1197 }
1198 else
1199 body_cost_vec.release ();
1200
1201 return 1;
1202 }
1203
1204
1205 /* Choose best peeling option by traversing peeling hash table and either
1206 choosing an option with the lowest cost (if cost model is enabled) or the
1207 option that aligns as many accesses as possible. */
1208
1209 static struct data_reference *
1210 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1211 unsigned int *npeel,
1212 stmt_vector_for_cost *body_cost_vec)
1213 {
1214 struct _vect_peel_extended_info res;
1215
1216 res.peel_info.dr = NULL;
1217 res.body_cost_vec = stmt_vector_for_cost ();
1218
1219 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1220 {
1221 res.inside_cost = INT_MAX;
1222 res.outside_cost = INT_MAX;
1223 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1224 ->traverse <_vect_peel_extended_info *,
1225 vect_peeling_hash_get_lowest_cost> (&res);
1226 }
1227 else
1228 {
1229 res.peel_info.count = 0;
1230 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1231 ->traverse <_vect_peel_extended_info *,
1232 vect_peeling_hash_get_most_frequent> (&res);
1233 }
1234
1235 *npeel = res.peel_info.npeel;
1236 *body_cost_vec = res.body_cost_vec;
1237 return res.peel_info.dr;
1238 }
1239
1240
1241 /* Function vect_enhance_data_refs_alignment
1242
1243 This pass will use loop versioning and loop peeling in order to enhance
1244 the alignment of data references in the loop.
1245
1246 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1247 original loop is to be vectorized. Any other loops that are created by
1248 the transformations performed in this pass - are not supposed to be
1249 vectorized. This restriction will be relaxed.
1250
1251 This pass will require a cost model to guide it whether to apply peeling
1252 or versioning or a combination of the two. For example, the scheme that
1253 intel uses when given a loop with several memory accesses, is as follows:
1254 choose one memory access ('p') which alignment you want to force by doing
1255 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1256 other accesses are not necessarily aligned, or (2) use loop versioning to
1257 generate one loop in which all accesses are aligned, and another loop in
1258 which only 'p' is necessarily aligned.
1259
1260 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1261 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1262 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1263
1264 Devising a cost model is the most critical aspect of this work. It will
1265 guide us on which access to peel for, whether to use loop versioning, how
1266 many versions to create, etc. The cost model will probably consist of
1267 generic considerations as well as target specific considerations (on
1268 powerpc for example, misaligned stores are more painful than misaligned
1269 loads).
1270
1271 Here are the general steps involved in alignment enhancements:
1272
1273 -- original loop, before alignment analysis:
1274 for (i=0; i<N; i++){
1275 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1276 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1277 }
1278
1279 -- After vect_compute_data_refs_alignment:
1280 for (i=0; i<N; i++){
1281 x = q[i]; # DR_MISALIGNMENT(q) = 3
1282 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1283 }
1284
1285 -- Possibility 1: we do loop versioning:
1286 if (p is aligned) {
1287 for (i=0; i<N; i++){ # loop 1A
1288 x = q[i]; # DR_MISALIGNMENT(q) = 3
1289 p[i] = y; # DR_MISALIGNMENT(p) = 0
1290 }
1291 }
1292 else {
1293 for (i=0; i<N; i++){ # loop 1B
1294 x = q[i]; # DR_MISALIGNMENT(q) = 3
1295 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1296 }
1297 }
1298
1299 -- Possibility 2: we do loop peeling:
1300 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1301 x = q[i];
1302 p[i] = y;
1303 }
1304 for (i = 3; i < N; i++){ # loop 2A
1305 x = q[i]; # DR_MISALIGNMENT(q) = 0
1306 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1307 }
1308
1309 -- Possibility 3: combination of loop peeling and versioning:
1310 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1311 x = q[i];
1312 p[i] = y;
1313 }
1314 if (p is aligned) {
1315 for (i = 3; i<N; i++){ # loop 3A
1316 x = q[i]; # DR_MISALIGNMENT(q) = 0
1317 p[i] = y; # DR_MISALIGNMENT(p) = 0
1318 }
1319 }
1320 else {
1321 for (i = 3; i<N; i++){ # loop 3B
1322 x = q[i]; # DR_MISALIGNMENT(q) = 0
1323 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1324 }
1325 }
1326
1327 These loops are later passed to loop_transform to be vectorized. The
1328 vectorizer will use the alignment information to guide the transformation
1329 (whether to generate regular loads/stores, or with special handling for
1330 misalignment). */
1331
1332 bool
1333 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1334 {
1335 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1336 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1337 enum dr_alignment_support supportable_dr_alignment;
1338 struct data_reference *dr0 = NULL, *first_store = NULL;
1339 struct data_reference *dr;
1340 unsigned int i, j;
1341 bool do_peeling = false;
1342 bool do_versioning = false;
1343 bool stat;
1344 gimple stmt;
1345 stmt_vec_info stmt_info;
1346 unsigned int npeel = 0;
1347 bool all_misalignments_unknown = true;
1348 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1349 unsigned possible_npeel_number = 1;
1350 tree vectype;
1351 unsigned int nelements, mis, same_align_drs_max = 0;
1352 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1353
1354 if (dump_enabled_p ())
1355 dump_printf_loc (MSG_NOTE, vect_location,
1356 "=== vect_enhance_data_refs_alignment ===\n");
1357
1358 /* While cost model enhancements are expected in the future, the high level
1359 view of the code at this time is as follows:
1360
1361 A) If there is a misaligned access then see if peeling to align
1362 this access can make all data references satisfy
1363 vect_supportable_dr_alignment. If so, update data structures
1364 as needed and return true.
1365
1366 B) If peeling wasn't possible and there is a data reference with an
1367 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1368 then see if loop versioning checks can be used to make all data
1369 references satisfy vect_supportable_dr_alignment. If so, update
1370 data structures as needed and return true.
1371
1372 C) If neither peeling nor versioning were successful then return false if
1373 any data reference does not satisfy vect_supportable_dr_alignment.
1374
1375 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1376
1377 Note, Possibility 3 above (which is peeling and versioning together) is not
1378 being done at this time. */
1379
1380 /* (1) Peeling to force alignment. */
1381
1382 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1383 Considerations:
1384 + How many accesses will become aligned due to the peeling
1385 - How many accesses will become unaligned due to the peeling,
1386 and the cost of misaligned accesses.
1387 - The cost of peeling (the extra runtime checks, the increase
1388 in code size). */
1389
1390 FOR_EACH_VEC_ELT (datarefs, i, dr)
1391 {
1392 stmt = DR_STMT (dr);
1393 stmt_info = vinfo_for_stmt (stmt);
1394
1395 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1396 continue;
1397
1398 /* For interleaving, only the alignment of the first access
1399 matters. */
1400 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1401 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1402 continue;
1403
1404 /* For invariant accesses there is nothing to enhance. */
1405 if (integer_zerop (DR_STEP (dr)))
1406 continue;
1407
1408 /* Strided loads perform only component accesses, alignment is
1409 irrelevant for them. */
1410 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1411 continue;
1412
1413 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1414 do_peeling = vector_alignment_reachable_p (dr);
1415 if (do_peeling)
1416 {
1417 if (known_alignment_for_access_p (dr))
1418 {
1419 unsigned int npeel_tmp;
1420 bool negative = tree_int_cst_compare (DR_STEP (dr),
1421 size_zero_node) < 0;
1422
1423 /* Save info about DR in the hash table. */
1424 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1425 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1426 = new hash_table<peel_info_hasher> (1);
1427
1428 vectype = STMT_VINFO_VECTYPE (stmt_info);
1429 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1430 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1431 TREE_TYPE (DR_REF (dr))));
1432 npeel_tmp = (negative
1433 ? (mis - nelements) : (nelements - mis))
1434 & (nelements - 1);
1435
1436 /* For multiple types, it is possible that the bigger type access
1437 will have more than one peeling option. E.g., a loop with two
1438 types: one of size (vector size / 4), and the other one of
1439 size (vector size / 8). Vectorization factor will 8. If both
1440 access are misaligned by 3, the first one needs one scalar
1441 iteration to be aligned, and the second one needs 5. But the
1442 the first one will be aligned also by peeling 5 scalar
1443 iterations, and in that case both accesses will be aligned.
1444 Hence, except for the immediate peeling amount, we also want
1445 to try to add full vector size, while we don't exceed
1446 vectorization factor.
1447 We do this automtically for cost model, since we calculate cost
1448 for every peeling option. */
1449 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1450 possible_npeel_number = vf /nelements;
1451
1452 /* Handle the aligned case. We may decide to align some other
1453 access, making DR unaligned. */
1454 if (DR_MISALIGNMENT (dr) == 0)
1455 {
1456 npeel_tmp = 0;
1457 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1458 possible_npeel_number++;
1459 }
1460
1461 for (j = 0; j < possible_npeel_number; j++)
1462 {
1463 gcc_assert (npeel_tmp <= vf);
1464 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1465 npeel_tmp += nelements;
1466 }
1467
1468 all_misalignments_unknown = false;
1469 /* Data-ref that was chosen for the case that all the
1470 misalignments are unknown is not relevant anymore, since we
1471 have a data-ref with known alignment. */
1472 dr0 = NULL;
1473 }
1474 else
1475 {
1476 /* If we don't know any misalignment values, we prefer
1477 peeling for data-ref that has the maximum number of data-refs
1478 with the same alignment, unless the target prefers to align
1479 stores over load. */
1480 if (all_misalignments_unknown)
1481 {
1482 unsigned same_align_drs
1483 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1484 if (!dr0
1485 || same_align_drs_max < same_align_drs)
1486 {
1487 same_align_drs_max = same_align_drs;
1488 dr0 = dr;
1489 }
1490 /* For data-refs with the same number of related
1491 accesses prefer the one where the misalign
1492 computation will be invariant in the outermost loop. */
1493 else if (same_align_drs_max == same_align_drs)
1494 {
1495 struct loop *ivloop0, *ivloop;
1496 ivloop0 = outermost_invariant_loop_for_expr
1497 (loop, DR_BASE_ADDRESS (dr0));
1498 ivloop = outermost_invariant_loop_for_expr
1499 (loop, DR_BASE_ADDRESS (dr));
1500 if ((ivloop && !ivloop0)
1501 || (ivloop && ivloop0
1502 && flow_loop_nested_p (ivloop, ivloop0)))
1503 dr0 = dr;
1504 }
1505
1506 if (!first_store && DR_IS_WRITE (dr))
1507 first_store = dr;
1508 }
1509
1510 /* If there are both known and unknown misaligned accesses in the
1511 loop, we choose peeling amount according to the known
1512 accesses. */
1513 if (!supportable_dr_alignment)
1514 {
1515 dr0 = dr;
1516 if (!first_store && DR_IS_WRITE (dr))
1517 first_store = dr;
1518 }
1519 }
1520 }
1521 else
1522 {
1523 if (!aligned_access_p (dr))
1524 {
1525 if (dump_enabled_p ())
1526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1527 "vector alignment may not be reachable\n");
1528 break;
1529 }
1530 }
1531 }
1532
1533 /* Check if we can possibly peel the loop. */
1534 if (!vect_can_advance_ivs_p (loop_vinfo)
1535 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1536 do_peeling = false;
1537
1538 /* If we don't know how many times the peeling loop will run
1539 assume it will run VF-1 times and disable peeling if the remaining
1540 iters are less than the vectorization factor. */
1541 if (do_peeling
1542 && all_misalignments_unknown
1543 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1544 && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1545 < 2 * (unsigned) LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1))
1546 do_peeling = false;
1547
1548 if (do_peeling
1549 && all_misalignments_unknown
1550 && vect_supportable_dr_alignment (dr0, false))
1551 {
1552 /* Check if the target requires to prefer stores over loads, i.e., if
1553 misaligned stores are more expensive than misaligned loads (taking
1554 drs with same alignment into account). */
1555 if (first_store && DR_IS_READ (dr0))
1556 {
1557 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1558 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1559 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1560 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1561 stmt_vector_for_cost dummy;
1562 dummy.create (2);
1563
1564 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1565 &dummy);
1566 vect_get_data_access_cost (first_store, &store_inside_cost,
1567 &store_outside_cost, &dummy);
1568
1569 dummy.release ();
1570
1571 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1572 aligning the load DR0). */
1573 load_inside_penalty = store_inside_cost;
1574 load_outside_penalty = store_outside_cost;
1575 for (i = 0;
1576 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1577 DR_STMT (first_store))).iterate (i, &dr);
1578 i++)
1579 if (DR_IS_READ (dr))
1580 {
1581 load_inside_penalty += load_inside_cost;
1582 load_outside_penalty += load_outside_cost;
1583 }
1584 else
1585 {
1586 load_inside_penalty += store_inside_cost;
1587 load_outside_penalty += store_outside_cost;
1588 }
1589
1590 /* Calculate the penalty for leaving DR0 unaligned (by
1591 aligning the FIRST_STORE). */
1592 store_inside_penalty = load_inside_cost;
1593 store_outside_penalty = load_outside_cost;
1594 for (i = 0;
1595 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1596 DR_STMT (dr0))).iterate (i, &dr);
1597 i++)
1598 if (DR_IS_READ (dr))
1599 {
1600 store_inside_penalty += load_inside_cost;
1601 store_outside_penalty += load_outside_cost;
1602 }
1603 else
1604 {
1605 store_inside_penalty += store_inside_cost;
1606 store_outside_penalty += store_outside_cost;
1607 }
1608
1609 if (load_inside_penalty > store_inside_penalty
1610 || (load_inside_penalty == store_inside_penalty
1611 && load_outside_penalty > store_outside_penalty))
1612 dr0 = first_store;
1613 }
1614
1615 /* In case there are only loads with different unknown misalignments, use
1616 peeling only if it may help to align other accesses in the loop. */
1617 if (!first_store
1618 && !STMT_VINFO_SAME_ALIGN_REFS (
1619 vinfo_for_stmt (DR_STMT (dr0))).length ()
1620 && vect_supportable_dr_alignment (dr0, false)
1621 != dr_unaligned_supported)
1622 do_peeling = false;
1623 }
1624
1625 if (do_peeling && !dr0)
1626 {
1627 /* Peeling is possible, but there is no data access that is not supported
1628 unless aligned. So we try to choose the best possible peeling. */
1629
1630 /* We should get here only if there are drs with known misalignment. */
1631 gcc_assert (!all_misalignments_unknown);
1632
1633 /* Choose the best peeling from the hash table. */
1634 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1635 &body_cost_vec);
1636 if (!dr0 || !npeel)
1637 do_peeling = false;
1638
1639 /* If peeling by npeel will result in a remaining loop not iterating
1640 enough to be vectorized then do not peel. */
1641 if (do_peeling
1642 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1643 && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1644 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + npeel))
1645 do_peeling = false;
1646 }
1647
1648 if (do_peeling)
1649 {
1650 stmt = DR_STMT (dr0);
1651 stmt_info = vinfo_for_stmt (stmt);
1652 vectype = STMT_VINFO_VECTYPE (stmt_info);
1653 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1654
1655 if (known_alignment_for_access_p (dr0))
1656 {
1657 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1658 size_zero_node) < 0;
1659 if (!npeel)
1660 {
1661 /* Since it's known at compile time, compute the number of
1662 iterations in the peeled loop (the peeling factor) for use in
1663 updating DR_MISALIGNMENT values. The peeling factor is the
1664 vectorization factor minus the misalignment as an element
1665 count. */
1666 mis = DR_MISALIGNMENT (dr0);
1667 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1668 npeel = ((negative ? mis - nelements : nelements - mis)
1669 & (nelements - 1));
1670 }
1671
1672 /* For interleaved data access every iteration accesses all the
1673 members of the group, therefore we divide the number of iterations
1674 by the group size. */
1675 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1676 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1677 npeel /= GROUP_SIZE (stmt_info);
1678
1679 if (dump_enabled_p ())
1680 dump_printf_loc (MSG_NOTE, vect_location,
1681 "Try peeling by %d\n", npeel);
1682 }
1683
1684 /* Ensure that all data refs can be vectorized after the peel. */
1685 FOR_EACH_VEC_ELT (datarefs, i, dr)
1686 {
1687 int save_misalignment;
1688
1689 if (dr == dr0)
1690 continue;
1691
1692 stmt = DR_STMT (dr);
1693 stmt_info = vinfo_for_stmt (stmt);
1694 /* For interleaving, only the alignment of the first access
1695 matters. */
1696 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1697 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1698 continue;
1699
1700 /* Strided loads perform only component accesses, alignment is
1701 irrelevant for them. */
1702 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1703 continue;
1704
1705 save_misalignment = DR_MISALIGNMENT (dr);
1706 vect_update_misalignment_for_peel (dr, dr0, npeel);
1707 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1708 SET_DR_MISALIGNMENT (dr, save_misalignment);
1709
1710 if (!supportable_dr_alignment)
1711 {
1712 do_peeling = false;
1713 break;
1714 }
1715 }
1716
1717 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1718 {
1719 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1720 if (!stat)
1721 do_peeling = false;
1722 else
1723 {
1724 body_cost_vec.release ();
1725 return stat;
1726 }
1727 }
1728
1729 if (do_peeling)
1730 {
1731 unsigned max_allowed_peel
1732 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1733 if (max_allowed_peel != (unsigned)-1)
1734 {
1735 unsigned max_peel = npeel;
1736 if (max_peel == 0)
1737 {
1738 gimple dr_stmt = DR_STMT (dr0);
1739 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1740 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1741 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1742 }
1743 if (max_peel > max_allowed_peel)
1744 {
1745 do_peeling = false;
1746 if (dump_enabled_p ())
1747 dump_printf_loc (MSG_NOTE, vect_location,
1748 "Disable peeling, max peels reached: %d\n", max_peel);
1749 }
1750 }
1751 }
1752
1753 if (do_peeling)
1754 {
1755 stmt_info_for_cost *si;
1756 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1757
1758 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1759 If the misalignment of DR_i is identical to that of dr0 then set
1760 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1761 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1762 by the peeling factor times the element size of DR_i (MOD the
1763 vectorization factor times the size). Otherwise, the
1764 misalignment of DR_i must be set to unknown. */
1765 FOR_EACH_VEC_ELT (datarefs, i, dr)
1766 if (dr != dr0)
1767 vect_update_misalignment_for_peel (dr, dr0, npeel);
1768
1769 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1770 if (npeel)
1771 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1772 else
1773 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1774 = DR_MISALIGNMENT (dr0);
1775 SET_DR_MISALIGNMENT (dr0, 0);
1776 if (dump_enabled_p ())
1777 {
1778 dump_printf_loc (MSG_NOTE, vect_location,
1779 "Alignment of access forced using peeling.\n");
1780 dump_printf_loc (MSG_NOTE, vect_location,
1781 "Peeling for alignment will be applied.\n");
1782 }
1783 /* We've delayed passing the inside-loop peeling costs to the
1784 target cost model until we were sure peeling would happen.
1785 Do so now. */
1786 if (body_cost_vec.exists ())
1787 {
1788 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1789 {
1790 struct _stmt_vec_info *stmt_info
1791 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1792 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1793 si->misalign, vect_body);
1794 }
1795 body_cost_vec.release ();
1796 }
1797
1798 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1799 gcc_assert (stat);
1800 return stat;
1801 }
1802 }
1803
1804 body_cost_vec.release ();
1805
1806 /* (2) Versioning to force alignment. */
1807
1808 /* Try versioning if:
1809 1) optimize loop for speed
1810 2) there is at least one unsupported misaligned data ref with an unknown
1811 misalignment, and
1812 3) all misaligned data refs with a known misalignment are supported, and
1813 4) the number of runtime alignment checks is within reason. */
1814
1815 do_versioning =
1816 optimize_loop_nest_for_speed_p (loop)
1817 && (!loop->inner); /* FORNOW */
1818
1819 if (do_versioning)
1820 {
1821 FOR_EACH_VEC_ELT (datarefs, i, dr)
1822 {
1823 stmt = DR_STMT (dr);
1824 stmt_info = vinfo_for_stmt (stmt);
1825
1826 /* For interleaving, only the alignment of the first access
1827 matters. */
1828 if (aligned_access_p (dr)
1829 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1830 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1831 continue;
1832
1833 /* Strided loads perform only component accesses, alignment is
1834 irrelevant for them. */
1835 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1836 continue;
1837
1838 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1839
1840 if (!supportable_dr_alignment)
1841 {
1842 gimple stmt;
1843 int mask;
1844 tree vectype;
1845
1846 if (known_alignment_for_access_p (dr)
1847 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1848 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1849 {
1850 do_versioning = false;
1851 break;
1852 }
1853
1854 stmt = DR_STMT (dr);
1855 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1856 gcc_assert (vectype);
1857
1858 /* The rightmost bits of an aligned address must be zeros.
1859 Construct the mask needed for this test. For example,
1860 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1861 mask must be 15 = 0xf. */
1862 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1863
1864 /* FORNOW: use the same mask to test all potentially unaligned
1865 references in the loop. The vectorizer currently supports
1866 a single vector size, see the reference to
1867 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1868 vectorization factor is computed. */
1869 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1870 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1871 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1872 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1873 DR_STMT (dr));
1874 }
1875 }
1876
1877 /* Versioning requires at least one misaligned data reference. */
1878 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1879 do_versioning = false;
1880 else if (!do_versioning)
1881 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1882 }
1883
1884 if (do_versioning)
1885 {
1886 vec<gimple> may_misalign_stmts
1887 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1888 gimple stmt;
1889
1890 /* It can now be assumed that the data references in the statements
1891 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1892 of the loop being vectorized. */
1893 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1894 {
1895 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1896 dr = STMT_VINFO_DATA_REF (stmt_info);
1897 SET_DR_MISALIGNMENT (dr, 0);
1898 if (dump_enabled_p ())
1899 dump_printf_loc (MSG_NOTE, vect_location,
1900 "Alignment of access forced using versioning.\n");
1901 }
1902
1903 if (dump_enabled_p ())
1904 dump_printf_loc (MSG_NOTE, vect_location,
1905 "Versioning for alignment will be applied.\n");
1906
1907 /* Peeling and versioning can't be done together at this time. */
1908 gcc_assert (! (do_peeling && do_versioning));
1909
1910 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1911 gcc_assert (stat);
1912 return stat;
1913 }
1914
1915 /* This point is reached if neither peeling nor versioning is being done. */
1916 gcc_assert (! (do_peeling || do_versioning));
1917
1918 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1919 return stat;
1920 }
1921
1922
1923 /* Function vect_find_same_alignment_drs.
1924
1925 Update group and alignment relations according to the chosen
1926 vectorization factor. */
1927
1928 static void
1929 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1930 loop_vec_info loop_vinfo)
1931 {
1932 unsigned int i;
1933 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1934 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1935 struct data_reference *dra = DDR_A (ddr);
1936 struct data_reference *drb = DDR_B (ddr);
1937 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1938 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1939 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1940 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1941 lambda_vector dist_v;
1942 unsigned int loop_depth;
1943
1944 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1945 return;
1946
1947 if (dra == drb)
1948 return;
1949
1950 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1951 return;
1952
1953 /* Loop-based vectorization and known data dependence. */
1954 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1955 return;
1956
1957 /* Data-dependence analysis reports a distance vector of zero
1958 for data-references that overlap only in the first iteration
1959 but have different sign step (see PR45764).
1960 So as a sanity check require equal DR_STEP. */
1961 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1962 return;
1963
1964 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1965 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1966 {
1967 int dist = dist_v[loop_depth];
1968
1969 if (dump_enabled_p ())
1970 dump_printf_loc (MSG_NOTE, vect_location,
1971 "dependence distance = %d.\n", dist);
1972
1973 /* Same loop iteration. */
1974 if (dist == 0
1975 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1976 {
1977 /* Two references with distance zero have the same alignment. */
1978 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1979 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1980 if (dump_enabled_p ())
1981 {
1982 dump_printf_loc (MSG_NOTE, vect_location,
1983 "accesses have the same alignment.\n");
1984 dump_printf (MSG_NOTE,
1985 "dependence distance modulo vf == 0 between ");
1986 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1987 dump_printf (MSG_NOTE, " and ");
1988 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1989 dump_printf (MSG_NOTE, "\n");
1990 }
1991 }
1992 }
1993 }
1994
1995
1996 /* Function vect_analyze_data_refs_alignment
1997
1998 Analyze the alignment of the data-references in the loop.
1999 Return FALSE if a data reference is found that cannot be vectorized. */
2000
2001 bool
2002 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
2003 bb_vec_info bb_vinfo)
2004 {
2005 if (dump_enabled_p ())
2006 dump_printf_loc (MSG_NOTE, vect_location,
2007 "=== vect_analyze_data_refs_alignment ===\n");
2008
2009 /* Mark groups of data references with same alignment using
2010 data dependence information. */
2011 if (loop_vinfo)
2012 {
2013 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
2014 struct data_dependence_relation *ddr;
2015 unsigned int i;
2016
2017 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2018 vect_find_same_alignment_drs (ddr, loop_vinfo);
2019 }
2020
2021 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2022 {
2023 if (dump_enabled_p ())
2024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2025 "not vectorized: can't calculate alignment "
2026 "for data ref.\n");
2027 return false;
2028 }
2029
2030 return true;
2031 }
2032
2033
2034 /* Analyze groups of accesses: check that DR belongs to a group of
2035 accesses of legal size, step, etc. Detect gaps, single element
2036 interleaving, and other special cases. Set grouped access info.
2037 Collect groups of strided stores for further use in SLP analysis. */
2038
2039 static bool
2040 vect_analyze_group_access (struct data_reference *dr)
2041 {
2042 tree step = DR_STEP (dr);
2043 tree scalar_type = TREE_TYPE (DR_REF (dr));
2044 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2045 gimple stmt = DR_STMT (dr);
2046 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2047 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2048 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2049 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2050 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2051 bool slp_impossible = false;
2052 struct loop *loop = NULL;
2053
2054 if (loop_vinfo)
2055 loop = LOOP_VINFO_LOOP (loop_vinfo);
2056
2057 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2058 size of the interleaving group (including gaps). */
2059 groupsize = absu_hwi (dr_step) / type_size;
2060
2061 /* Not consecutive access is possible only if it is a part of interleaving. */
2062 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2063 {
2064 /* Check if it this DR is a part of interleaving, and is a single
2065 element of the group that is accessed in the loop. */
2066
2067 /* Gaps are supported only for loads. STEP must be a multiple of the type
2068 size. The size of the group must be a power of 2. */
2069 if (DR_IS_READ (dr)
2070 && (dr_step % type_size) == 0
2071 && groupsize > 0
2072 && exact_log2 (groupsize) != -1)
2073 {
2074 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2075 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2076 if (dump_enabled_p ())
2077 {
2078 dump_printf_loc (MSG_NOTE, vect_location,
2079 "Detected single element interleaving ");
2080 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2081 dump_printf (MSG_NOTE, " step ");
2082 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2083 dump_printf (MSG_NOTE, "\n");
2084 }
2085
2086 if (loop_vinfo)
2087 {
2088 if (dump_enabled_p ())
2089 dump_printf_loc (MSG_NOTE, vect_location,
2090 "Data access with gaps requires scalar "
2091 "epilogue loop\n");
2092 if (loop->inner)
2093 {
2094 if (dump_enabled_p ())
2095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2096 "Peeling for outer loop is not"
2097 " supported\n");
2098 return false;
2099 }
2100
2101 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2102 }
2103
2104 return true;
2105 }
2106
2107 if (dump_enabled_p ())
2108 {
2109 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2110 "not consecutive access ");
2111 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2112 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2113 }
2114
2115 if (bb_vinfo)
2116 {
2117 /* Mark the statement as unvectorizable. */
2118 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2119 return true;
2120 }
2121
2122 return false;
2123 }
2124
2125 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2126 {
2127 /* First stmt in the interleaving chain. Check the chain. */
2128 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2129 struct data_reference *data_ref = dr;
2130 unsigned int count = 1;
2131 tree prev_init = DR_INIT (data_ref);
2132 gimple prev = stmt;
2133 HOST_WIDE_INT diff, gaps = 0;
2134 unsigned HOST_WIDE_INT count_in_bytes;
2135
2136 while (next)
2137 {
2138 /* Skip same data-refs. In case that two or more stmts share
2139 data-ref (supported only for loads), we vectorize only the first
2140 stmt, and the rest get their vectorized loads from the first
2141 one. */
2142 if (!tree_int_cst_compare (DR_INIT (data_ref),
2143 DR_INIT (STMT_VINFO_DATA_REF (
2144 vinfo_for_stmt (next)))))
2145 {
2146 if (DR_IS_WRITE (data_ref))
2147 {
2148 if (dump_enabled_p ())
2149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2150 "Two store stmts share the same dr.\n");
2151 return false;
2152 }
2153
2154 /* For load use the same data-ref load. */
2155 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2156
2157 prev = next;
2158 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2159 continue;
2160 }
2161
2162 prev = next;
2163 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2164
2165 /* All group members have the same STEP by construction. */
2166 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2167
2168 /* Check that the distance between two accesses is equal to the type
2169 size. Otherwise, we have gaps. */
2170 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2171 - TREE_INT_CST_LOW (prev_init)) / type_size;
2172 if (diff != 1)
2173 {
2174 /* FORNOW: SLP of accesses with gaps is not supported. */
2175 slp_impossible = true;
2176 if (DR_IS_WRITE (data_ref))
2177 {
2178 if (dump_enabled_p ())
2179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2180 "interleaved store with gaps\n");
2181 return false;
2182 }
2183
2184 gaps += diff - 1;
2185 }
2186
2187 last_accessed_element += diff;
2188
2189 /* Store the gap from the previous member of the group. If there is no
2190 gap in the access, GROUP_GAP is always 1. */
2191 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2192
2193 prev_init = DR_INIT (data_ref);
2194 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2195 /* Count the number of data-refs in the chain. */
2196 count++;
2197 }
2198
2199 /* COUNT is the number of accesses found, we multiply it by the size of
2200 the type to get COUNT_IN_BYTES. */
2201 count_in_bytes = type_size * count;
2202
2203 /* Check that the size of the interleaving (including gaps) is not
2204 greater than STEP. */
2205 if (dr_step != 0
2206 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2207 {
2208 if (dump_enabled_p ())
2209 {
2210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2211 "interleaving size is greater than step for ");
2212 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2213 DR_REF (dr));
2214 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2215 }
2216 return false;
2217 }
2218
2219 /* Check that the size of the interleaving is equal to STEP for stores,
2220 i.e., that there are no gaps. */
2221 if (dr_step != 0
2222 && absu_hwi (dr_step) != count_in_bytes)
2223 {
2224 if (DR_IS_READ (dr))
2225 {
2226 slp_impossible = true;
2227 /* There is a gap after the last load in the group. This gap is a
2228 difference between the groupsize and the number of elements.
2229 When there is no gap, this difference should be 0. */
2230 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2231 }
2232 else
2233 {
2234 if (dump_enabled_p ())
2235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2236 "interleaved store with gaps\n");
2237 return false;
2238 }
2239 }
2240
2241 /* Check that STEP is a multiple of type size. */
2242 if (dr_step != 0
2243 && (dr_step % type_size) != 0)
2244 {
2245 if (dump_enabled_p ())
2246 {
2247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2248 "step is not a multiple of type size: step ");
2249 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2250 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2251 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2252 TYPE_SIZE_UNIT (scalar_type));
2253 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2254 }
2255 return false;
2256 }
2257
2258 if (groupsize == 0)
2259 groupsize = count;
2260
2261 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2262 if (dump_enabled_p ())
2263 dump_printf_loc (MSG_NOTE, vect_location,
2264 "Detected interleaving of size %d\n", (int)groupsize);
2265
2266 /* SLP: create an SLP data structure for every interleaving group of
2267 stores for further analysis in vect_analyse_slp. */
2268 if (DR_IS_WRITE (dr) && !slp_impossible)
2269 {
2270 if (loop_vinfo)
2271 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2272 if (bb_vinfo)
2273 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2274 }
2275
2276 /* There is a gap in the end of the group. */
2277 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2278 {
2279 if (dump_enabled_p ())
2280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2281 "Data access with gaps requires scalar "
2282 "epilogue loop\n");
2283 if (loop->inner)
2284 {
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2287 "Peeling for outer loop is not supported\n");
2288 return false;
2289 }
2290
2291 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2292 }
2293 }
2294
2295 return true;
2296 }
2297
2298
2299 /* Analyze the access pattern of the data-reference DR.
2300 In case of non-consecutive accesses call vect_analyze_group_access() to
2301 analyze groups of accesses. */
2302
2303 static bool
2304 vect_analyze_data_ref_access (struct data_reference *dr)
2305 {
2306 tree step = DR_STEP (dr);
2307 tree scalar_type = TREE_TYPE (DR_REF (dr));
2308 gimple stmt = DR_STMT (dr);
2309 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2310 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2311 struct loop *loop = NULL;
2312
2313 if (loop_vinfo)
2314 loop = LOOP_VINFO_LOOP (loop_vinfo);
2315
2316 if (loop_vinfo && !step)
2317 {
2318 if (dump_enabled_p ())
2319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2320 "bad data-ref access in loop\n");
2321 return false;
2322 }
2323
2324 /* Allow invariant loads in not nested loops. */
2325 if (loop_vinfo && integer_zerop (step))
2326 {
2327 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2328 if (nested_in_vect_loop_p (loop, stmt))
2329 {
2330 if (dump_enabled_p ())
2331 dump_printf_loc (MSG_NOTE, vect_location,
2332 "zero step in inner loop of nest\n");
2333 return false;
2334 }
2335 return DR_IS_READ (dr);
2336 }
2337
2338 if (loop && nested_in_vect_loop_p (loop, stmt))
2339 {
2340 /* Interleaved accesses are not yet supported within outer-loop
2341 vectorization for references in the inner-loop. */
2342 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2343
2344 /* For the rest of the analysis we use the outer-loop step. */
2345 step = STMT_VINFO_DR_STEP (stmt_info);
2346 if (integer_zerop (step))
2347 {
2348 if (dump_enabled_p ())
2349 dump_printf_loc (MSG_NOTE, vect_location,
2350 "zero step in outer loop.\n");
2351 if (DR_IS_READ (dr))
2352 return true;
2353 else
2354 return false;
2355 }
2356 }
2357
2358 /* Consecutive? */
2359 if (TREE_CODE (step) == INTEGER_CST)
2360 {
2361 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2362 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2363 || (dr_step < 0
2364 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2365 {
2366 /* Mark that it is not interleaving. */
2367 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2368 return true;
2369 }
2370 }
2371
2372 if (loop && nested_in_vect_loop_p (loop, stmt))
2373 {
2374 if (dump_enabled_p ())
2375 dump_printf_loc (MSG_NOTE, vect_location,
2376 "grouped access in outer loop.\n");
2377 return false;
2378 }
2379
2380 /* Assume this is a DR handled by non-constant strided load case. */
2381 if (TREE_CODE (step) != INTEGER_CST)
2382 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2383
2384 /* Not consecutive access - check if it's a part of interleaving group. */
2385 return vect_analyze_group_access (dr);
2386 }
2387
2388
2389
2390 /* A helper function used in the comparator function to sort data
2391 references. T1 and T2 are two data references to be compared.
2392 The function returns -1, 0, or 1. */
2393
2394 static int
2395 compare_tree (tree t1, tree t2)
2396 {
2397 int i, cmp;
2398 enum tree_code code;
2399 char tclass;
2400
2401 if (t1 == t2)
2402 return 0;
2403 if (t1 == NULL)
2404 return -1;
2405 if (t2 == NULL)
2406 return 1;
2407
2408
2409 if (TREE_CODE (t1) != TREE_CODE (t2))
2410 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2411
2412 code = TREE_CODE (t1);
2413 switch (code)
2414 {
2415 /* For const values, we can just use hash values for comparisons. */
2416 case INTEGER_CST:
2417 case REAL_CST:
2418 case FIXED_CST:
2419 case STRING_CST:
2420 case COMPLEX_CST:
2421 case VECTOR_CST:
2422 {
2423 hashval_t h1 = iterative_hash_expr (t1, 0);
2424 hashval_t h2 = iterative_hash_expr (t2, 0);
2425 if (h1 != h2)
2426 return h1 < h2 ? -1 : 1;
2427 break;
2428 }
2429
2430 case SSA_NAME:
2431 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2432 if (cmp != 0)
2433 return cmp;
2434
2435 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2436 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2437 break;
2438
2439 default:
2440 tclass = TREE_CODE_CLASS (code);
2441
2442 /* For var-decl, we could compare their UIDs. */
2443 if (tclass == tcc_declaration)
2444 {
2445 if (DECL_UID (t1) != DECL_UID (t2))
2446 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2447 break;
2448 }
2449
2450 /* For expressions with operands, compare their operands recursively. */
2451 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2452 {
2453 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2454 if (cmp != 0)
2455 return cmp;
2456 }
2457 }
2458
2459 return 0;
2460 }
2461
2462
2463 /* Compare two data-references DRA and DRB to group them into chunks
2464 suitable for grouping. */
2465
2466 static int
2467 dr_group_sort_cmp (const void *dra_, const void *drb_)
2468 {
2469 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2470 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2471 int cmp;
2472
2473 /* Stabilize sort. */
2474 if (dra == drb)
2475 return 0;
2476
2477 /* Ordering of DRs according to base. */
2478 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2479 {
2480 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2481 if (cmp != 0)
2482 return cmp;
2483 }
2484
2485 /* And according to DR_OFFSET. */
2486 if (!dr_equal_offsets_p (dra, drb))
2487 {
2488 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2489 if (cmp != 0)
2490 return cmp;
2491 }
2492
2493 /* Put reads before writes. */
2494 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2495 return DR_IS_READ (dra) ? -1 : 1;
2496
2497 /* Then sort after access size. */
2498 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2499 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2500 {
2501 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2502 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2503 if (cmp != 0)
2504 return cmp;
2505 }
2506
2507 /* And after step. */
2508 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2509 {
2510 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2511 if (cmp != 0)
2512 return cmp;
2513 }
2514
2515 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2516 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2517 if (cmp == 0)
2518 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2519 return cmp;
2520 }
2521
2522 /* Function vect_analyze_data_ref_accesses.
2523
2524 Analyze the access pattern of all the data references in the loop.
2525
2526 FORNOW: the only access pattern that is considered vectorizable is a
2527 simple step 1 (consecutive) access.
2528
2529 FORNOW: handle only arrays and pointer accesses. */
2530
2531 bool
2532 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2533 {
2534 unsigned int i;
2535 vec<data_reference_p> datarefs;
2536 struct data_reference *dr;
2537
2538 if (dump_enabled_p ())
2539 dump_printf_loc (MSG_NOTE, vect_location,
2540 "=== vect_analyze_data_ref_accesses ===\n");
2541
2542 if (loop_vinfo)
2543 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2544 else
2545 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2546
2547 if (datarefs.is_empty ())
2548 return true;
2549
2550 /* Sort the array of datarefs to make building the interleaving chains
2551 linear. Don't modify the original vector's order, it is needed for
2552 determining what dependencies are reversed. */
2553 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2554 datarefs_copy.qsort (dr_group_sort_cmp);
2555
2556 /* Build the interleaving chains. */
2557 for (i = 0; i < datarefs_copy.length () - 1;)
2558 {
2559 data_reference_p dra = datarefs_copy[i];
2560 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2561 stmt_vec_info lastinfo = NULL;
2562 for (i = i + 1; i < datarefs_copy.length (); ++i)
2563 {
2564 data_reference_p drb = datarefs_copy[i];
2565 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2566
2567 /* ??? Imperfect sorting (non-compatible types, non-modulo
2568 accesses, same accesses) can lead to a group to be artificially
2569 split here as we don't just skip over those. If it really
2570 matters we can push those to a worklist and re-iterate
2571 over them. The we can just skip ahead to the next DR here. */
2572
2573 /* Check that the data-refs have same first location (except init)
2574 and they are both either store or load (not load and store,
2575 not masked loads or stores). */
2576 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2577 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2578 DR_BASE_ADDRESS (drb), 0)
2579 || !dr_equal_offsets_p (dra, drb)
2580 || !gimple_assign_single_p (DR_STMT (dra))
2581 || !gimple_assign_single_p (DR_STMT (drb)))
2582 break;
2583
2584 /* Check that the data-refs have the same constant size and step. */
2585 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2586 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2587 if (!tree_fits_uhwi_p (sza)
2588 || !tree_fits_uhwi_p (szb)
2589 || !tree_int_cst_equal (sza, szb)
2590 || !tree_fits_shwi_p (DR_STEP (dra))
2591 || !tree_fits_shwi_p (DR_STEP (drb))
2592 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2593 break;
2594
2595 /* Do not place the same access in the interleaving chain twice. */
2596 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2597 break;
2598
2599 /* Check the types are compatible.
2600 ??? We don't distinguish this during sorting. */
2601 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2602 TREE_TYPE (DR_REF (drb))))
2603 break;
2604
2605 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2606 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2607 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2608 gcc_assert (init_a < init_b);
2609
2610 /* If init_b == init_a + the size of the type * k, we have an
2611 interleaving, and DRA is accessed before DRB. */
2612 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2613 if ((init_b - init_a) % type_size_a != 0)
2614 break;
2615
2616 /* The step (if not zero) is greater than the difference between
2617 data-refs' inits. This splits groups into suitable sizes. */
2618 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2619 if (step != 0 && step <= (init_b - init_a))
2620 break;
2621
2622 if (dump_enabled_p ())
2623 {
2624 dump_printf_loc (MSG_NOTE, vect_location,
2625 "Detected interleaving ");
2626 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2627 dump_printf (MSG_NOTE, " and ");
2628 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2629 dump_printf (MSG_NOTE, "\n");
2630 }
2631
2632 /* Link the found element into the group list. */
2633 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2634 {
2635 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2636 lastinfo = stmtinfo_a;
2637 }
2638 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2639 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2640 lastinfo = stmtinfo_b;
2641 }
2642 }
2643
2644 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2645 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2646 && !vect_analyze_data_ref_access (dr))
2647 {
2648 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2650 "not vectorized: complicated access pattern.\n");
2651
2652 if (bb_vinfo)
2653 {
2654 /* Mark the statement as not vectorizable. */
2655 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2656 continue;
2657 }
2658 else
2659 {
2660 datarefs_copy.release ();
2661 return false;
2662 }
2663 }
2664
2665 datarefs_copy.release ();
2666 return true;
2667 }
2668
2669
2670 /* Operator == between two dr_with_seg_len objects.
2671
2672 This equality operator is used to make sure two data refs
2673 are the same one so that we will consider to combine the
2674 aliasing checks of those two pairs of data dependent data
2675 refs. */
2676
2677 static bool
2678 operator == (const dr_with_seg_len& d1,
2679 const dr_with_seg_len& d2)
2680 {
2681 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2682 DR_BASE_ADDRESS (d2.dr), 0)
2683 && compare_tree (d1.offset, d2.offset) == 0
2684 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2685 }
2686
2687 /* Function comp_dr_with_seg_len_pair.
2688
2689 Comparison function for sorting objects of dr_with_seg_len_pair_t
2690 so that we can combine aliasing checks in one scan. */
2691
2692 static int
2693 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2694 {
2695 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2696 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2697
2698 const dr_with_seg_len &p11 = p1->first,
2699 &p12 = p1->second,
2700 &p21 = p2->first,
2701 &p22 = p2->second;
2702
2703 /* For DR pairs (a, b) and (c, d), we only consider to merge the alias checks
2704 if a and c have the same basic address snd step, and b and d have the same
2705 address and step. Therefore, if any a&c or b&d don't have the same address
2706 and step, we don't care the order of those two pairs after sorting. */
2707 int comp_res;
2708
2709 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2710 DR_BASE_ADDRESS (p21.dr))) != 0)
2711 return comp_res;
2712 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2713 DR_BASE_ADDRESS (p22.dr))) != 0)
2714 return comp_res;
2715 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2716 return comp_res;
2717 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2718 return comp_res;
2719 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2720 return comp_res;
2721 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2722 return comp_res;
2723
2724 return 0;
2725 }
2726
2727 /* Function vect_vfa_segment_size.
2728
2729 Create an expression that computes the size of segment
2730 that will be accessed for a data reference. The functions takes into
2731 account that realignment loads may access one more vector.
2732
2733 Input:
2734 DR: The data reference.
2735 LENGTH_FACTOR: segment length to consider.
2736
2737 Return an expression whose value is the size of segment which will be
2738 accessed by DR. */
2739
2740 static tree
2741 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2742 {
2743 tree segment_length;
2744
2745 if (integer_zerop (DR_STEP (dr)))
2746 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2747 else
2748 segment_length = size_binop (MULT_EXPR,
2749 fold_convert (sizetype, DR_STEP (dr)),
2750 fold_convert (sizetype, length_factor));
2751
2752 if (vect_supportable_dr_alignment (dr, false)
2753 == dr_explicit_realign_optimized)
2754 {
2755 tree vector_size = TYPE_SIZE_UNIT
2756 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2757
2758 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2759 }
2760 return segment_length;
2761 }
2762
2763 /* Function vect_prune_runtime_alias_test_list.
2764
2765 Prune a list of ddrs to be tested at run-time by versioning for alias.
2766 Merge several alias checks into one if possible.
2767 Return FALSE if resulting list of ddrs is longer then allowed by
2768 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2769
2770 bool
2771 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2772 {
2773 vec<ddr_p> may_alias_ddrs =
2774 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2775 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2776 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2777 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2778 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2779
2780 ddr_p ddr;
2781 unsigned int i;
2782 tree length_factor;
2783
2784 if (dump_enabled_p ())
2785 dump_printf_loc (MSG_NOTE, vect_location,
2786 "=== vect_prune_runtime_alias_test_list ===\n");
2787
2788 if (may_alias_ddrs.is_empty ())
2789 return true;
2790
2791 /* Basically, for each pair of dependent data refs store_ptr_0
2792 and load_ptr_0, we create an expression:
2793
2794 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2795 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2796
2797 for aliasing checks. However, in some cases we can decrease
2798 the number of checks by combining two checks into one. For
2799 example, suppose we have another pair of data refs store_ptr_0
2800 and load_ptr_1, and if the following condition is satisfied:
2801
2802 load_ptr_0 < load_ptr_1 &&
2803 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2804
2805 (this condition means, in each iteration of vectorized loop,
2806 the accessed memory of store_ptr_0 cannot be between the memory
2807 of load_ptr_0 and load_ptr_1.)
2808
2809 we then can use only the following expression to finish the
2810 alising checks between store_ptr_0 & load_ptr_0 and
2811 store_ptr_0 & load_ptr_1:
2812
2813 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2814 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2815
2816 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2817 same basic address. */
2818
2819 comp_alias_ddrs.create (may_alias_ddrs.length ());
2820
2821 /* First, we collect all data ref pairs for aliasing checks. */
2822 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2823 {
2824 struct data_reference *dr_a, *dr_b;
2825 gimple dr_group_first_a, dr_group_first_b;
2826 tree segment_length_a, segment_length_b;
2827 gimple stmt_a, stmt_b;
2828
2829 dr_a = DDR_A (ddr);
2830 stmt_a = DR_STMT (DDR_A (ddr));
2831 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2832 if (dr_group_first_a)
2833 {
2834 stmt_a = dr_group_first_a;
2835 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2836 }
2837
2838 dr_b = DDR_B (ddr);
2839 stmt_b = DR_STMT (DDR_B (ddr));
2840 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2841 if (dr_group_first_b)
2842 {
2843 stmt_b = dr_group_first_b;
2844 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2845 }
2846
2847 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2848 length_factor = scalar_loop_iters;
2849 else
2850 length_factor = size_int (vect_factor);
2851 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2852 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2853
2854 dr_with_seg_len_pair_t dr_with_seg_len_pair
2855 (dr_with_seg_len (dr_a, segment_length_a),
2856 dr_with_seg_len (dr_b, segment_length_b));
2857
2858 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2859 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2860
2861 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2862 }
2863
2864 /* Second, we sort the collected data ref pairs so that we can scan
2865 them once to combine all possible aliasing checks. */
2866 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2867
2868 /* Third, we scan the sorted dr pairs and check if we can combine
2869 alias checks of two neighbouring dr pairs. */
2870 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2871 {
2872 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2873 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2874 *dr_b1 = &comp_alias_ddrs[i-1].second,
2875 *dr_a2 = &comp_alias_ddrs[i].first,
2876 *dr_b2 = &comp_alias_ddrs[i].second;
2877
2878 /* Remove duplicate data ref pairs. */
2879 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2880 {
2881 if (dump_enabled_p ())
2882 {
2883 dump_printf_loc (MSG_NOTE, vect_location,
2884 "found equal ranges ");
2885 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2886 DR_REF (dr_a1->dr));
2887 dump_printf (MSG_NOTE, ", ");
2888 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2889 DR_REF (dr_b1->dr));
2890 dump_printf (MSG_NOTE, " and ");
2891 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2892 DR_REF (dr_a2->dr));
2893 dump_printf (MSG_NOTE, ", ");
2894 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2895 DR_REF (dr_b2->dr));
2896 dump_printf (MSG_NOTE, "\n");
2897 }
2898
2899 comp_alias_ddrs.ordered_remove (i--);
2900 continue;
2901 }
2902
2903 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2904 {
2905 /* We consider the case that DR_B1 and DR_B2 are same memrefs,
2906 and DR_A1 and DR_A2 are two consecutive memrefs. */
2907 if (*dr_a1 == *dr_a2)
2908 {
2909 std::swap (dr_a1, dr_b1);
2910 std::swap (dr_a2, dr_b2);
2911 }
2912
2913 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2914 DR_BASE_ADDRESS (dr_a2->dr),
2915 0)
2916 || !tree_fits_shwi_p (dr_a1->offset)
2917 || !tree_fits_shwi_p (dr_a2->offset))
2918 continue;
2919
2920 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2921 - tree_to_shwi (dr_a1->offset));
2922
2923
2924 /* Now we check if the following condition is satisfied:
2925
2926 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2927
2928 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2929 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2930 have to make a best estimation. We can get the minimum value
2931 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2932 then either of the following two conditions can guarantee the
2933 one above:
2934
2935 1: DIFF <= MIN_SEG_LEN_B
2936 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2937
2938 */
2939
2940 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2941 ? tree_to_shwi (dr_b1->seg_len)
2942 : vect_factor);
2943
2944 if (diff <= min_seg_len_b
2945 || (tree_fits_shwi_p (dr_a1->seg_len)
2946 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2947 {
2948 if (dump_enabled_p ())
2949 {
2950 dump_printf_loc (MSG_NOTE, vect_location,
2951 "merging ranges for ");
2952 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2953 DR_REF (dr_a1->dr));
2954 dump_printf (MSG_NOTE, ", ");
2955 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2956 DR_REF (dr_b1->dr));
2957 dump_printf (MSG_NOTE, " and ");
2958 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2959 DR_REF (dr_a2->dr));
2960 dump_printf (MSG_NOTE, ", ");
2961 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2962 DR_REF (dr_b2->dr));
2963 dump_printf (MSG_NOTE, "\n");
2964 }
2965
2966 dr_a1->seg_len = size_binop (PLUS_EXPR,
2967 dr_a2->seg_len, size_int (diff));
2968 comp_alias_ddrs.ordered_remove (i--);
2969 }
2970 }
2971 }
2972
2973 dump_printf_loc (MSG_NOTE, vect_location,
2974 "improved number of alias checks from %d to %d\n",
2975 may_alias_ddrs.length (), comp_alias_ddrs.length ());
2976 if ((int) comp_alias_ddrs.length () >
2977 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2978 return false;
2979
2980 return true;
2981 }
2982
2983 /* Check whether a non-affine read in stmt is suitable for gather load
2984 and if so, return a builtin decl for that operation. */
2985
2986 tree
2987 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2988 tree *offp, int *scalep)
2989 {
2990 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2991 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2992 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2993 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2994 tree offtype = NULL_TREE;
2995 tree decl, base, off;
2996 machine_mode pmode;
2997 int punsignedp, pvolatilep;
2998
2999 base = DR_REF (dr);
3000 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3001 see if we can use the def stmt of the address. */
3002 if (is_gimple_call (stmt)
3003 && gimple_call_internal_p (stmt)
3004 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3005 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3006 && TREE_CODE (base) == MEM_REF
3007 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3008 && integer_zerop (TREE_OPERAND (base, 1))
3009 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3010 {
3011 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3012 if (is_gimple_assign (def_stmt)
3013 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3014 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3015 }
3016
3017 /* The gather builtins need address of the form
3018 loop_invariant + vector * {1, 2, 4, 8}
3019 or
3020 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3021 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3022 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3023 multiplications and additions in it. To get a vector, we need
3024 a single SSA_NAME that will be defined in the loop and will
3025 contain everything that is not loop invariant and that can be
3026 vectorized. The following code attempts to find such a preexistng
3027 SSA_NAME OFF and put the loop invariants into a tree BASE
3028 that can be gimplified before the loop. */
3029 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3030 &pmode, &punsignedp, &pvolatilep, false);
3031 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3032
3033 if (TREE_CODE (base) == MEM_REF)
3034 {
3035 if (!integer_zerop (TREE_OPERAND (base, 1)))
3036 {
3037 if (off == NULL_TREE)
3038 {
3039 offset_int moff = mem_ref_offset (base);
3040 off = wide_int_to_tree (sizetype, moff);
3041 }
3042 else
3043 off = size_binop (PLUS_EXPR, off,
3044 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3045 }
3046 base = TREE_OPERAND (base, 0);
3047 }
3048 else
3049 base = build_fold_addr_expr (base);
3050
3051 if (off == NULL_TREE)
3052 off = size_zero_node;
3053
3054 /* If base is not loop invariant, either off is 0, then we start with just
3055 the constant offset in the loop invariant BASE and continue with base
3056 as OFF, otherwise give up.
3057 We could handle that case by gimplifying the addition of base + off
3058 into some SSA_NAME and use that as off, but for now punt. */
3059 if (!expr_invariant_in_loop_p (loop, base))
3060 {
3061 if (!integer_zerop (off))
3062 return NULL_TREE;
3063 off = base;
3064 base = size_int (pbitpos / BITS_PER_UNIT);
3065 }
3066 /* Otherwise put base + constant offset into the loop invariant BASE
3067 and continue with OFF. */
3068 else
3069 {
3070 base = fold_convert (sizetype, base);
3071 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3072 }
3073
3074 /* OFF at this point may be either a SSA_NAME or some tree expression
3075 from get_inner_reference. Try to peel off loop invariants from it
3076 into BASE as long as possible. */
3077 STRIP_NOPS (off);
3078 while (offtype == NULL_TREE)
3079 {
3080 enum tree_code code;
3081 tree op0, op1, add = NULL_TREE;
3082
3083 if (TREE_CODE (off) == SSA_NAME)
3084 {
3085 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3086
3087 if (expr_invariant_in_loop_p (loop, off))
3088 return NULL_TREE;
3089
3090 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3091 break;
3092
3093 op0 = gimple_assign_rhs1 (def_stmt);
3094 code = gimple_assign_rhs_code (def_stmt);
3095 op1 = gimple_assign_rhs2 (def_stmt);
3096 }
3097 else
3098 {
3099 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3100 return NULL_TREE;
3101 code = TREE_CODE (off);
3102 extract_ops_from_tree (off, &code, &op0, &op1);
3103 }
3104 switch (code)
3105 {
3106 case POINTER_PLUS_EXPR:
3107 case PLUS_EXPR:
3108 if (expr_invariant_in_loop_p (loop, op0))
3109 {
3110 add = op0;
3111 off = op1;
3112 do_add:
3113 add = fold_convert (sizetype, add);
3114 if (scale != 1)
3115 add = size_binop (MULT_EXPR, add, size_int (scale));
3116 base = size_binop (PLUS_EXPR, base, add);
3117 continue;
3118 }
3119 if (expr_invariant_in_loop_p (loop, op1))
3120 {
3121 add = op1;
3122 off = op0;
3123 goto do_add;
3124 }
3125 break;
3126 case MINUS_EXPR:
3127 if (expr_invariant_in_loop_p (loop, op1))
3128 {
3129 add = fold_convert (sizetype, op1);
3130 add = size_binop (MINUS_EXPR, size_zero_node, add);
3131 off = op0;
3132 goto do_add;
3133 }
3134 break;
3135 case MULT_EXPR:
3136 if (scale == 1 && tree_fits_shwi_p (op1))
3137 {
3138 scale = tree_to_shwi (op1);
3139 off = op0;
3140 continue;
3141 }
3142 break;
3143 case SSA_NAME:
3144 off = op0;
3145 continue;
3146 CASE_CONVERT:
3147 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3148 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3149 break;
3150 if (TYPE_PRECISION (TREE_TYPE (op0))
3151 == TYPE_PRECISION (TREE_TYPE (off)))
3152 {
3153 off = op0;
3154 continue;
3155 }
3156 if (TYPE_PRECISION (TREE_TYPE (op0))
3157 < TYPE_PRECISION (TREE_TYPE (off)))
3158 {
3159 off = op0;
3160 offtype = TREE_TYPE (off);
3161 STRIP_NOPS (off);
3162 continue;
3163 }
3164 break;
3165 default:
3166 break;
3167 }
3168 break;
3169 }
3170
3171 /* If at the end OFF still isn't a SSA_NAME or isn't
3172 defined in the loop, punt. */
3173 if (TREE_CODE (off) != SSA_NAME
3174 || expr_invariant_in_loop_p (loop, off))
3175 return NULL_TREE;
3176
3177 if (offtype == NULL_TREE)
3178 offtype = TREE_TYPE (off);
3179
3180 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3181 offtype, scale);
3182 if (decl == NULL_TREE)
3183 return NULL_TREE;
3184
3185 if (basep)
3186 *basep = base;
3187 if (offp)
3188 *offp = off;
3189 if (scalep)
3190 *scalep = scale;
3191 return decl;
3192 }
3193
3194 /* Function vect_analyze_data_refs.
3195
3196 Find all the data references in the loop or basic block.
3197
3198 The general structure of the analysis of data refs in the vectorizer is as
3199 follows:
3200 1- vect_analyze_data_refs(loop/bb): call
3201 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3202 in the loop/bb and their dependences.
3203 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3204 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3205 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3206
3207 */
3208
3209 bool
3210 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3211 bb_vec_info bb_vinfo,
3212 int *min_vf, unsigned *n_stmts)
3213 {
3214 struct loop *loop = NULL;
3215 basic_block bb = NULL;
3216 unsigned int i;
3217 vec<data_reference_p> datarefs;
3218 struct data_reference *dr;
3219 tree scalar_type;
3220
3221 if (dump_enabled_p ())
3222 dump_printf_loc (MSG_NOTE, vect_location,
3223 "=== vect_analyze_data_refs ===\n");
3224
3225 if (loop_vinfo)
3226 {
3227 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3228
3229 loop = LOOP_VINFO_LOOP (loop_vinfo);
3230 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3231 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3232 {
3233 if (dump_enabled_p ())
3234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3235 "not vectorized: loop contains function calls"
3236 " or data references that cannot be analyzed\n");
3237 return false;
3238 }
3239
3240 for (i = 0; i < loop->num_nodes; i++)
3241 {
3242 gimple_stmt_iterator gsi;
3243
3244 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3245 {
3246 gimple stmt = gsi_stmt (gsi);
3247 if (is_gimple_debug (stmt))
3248 continue;
3249 ++*n_stmts;
3250 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3251 {
3252 if (is_gimple_call (stmt) && loop->safelen)
3253 {
3254 tree fndecl = gimple_call_fndecl (stmt), op;
3255 if (fndecl != NULL_TREE)
3256 {
3257 struct cgraph_node *node = cgraph_node::get (fndecl);
3258 if (node != NULL && node->simd_clones != NULL)
3259 {
3260 unsigned int j, n = gimple_call_num_args (stmt);
3261 for (j = 0; j < n; j++)
3262 {
3263 op = gimple_call_arg (stmt, j);
3264 if (DECL_P (op)
3265 || (REFERENCE_CLASS_P (op)
3266 && get_base_address (op)))
3267 break;
3268 }
3269 op = gimple_call_lhs (stmt);
3270 /* Ignore #pragma omp declare simd functions
3271 if they don't have data references in the
3272 call stmt itself. */
3273 if (j == n
3274 && !(op
3275 && (DECL_P (op)
3276 || (REFERENCE_CLASS_P (op)
3277 && get_base_address (op)))))
3278 continue;
3279 }
3280 }
3281 }
3282 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3283 if (dump_enabled_p ())
3284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3285 "not vectorized: loop contains function "
3286 "calls or data references that cannot "
3287 "be analyzed\n");
3288 return false;
3289 }
3290 }
3291 }
3292
3293 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3294 }
3295 else
3296 {
3297 gimple_stmt_iterator gsi;
3298
3299 bb = BB_VINFO_BB (bb_vinfo);
3300 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3301 {
3302 gimple stmt = gsi_stmt (gsi);
3303 if (is_gimple_debug (stmt))
3304 continue;
3305 ++*n_stmts;
3306 if (!find_data_references_in_stmt (NULL, stmt,
3307 &BB_VINFO_DATAREFS (bb_vinfo)))
3308 {
3309 /* Mark the rest of the basic-block as unvectorizable. */
3310 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3311 {
3312 stmt = gsi_stmt (gsi);
3313 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3314 }
3315 break;
3316 }
3317 }
3318
3319 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3320 }
3321
3322 /* Go through the data-refs, check that the analysis succeeded. Update
3323 pointer from stmt_vec_info struct to DR and vectype. */
3324
3325 FOR_EACH_VEC_ELT (datarefs, i, dr)
3326 {
3327 gimple stmt;
3328 stmt_vec_info stmt_info;
3329 tree base, offset, init;
3330 bool gather = false;
3331 bool simd_lane_access = false;
3332 int vf;
3333
3334 again:
3335 if (!dr || !DR_REF (dr))
3336 {
3337 if (dump_enabled_p ())
3338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3339 "not vectorized: unhandled data-ref\n");
3340 return false;
3341 }
3342
3343 stmt = DR_STMT (dr);
3344 stmt_info = vinfo_for_stmt (stmt);
3345
3346 /* Discard clobbers from the dataref vector. We will remove
3347 clobber stmts during vectorization. */
3348 if (gimple_clobber_p (stmt))
3349 {
3350 free_data_ref (dr);
3351 if (i == datarefs.length () - 1)
3352 {
3353 datarefs.pop ();
3354 break;
3355 }
3356 datarefs.ordered_remove (i);
3357 dr = datarefs[i];
3358 goto again;
3359 }
3360
3361 /* Check that analysis of the data-ref succeeded. */
3362 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3363 || !DR_STEP (dr))
3364 {
3365 bool maybe_gather
3366 = DR_IS_READ (dr)
3367 && !TREE_THIS_VOLATILE (DR_REF (dr))
3368 && targetm.vectorize.builtin_gather != NULL;
3369 bool maybe_simd_lane_access
3370 = loop_vinfo && loop->simduid;
3371
3372 /* If target supports vector gather loads, or if this might be
3373 a SIMD lane access, see if they can't be used. */
3374 if (loop_vinfo
3375 && (maybe_gather || maybe_simd_lane_access)
3376 && !nested_in_vect_loop_p (loop, stmt))
3377 {
3378 struct data_reference *newdr
3379 = create_data_ref (NULL, loop_containing_stmt (stmt),
3380 DR_REF (dr), stmt, true);
3381 gcc_assert (newdr != NULL && DR_REF (newdr));
3382 if (DR_BASE_ADDRESS (newdr)
3383 && DR_OFFSET (newdr)
3384 && DR_INIT (newdr)
3385 && DR_STEP (newdr)
3386 && integer_zerop (DR_STEP (newdr)))
3387 {
3388 if (maybe_simd_lane_access)
3389 {
3390 tree off = DR_OFFSET (newdr);
3391 STRIP_NOPS (off);
3392 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3393 && TREE_CODE (off) == MULT_EXPR
3394 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3395 {
3396 tree step = TREE_OPERAND (off, 1);
3397 off = TREE_OPERAND (off, 0);
3398 STRIP_NOPS (off);
3399 if (CONVERT_EXPR_P (off)
3400 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3401 0)))
3402 < TYPE_PRECISION (TREE_TYPE (off)))
3403 off = TREE_OPERAND (off, 0);
3404 if (TREE_CODE (off) == SSA_NAME)
3405 {
3406 gimple def = SSA_NAME_DEF_STMT (off);
3407 tree reft = TREE_TYPE (DR_REF (newdr));
3408 if (is_gimple_call (def)
3409 && gimple_call_internal_p (def)
3410 && (gimple_call_internal_fn (def)
3411 == IFN_GOMP_SIMD_LANE))
3412 {
3413 tree arg = gimple_call_arg (def, 0);
3414 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3415 arg = SSA_NAME_VAR (arg);
3416 if (arg == loop->simduid
3417 /* For now. */
3418 && tree_int_cst_equal
3419 (TYPE_SIZE_UNIT (reft),
3420 step))
3421 {
3422 DR_OFFSET (newdr) = ssize_int (0);
3423 DR_STEP (newdr) = step;
3424 DR_ALIGNED_TO (newdr)
3425 = size_int (BIGGEST_ALIGNMENT);
3426 dr = newdr;
3427 simd_lane_access = true;
3428 }
3429 }
3430 }
3431 }
3432 }
3433 if (!simd_lane_access && maybe_gather)
3434 {
3435 dr = newdr;
3436 gather = true;
3437 }
3438 }
3439 if (!gather && !simd_lane_access)
3440 free_data_ref (newdr);
3441 }
3442
3443 if (!gather && !simd_lane_access)
3444 {
3445 if (dump_enabled_p ())
3446 {
3447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3448 "not vectorized: data ref analysis "
3449 "failed ");
3450 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3451 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3452 }
3453
3454 if (bb_vinfo)
3455 break;
3456
3457 return false;
3458 }
3459 }
3460
3461 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3462 {
3463 if (dump_enabled_p ())
3464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3465 "not vectorized: base addr of dr is a "
3466 "constant\n");
3467
3468 if (bb_vinfo)
3469 break;
3470
3471 if (gather || simd_lane_access)
3472 free_data_ref (dr);
3473 return false;
3474 }
3475
3476 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3477 {
3478 if (dump_enabled_p ())
3479 {
3480 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3481 "not vectorized: volatile type ");
3482 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3483 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3484 }
3485
3486 if (bb_vinfo)
3487 break;
3488
3489 return false;
3490 }
3491
3492 if (stmt_can_throw_internal (stmt))
3493 {
3494 if (dump_enabled_p ())
3495 {
3496 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3497 "not vectorized: statement can throw an "
3498 "exception ");
3499 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3500 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3501 }
3502
3503 if (bb_vinfo)
3504 break;
3505
3506 if (gather || simd_lane_access)
3507 free_data_ref (dr);
3508 return false;
3509 }
3510
3511 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3512 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3513 {
3514 if (dump_enabled_p ())
3515 {
3516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3517 "not vectorized: statement is bitfield "
3518 "access ");
3519 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3520 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3521 }
3522
3523 if (bb_vinfo)
3524 break;
3525
3526 if (gather || simd_lane_access)
3527 free_data_ref (dr);
3528 return false;
3529 }
3530
3531 base = unshare_expr (DR_BASE_ADDRESS (dr));
3532 offset = unshare_expr (DR_OFFSET (dr));
3533 init = unshare_expr (DR_INIT (dr));
3534
3535 if (is_gimple_call (stmt)
3536 && (!gimple_call_internal_p (stmt)
3537 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3538 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3539 {
3540 if (dump_enabled_p ())
3541 {
3542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3543 "not vectorized: dr in a call ");
3544 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3545 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3546 }
3547
3548 if (bb_vinfo)
3549 break;
3550
3551 if (gather || simd_lane_access)
3552 free_data_ref (dr);
3553 return false;
3554 }
3555
3556 /* Update DR field in stmt_vec_info struct. */
3557
3558 /* If the dataref is in an inner-loop of the loop that is considered for
3559 for vectorization, we also want to analyze the access relative to
3560 the outer-loop (DR contains information only relative to the
3561 inner-most enclosing loop). We do that by building a reference to the
3562 first location accessed by the inner-loop, and analyze it relative to
3563 the outer-loop. */
3564 if (loop && nested_in_vect_loop_p (loop, stmt))
3565 {
3566 tree outer_step, outer_base, outer_init;
3567 HOST_WIDE_INT pbitsize, pbitpos;
3568 tree poffset;
3569 machine_mode pmode;
3570 int punsignedp, pvolatilep;
3571 affine_iv base_iv, offset_iv;
3572 tree dinit;
3573
3574 /* Build a reference to the first location accessed by the
3575 inner-loop: *(BASE+INIT). (The first location is actually
3576 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3577 tree inner_base = build_fold_indirect_ref
3578 (fold_build_pointer_plus (base, init));
3579
3580 if (dump_enabled_p ())
3581 {
3582 dump_printf_loc (MSG_NOTE, vect_location,
3583 "analyze in outer-loop: ");
3584 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3585 dump_printf (MSG_NOTE, "\n");
3586 }
3587
3588 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3589 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3590 gcc_assert (outer_base != NULL_TREE);
3591
3592 if (pbitpos % BITS_PER_UNIT != 0)
3593 {
3594 if (dump_enabled_p ())
3595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3596 "failed: bit offset alignment.\n");
3597 return false;
3598 }
3599
3600 outer_base = build_fold_addr_expr (outer_base);
3601 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3602 &base_iv, false))
3603 {
3604 if (dump_enabled_p ())
3605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3606 "failed: evolution of base is not affine.\n");
3607 return false;
3608 }
3609
3610 if (offset)
3611 {
3612 if (poffset)
3613 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3614 poffset);
3615 else
3616 poffset = offset;
3617 }
3618
3619 if (!poffset)
3620 {
3621 offset_iv.base = ssize_int (0);
3622 offset_iv.step = ssize_int (0);
3623 }
3624 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3625 &offset_iv, false))
3626 {
3627 if (dump_enabled_p ())
3628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3629 "evolution of offset is not affine.\n");
3630 return false;
3631 }
3632
3633 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3634 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3635 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3636 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3637 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3638
3639 outer_step = size_binop (PLUS_EXPR,
3640 fold_convert (ssizetype, base_iv.step),
3641 fold_convert (ssizetype, offset_iv.step));
3642
3643 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3644 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3645 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3646 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3647 STMT_VINFO_DR_OFFSET (stmt_info) =
3648 fold_convert (ssizetype, offset_iv.base);
3649 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3650 size_int (highest_pow2_factor (offset_iv.base));
3651
3652 if (dump_enabled_p ())
3653 {
3654 dump_printf_loc (MSG_NOTE, vect_location,
3655 "\touter base_address: ");
3656 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3657 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3658 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3659 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3660 STMT_VINFO_DR_OFFSET (stmt_info));
3661 dump_printf (MSG_NOTE,
3662 "\n\touter constant offset from base address: ");
3663 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3664 STMT_VINFO_DR_INIT (stmt_info));
3665 dump_printf (MSG_NOTE, "\n\touter step: ");
3666 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3667 STMT_VINFO_DR_STEP (stmt_info));
3668 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3669 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3670 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3671 dump_printf (MSG_NOTE, "\n");
3672 }
3673 }
3674
3675 if (STMT_VINFO_DATA_REF (stmt_info))
3676 {
3677 if (dump_enabled_p ())
3678 {
3679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3680 "not vectorized: more than one data ref "
3681 "in stmt: ");
3682 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3683 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3684 }
3685
3686 if (bb_vinfo)
3687 break;
3688
3689 if (gather || simd_lane_access)
3690 free_data_ref (dr);
3691 return false;
3692 }
3693
3694 STMT_VINFO_DATA_REF (stmt_info) = dr;
3695 if (simd_lane_access)
3696 {
3697 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3698 free_data_ref (datarefs[i]);
3699 datarefs[i] = dr;
3700 }
3701
3702 /* Set vectype for STMT. */
3703 scalar_type = TREE_TYPE (DR_REF (dr));
3704 STMT_VINFO_VECTYPE (stmt_info)
3705 = get_vectype_for_scalar_type (scalar_type);
3706 if (!STMT_VINFO_VECTYPE (stmt_info))
3707 {
3708 if (dump_enabled_p ())
3709 {
3710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3711 "not vectorized: no vectype for stmt: ");
3712 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3713 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3714 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3715 scalar_type);
3716 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3717 }
3718
3719 if (bb_vinfo)
3720 break;
3721
3722 if (gather || simd_lane_access)
3723 {
3724 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3725 if (gather)
3726 free_data_ref (dr);
3727 }
3728 return false;
3729 }
3730 else
3731 {
3732 if (dump_enabled_p ())
3733 {
3734 dump_printf_loc (MSG_NOTE, vect_location,
3735 "got vectype for stmt: ");
3736 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3737 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3738 STMT_VINFO_VECTYPE (stmt_info));
3739 dump_printf (MSG_NOTE, "\n");
3740 }
3741 }
3742
3743 /* Adjust the minimal vectorization factor according to the
3744 vector type. */
3745 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3746 if (vf > *min_vf)
3747 *min_vf = vf;
3748
3749 if (gather)
3750 {
3751 tree off;
3752
3753 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3754 if (gather
3755 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3756 gather = false;
3757 if (!gather)
3758 {
3759 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3760 free_data_ref (dr);
3761 if (dump_enabled_p ())
3762 {
3763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3764 "not vectorized: not suitable for gather "
3765 "load ");
3766 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3767 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3768 }
3769 return false;
3770 }
3771
3772 datarefs[i] = dr;
3773 STMT_VINFO_GATHER_P (stmt_info) = true;
3774 }
3775 else if (loop_vinfo
3776 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3777 {
3778 if (nested_in_vect_loop_p (loop, stmt)
3779 || !DR_IS_READ (dr))
3780 {
3781 if (dump_enabled_p ())
3782 {
3783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3784 "not vectorized: not suitable for strided "
3785 "load ");
3786 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3787 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3788 }
3789 return false;
3790 }
3791 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3792 }
3793 }
3794
3795 /* If we stopped analysis at the first dataref we could not analyze
3796 when trying to vectorize a basic-block mark the rest of the datarefs
3797 as not vectorizable and truncate the vector of datarefs. That
3798 avoids spending useless time in analyzing their dependence. */
3799 if (i != datarefs.length ())
3800 {
3801 gcc_assert (bb_vinfo != NULL);
3802 for (unsigned j = i; j < datarefs.length (); ++j)
3803 {
3804 data_reference_p dr = datarefs[j];
3805 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3806 free_data_ref (dr);
3807 }
3808 datarefs.truncate (i);
3809 }
3810
3811 return true;
3812 }
3813
3814
3815 /* Function vect_get_new_vect_var.
3816
3817 Returns a name for a new variable. The current naming scheme appends the
3818 prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
3819 the name of vectorizer generated variables, and appends that to NAME if
3820 provided. */
3821
3822 tree
3823 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3824 {
3825 const char *prefix;
3826 tree new_vect_var;
3827
3828 switch (var_kind)
3829 {
3830 case vect_simple_var:
3831 prefix = "vect";
3832 break;
3833 case vect_scalar_var:
3834 prefix = "stmp";
3835 break;
3836 case vect_pointer_var:
3837 prefix = "vectp";
3838 break;
3839 default:
3840 gcc_unreachable ();
3841 }
3842
3843 if (name)
3844 {
3845 char* tmp = concat (prefix, "_", name, NULL);
3846 new_vect_var = create_tmp_reg (type, tmp);
3847 free (tmp);
3848 }
3849 else
3850 new_vect_var = create_tmp_reg (type, prefix);
3851
3852 return new_vect_var;
3853 }
3854
3855
3856 /* Function vect_create_addr_base_for_vector_ref.
3857
3858 Create an expression that computes the address of the first memory location
3859 that will be accessed for a data reference.
3860
3861 Input:
3862 STMT: The statement containing the data reference.
3863 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3864 OFFSET: Optional. If supplied, it is be added to the initial address.
3865 LOOP: Specify relative to which loop-nest should the address be computed.
3866 For example, when the dataref is in an inner-loop nested in an
3867 outer-loop that is now being vectorized, LOOP can be either the
3868 outer-loop, or the inner-loop. The first memory location accessed
3869 by the following dataref ('in' points to short):
3870
3871 for (i=0; i<N; i++)
3872 for (j=0; j<M; j++)
3873 s += in[i+j]
3874
3875 is as follows:
3876 if LOOP=i_loop: &in (relative to i_loop)
3877 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3878 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3879 initial address. Unlike OFFSET, which is number of elements to
3880 be added, BYTE_OFFSET is measured in bytes.
3881
3882 Output:
3883 1. Return an SSA_NAME whose value is the address of the memory location of
3884 the first vector of the data reference.
3885 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3886 these statement(s) which define the returned SSA_NAME.
3887
3888 FORNOW: We are only handling array accesses with step 1. */
3889
3890 tree
3891 vect_create_addr_base_for_vector_ref (gimple stmt,
3892 gimple_seq *new_stmt_list,
3893 tree offset,
3894 struct loop *loop,
3895 tree byte_offset)
3896 {
3897 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3898 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3899 tree data_ref_base;
3900 const char *base_name;
3901 tree addr_base;
3902 tree dest;
3903 gimple_seq seq = NULL;
3904 tree base_offset;
3905 tree init;
3906 tree vect_ptr_type;
3907 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3908 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3909
3910 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3911 {
3912 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3913
3914 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3915
3916 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3917 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3918 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3919 }
3920 else
3921 {
3922 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3923 base_offset = unshare_expr (DR_OFFSET (dr));
3924 init = unshare_expr (DR_INIT (dr));
3925 }
3926
3927 if (loop_vinfo)
3928 base_name = get_name (data_ref_base);
3929 else
3930 {
3931 base_offset = ssize_int (0);
3932 init = ssize_int (0);
3933 base_name = get_name (DR_REF (dr));
3934 }
3935
3936 /* Create base_offset */
3937 base_offset = size_binop (PLUS_EXPR,
3938 fold_convert (sizetype, base_offset),
3939 fold_convert (sizetype, init));
3940
3941 if (offset)
3942 {
3943 offset = fold_build2 (MULT_EXPR, sizetype,
3944 fold_convert (sizetype, offset), step);
3945 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3946 base_offset, offset);
3947 }
3948 if (byte_offset)
3949 {
3950 byte_offset = fold_convert (sizetype, byte_offset);
3951 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3952 base_offset, byte_offset);
3953 }
3954
3955 /* base + base_offset */
3956 if (loop_vinfo)
3957 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3958 else
3959 {
3960 addr_base = build1 (ADDR_EXPR,
3961 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3962 unshare_expr (DR_REF (dr)));
3963 }
3964
3965 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3966 addr_base = fold_convert (vect_ptr_type, addr_base);
3967 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3968 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3969 gimple_seq_add_seq (new_stmt_list, seq);
3970
3971 if (DR_PTR_INFO (dr)
3972 && TREE_CODE (addr_base) == SSA_NAME)
3973 {
3974 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3975 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3976 int misalign = DR_MISALIGNMENT (dr);
3977 if (offset || byte_offset || (misalign == -1))
3978 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3979 else
3980 set_ptr_info_alignment (SSA_NAME_PTR_INFO (addr_base), align, misalign);
3981 }
3982
3983 if (dump_enabled_p ())
3984 {
3985 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3986 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3987 dump_printf (MSG_NOTE, "\n");
3988 }
3989
3990 return addr_base;
3991 }
3992
3993
3994 /* Function vect_create_data_ref_ptr.
3995
3996 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3997 location accessed in the loop by STMT, along with the def-use update
3998 chain to appropriately advance the pointer through the loop iterations.
3999 Also set aliasing information for the pointer. This pointer is used by
4000 the callers to this function to create a memory reference expression for
4001 vector load/store access.
4002
4003 Input:
4004 1. STMT: a stmt that references memory. Expected to be of the form
4005 GIMPLE_ASSIGN <name, data-ref> or
4006 GIMPLE_ASSIGN <data-ref, name>.
4007 2. AGGR_TYPE: the type of the reference, which should be either a vector
4008 or an array.
4009 3. AT_LOOP: the loop where the vector memref is to be created.
4010 4. OFFSET (optional): an offset to be added to the initial address accessed
4011 by the data-ref in STMT.
4012 5. BSI: location where the new stmts are to be placed if there is no loop
4013 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4014 pointing to the initial address.
4015 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4016 to the initial address accessed by the data-ref in STMT. This is
4017 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4018 in bytes.
4019
4020 Output:
4021 1. Declare a new ptr to vector_type, and have it point to the base of the
4022 data reference (initial addressed accessed by the data reference).
4023 For example, for vector of type V8HI, the following code is generated:
4024
4025 v8hi *ap;
4026 ap = (v8hi *)initial_address;
4027
4028 if OFFSET is not supplied:
4029 initial_address = &a[init];
4030 if OFFSET is supplied:
4031 initial_address = &a[init + OFFSET];
4032 if BYTE_OFFSET is supplied:
4033 initial_address = &a[init] + BYTE_OFFSET;
4034
4035 Return the initial_address in INITIAL_ADDRESS.
4036
4037 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4038 update the pointer in each iteration of the loop.
4039
4040 Return the increment stmt that updates the pointer in PTR_INCR.
4041
4042 3. Set INV_P to true if the access pattern of the data reference in the
4043 vectorized loop is invariant. Set it to false otherwise.
4044
4045 4. Return the pointer. */
4046
4047 tree
4048 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4049 tree offset, tree *initial_address,
4050 gimple_stmt_iterator *gsi, gimple *ptr_incr,
4051 bool only_init, bool *inv_p, tree byte_offset)
4052 {
4053 const char *base_name;
4054 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4055 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4056 struct loop *loop = NULL;
4057 bool nested_in_vect_loop = false;
4058 struct loop *containing_loop = NULL;
4059 tree aggr_ptr_type;
4060 tree aggr_ptr;
4061 tree new_temp;
4062 gimple vec_stmt;
4063 gimple_seq new_stmt_list = NULL;
4064 edge pe = NULL;
4065 basic_block new_bb;
4066 tree aggr_ptr_init;
4067 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4068 tree aptr;
4069 gimple_stmt_iterator incr_gsi;
4070 bool insert_after;
4071 tree indx_before_incr, indx_after_incr;
4072 gimple incr;
4073 tree step;
4074 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4075
4076 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4077 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4078
4079 if (loop_vinfo)
4080 {
4081 loop = LOOP_VINFO_LOOP (loop_vinfo);
4082 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4083 containing_loop = (gimple_bb (stmt))->loop_father;
4084 pe = loop_preheader_edge (loop);
4085 }
4086 else
4087 {
4088 gcc_assert (bb_vinfo);
4089 only_init = true;
4090 *ptr_incr = NULL;
4091 }
4092
4093 /* Check the step (evolution) of the load in LOOP, and record
4094 whether it's invariant. */
4095 if (nested_in_vect_loop)
4096 step = STMT_VINFO_DR_STEP (stmt_info);
4097 else
4098 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4099
4100 if (integer_zerop (step))
4101 *inv_p = true;
4102 else
4103 *inv_p = false;
4104
4105 /* Create an expression for the first address accessed by this load
4106 in LOOP. */
4107 base_name = get_name (DR_BASE_ADDRESS (dr));
4108
4109 if (dump_enabled_p ())
4110 {
4111 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4112 dump_printf_loc (MSG_NOTE, vect_location,
4113 "create %s-pointer variable to type: ",
4114 get_tree_code_name (TREE_CODE (aggr_type)));
4115 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4116 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4117 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4118 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4119 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4120 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4121 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4122 else
4123 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4124 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4125 dump_printf (MSG_NOTE, "\n");
4126 }
4127
4128 /* (1) Create the new aggregate-pointer variable.
4129 Vector and array types inherit the alias set of their component
4130 type by default so we need to use a ref-all pointer if the data
4131 reference does not conflict with the created aggregated data
4132 reference because it is not addressable. */
4133 bool need_ref_all = false;
4134 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4135 get_alias_set (DR_REF (dr))))
4136 need_ref_all = true;
4137 /* Likewise for any of the data references in the stmt group. */
4138 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4139 {
4140 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4141 do
4142 {
4143 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4144 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4145 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4146 get_alias_set (DR_REF (sdr))))
4147 {
4148 need_ref_all = true;
4149 break;
4150 }
4151 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4152 }
4153 while (orig_stmt);
4154 }
4155 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4156 need_ref_all);
4157 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4158
4159
4160 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4161 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4162 def-use update cycles for the pointer: one relative to the outer-loop
4163 (LOOP), which is what steps (3) and (4) below do. The other is relative
4164 to the inner-loop (which is the inner-most loop containing the dataref),
4165 and this is done be step (5) below.
4166
4167 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4168 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4169 redundant. Steps (3),(4) create the following:
4170
4171 vp0 = &base_addr;
4172 LOOP: vp1 = phi(vp0,vp2)
4173 ...
4174 ...
4175 vp2 = vp1 + step
4176 goto LOOP
4177
4178 If there is an inner-loop nested in loop, then step (5) will also be
4179 applied, and an additional update in the inner-loop will be created:
4180
4181 vp0 = &base_addr;
4182 LOOP: vp1 = phi(vp0,vp2)
4183 ...
4184 inner: vp3 = phi(vp1,vp4)
4185 vp4 = vp3 + inner_step
4186 if () goto inner
4187 ...
4188 vp2 = vp1 + step
4189 if () goto LOOP */
4190
4191 /* (2) Calculate the initial address of the aggregate-pointer, and set
4192 the aggregate-pointer to point to it before the loop. */
4193
4194 /* Create: (&(base[init_val+offset]+byte_offset) in the loop preheader. */
4195
4196 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4197 offset, loop, byte_offset);
4198 if (new_stmt_list)
4199 {
4200 if (pe)
4201 {
4202 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4203 gcc_assert (!new_bb);
4204 }
4205 else
4206 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4207 }
4208
4209 *initial_address = new_temp;
4210
4211 /* Create: p = (aggr_type *) initial_base */
4212 if (TREE_CODE (new_temp) != SSA_NAME
4213 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4214 {
4215 vec_stmt = gimple_build_assign (aggr_ptr,
4216 fold_convert (aggr_ptr_type, new_temp));
4217 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4218 /* Copy the points-to information if it exists. */
4219 if (DR_PTR_INFO (dr))
4220 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
4221 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4222 if (pe)
4223 {
4224 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4225 gcc_assert (!new_bb);
4226 }
4227 else
4228 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4229 }
4230 else
4231 aggr_ptr_init = new_temp;
4232
4233 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4234 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4235 inner-loop nested in LOOP (during outer-loop vectorization). */
4236
4237 /* No update in loop is required. */
4238 if (only_init && (!loop_vinfo || at_loop == loop))
4239 aptr = aggr_ptr_init;
4240 else
4241 {
4242 /* The step of the aggregate pointer is the type size. */
4243 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4244 /* One exception to the above is when the scalar step of the load in
4245 LOOP is zero. In this case the step here is also zero. */
4246 if (*inv_p)
4247 iv_step = size_zero_node;
4248 else if (tree_int_cst_sgn (step) == -1)
4249 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4250
4251 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4252
4253 create_iv (aggr_ptr_init,
4254 fold_convert (aggr_ptr_type, iv_step),
4255 aggr_ptr, loop, &incr_gsi, insert_after,
4256 &indx_before_incr, &indx_after_incr);
4257 incr = gsi_stmt (incr_gsi);
4258 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4259
4260 /* Copy the points-to information if it exists. */
4261 if (DR_PTR_INFO (dr))
4262 {
4263 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4264 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4265 }
4266 if (ptr_incr)
4267 *ptr_incr = incr;
4268
4269 aptr = indx_before_incr;
4270 }
4271
4272 if (!nested_in_vect_loop || only_init)
4273 return aptr;
4274
4275
4276 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4277 nested in LOOP, if exists. */
4278
4279 gcc_assert (nested_in_vect_loop);
4280 if (!only_init)
4281 {
4282 standard_iv_increment_position (containing_loop, &incr_gsi,
4283 &insert_after);
4284 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4285 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4286 &indx_after_incr);
4287 incr = gsi_stmt (incr_gsi);
4288 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4289
4290 /* Copy the points-to information if it exists. */
4291 if (DR_PTR_INFO (dr))
4292 {
4293 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
4294 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
4295 }
4296 if (ptr_incr)
4297 *ptr_incr = incr;
4298
4299 return indx_before_incr;
4300 }
4301 else
4302 gcc_unreachable ();
4303 }
4304
4305
4306 /* Function bump_vector_ptr
4307
4308 Increment a pointer (to a vector type) by vector-size. If requested,
4309 i.e. if PTR-INCR is given, then also connect the new increment stmt
4310 to the existing def-use update-chain of the pointer, by modifying
4311 the PTR_INCR as illustrated below:
4312
4313 The pointer def-use update-chain before this function:
4314 DATAREF_PTR = phi (p_0, p_2)
4315 ....
4316 PTR_INCR: p_2 = DATAREF_PTR + step
4317
4318 The pointer def-use update-chain after this function:
4319 DATAREF_PTR = phi (p_0, p_2)
4320 ....
4321 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4322 ....
4323 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4324
4325 Input:
4326 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4327 in the loop.
4328 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4329 the loop. The increment amount across iterations is expected
4330 to be vector_size.
4331 BSI - location where the new update stmt is to be placed.
4332 STMT - the original scalar memory-access stmt that is being vectorized.
4333 BUMP - optional. The offset by which to bump the pointer. If not given,
4334 the offset is assumed to be vector_size.
4335
4336 Output: Return NEW_DATAREF_PTR as illustrated above.
4337
4338 */
4339
4340 tree
4341 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4342 gimple stmt, tree bump)
4343 {
4344 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4345 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4346 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4347 tree update = TYPE_SIZE_UNIT (vectype);
4348 gassign *incr_stmt;
4349 ssa_op_iter iter;
4350 use_operand_p use_p;
4351 tree new_dataref_ptr;
4352
4353 if (bump)
4354 update = bump;
4355
4356 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4357 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4358 dataref_ptr, update);
4359 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4360
4361 /* Copy the points-to information if it exists. */
4362 if (DR_PTR_INFO (dr))
4363 {
4364 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4365 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4366 }
4367
4368 if (!ptr_incr)
4369 return new_dataref_ptr;
4370
4371 /* Update the vector-pointer's cross-iteration increment. */
4372 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4373 {
4374 tree use = USE_FROM_PTR (use_p);
4375
4376 if (use == dataref_ptr)
4377 SET_USE (use_p, new_dataref_ptr);
4378 else
4379 gcc_assert (tree_int_cst_compare (use, update) == 0);
4380 }
4381
4382 return new_dataref_ptr;
4383 }
4384
4385
4386 /* Function vect_create_destination_var.
4387
4388 Create a new temporary of type VECTYPE. */
4389
4390 tree
4391 vect_create_destination_var (tree scalar_dest, tree vectype)
4392 {
4393 tree vec_dest;
4394 const char *name;
4395 char *new_name;
4396 tree type;
4397 enum vect_var_kind kind;
4398
4399 kind = vectype ? vect_simple_var : vect_scalar_var;
4400 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4401
4402 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4403
4404 name = get_name (scalar_dest);
4405 if (name)
4406 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4407 else
4408 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4409 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4410 free (new_name);
4411
4412 return vec_dest;
4413 }
4414
4415 /* Function vect_grouped_store_supported.
4416
4417 Returns TRUE if interleave high and interleave low permutations
4418 are supported, and FALSE otherwise. */
4419
4420 bool
4421 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4422 {
4423 machine_mode mode = TYPE_MODE (vectype);
4424
4425 /* vect_permute_store_chain requires the group size to be equal to 3 or
4426 be a power of two. */
4427 if (count != 3 && exact_log2 (count) == -1)
4428 {
4429 if (dump_enabled_p ())
4430 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4431 "the size of the group of accesses"
4432 " is not a power of 2 or not eqaul to 3\n");
4433 return false;
4434 }
4435
4436 /* Check that the permutation is supported. */
4437 if (VECTOR_MODE_P (mode))
4438 {
4439 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4440 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4441
4442 if (count == 3)
4443 {
4444 unsigned int j0 = 0, j1 = 0, j2 = 0;
4445 unsigned int i, j;
4446
4447 for (j = 0; j < 3; j++)
4448 {
4449 int nelt0 = ((3 - j) * nelt) % 3;
4450 int nelt1 = ((3 - j) * nelt + 1) % 3;
4451 int nelt2 = ((3 - j) * nelt + 2) % 3;
4452 for (i = 0; i < nelt; i++)
4453 {
4454 if (3 * i + nelt0 < nelt)
4455 sel[3 * i + nelt0] = j0++;
4456 if (3 * i + nelt1 < nelt)
4457 sel[3 * i + nelt1] = nelt + j1++;
4458 if (3 * i + nelt2 < nelt)
4459 sel[3 * i + nelt2] = 0;
4460 }
4461 if (!can_vec_perm_p (mode, false, sel))
4462 {
4463 if (dump_enabled_p ())
4464 dump_printf (MSG_MISSED_OPTIMIZATION,
4465 "permutaion op not supported by target.\n");
4466 return false;
4467 }
4468
4469 for (i = 0; i < nelt; i++)
4470 {
4471 if (3 * i + nelt0 < nelt)
4472 sel[3 * i + nelt0] = 3 * i + nelt0;
4473 if (3 * i + nelt1 < nelt)
4474 sel[3 * i + nelt1] = 3 * i + nelt1;
4475 if (3 * i + nelt2 < nelt)
4476 sel[3 * i + nelt2] = nelt + j2++;
4477 }
4478 if (!can_vec_perm_p (mode, false, sel))
4479 {
4480 if (dump_enabled_p ())
4481 dump_printf (MSG_MISSED_OPTIMIZATION,
4482 "permutaion op not supported by target.\n");
4483 return false;
4484 }
4485 }
4486 return true;
4487 }
4488 else
4489 {
4490 /* If length is not equal to 3 then only power of 2 is supported. */
4491 gcc_assert (exact_log2 (count) != -1);
4492
4493 for (i = 0; i < nelt / 2; i++)
4494 {
4495 sel[i * 2] = i;
4496 sel[i * 2 + 1] = i + nelt;
4497 }
4498 if (can_vec_perm_p (mode, false, sel))
4499 {
4500 for (i = 0; i < nelt; i++)
4501 sel[i] += nelt / 2;
4502 if (can_vec_perm_p (mode, false, sel))
4503 return true;
4504 }
4505 }
4506 }
4507
4508 if (dump_enabled_p ())
4509 dump_printf (MSG_MISSED_OPTIMIZATION,
4510 "permutaion op not supported by target.\n");
4511 return false;
4512 }
4513
4514
4515 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4516 type VECTYPE. */
4517
4518 bool
4519 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4520 {
4521 return vect_lanes_optab_supported_p ("vec_store_lanes",
4522 vec_store_lanes_optab,
4523 vectype, count);
4524 }
4525
4526
4527 /* Function vect_permute_store_chain.
4528
4529 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4530 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4531 the data correctly for the stores. Return the final references for stores
4532 in RESULT_CHAIN.
4533
4534 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4535 The input is 4 vectors each containing 8 elements. We assign a number to
4536 each element, the input sequence is:
4537
4538 1st vec: 0 1 2 3 4 5 6 7
4539 2nd vec: 8 9 10 11 12 13 14 15
4540 3rd vec: 16 17 18 19 20 21 22 23
4541 4th vec: 24 25 26 27 28 29 30 31
4542
4543 The output sequence should be:
4544
4545 1st vec: 0 8 16 24 1 9 17 25
4546 2nd vec: 2 10 18 26 3 11 19 27
4547 3rd vec: 4 12 20 28 5 13 21 30
4548 4th vec: 6 14 22 30 7 15 23 31
4549
4550 i.e., we interleave the contents of the four vectors in their order.
4551
4552 We use interleave_high/low instructions to create such output. The input of
4553 each interleave_high/low operation is two vectors:
4554 1st vec 2nd vec
4555 0 1 2 3 4 5 6 7
4556 the even elements of the result vector are obtained left-to-right from the
4557 high/low elements of the first vector. The odd elements of the result are
4558 obtained left-to-right from the high/low elements of the second vector.
4559 The output of interleave_high will be: 0 4 1 5
4560 and of interleave_low: 2 6 3 7
4561
4562
4563 The permutation is done in log LENGTH stages. In each stage interleave_high
4564 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4565 where the first argument is taken from the first half of DR_CHAIN and the
4566 second argument from it's second half.
4567 In our example,
4568
4569 I1: interleave_high (1st vec, 3rd vec)
4570 I2: interleave_low (1st vec, 3rd vec)
4571 I3: interleave_high (2nd vec, 4th vec)
4572 I4: interleave_low (2nd vec, 4th vec)
4573
4574 The output for the first stage is:
4575
4576 I1: 0 16 1 17 2 18 3 19
4577 I2: 4 20 5 21 6 22 7 23
4578 I3: 8 24 9 25 10 26 11 27
4579 I4: 12 28 13 29 14 30 15 31
4580
4581 The output of the second stage, i.e. the final result is:
4582
4583 I1: 0 8 16 24 1 9 17 25
4584 I2: 2 10 18 26 3 11 19 27
4585 I3: 4 12 20 28 5 13 21 30
4586 I4: 6 14 22 30 7 15 23 31. */
4587
4588 void
4589 vect_permute_store_chain (vec<tree> dr_chain,
4590 unsigned int length,
4591 gimple stmt,
4592 gimple_stmt_iterator *gsi,
4593 vec<tree> *result_chain)
4594 {
4595 tree vect1, vect2, high, low;
4596 gimple perm_stmt;
4597 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4598 tree perm_mask_low, perm_mask_high;
4599 tree data_ref;
4600 tree perm3_mask_low, perm3_mask_high;
4601 unsigned int i, n, log_length = exact_log2 (length);
4602 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4603 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4604
4605 result_chain->quick_grow (length);
4606 memcpy (result_chain->address (), dr_chain.address (),
4607 length * sizeof (tree));
4608
4609 if (length == 3)
4610 {
4611 unsigned int j0 = 0, j1 = 0, j2 = 0;
4612
4613 for (j = 0; j < 3; j++)
4614 {
4615 int nelt0 = ((3 - j) * nelt) % 3;
4616 int nelt1 = ((3 - j) * nelt + 1) % 3;
4617 int nelt2 = ((3 - j) * nelt + 2) % 3;
4618
4619 for (i = 0; i < nelt; i++)
4620 {
4621 if (3 * i + nelt0 < nelt)
4622 sel[3 * i + nelt0] = j0++;
4623 if (3 * i + nelt1 < nelt)
4624 sel[3 * i + nelt1] = nelt + j1++;
4625 if (3 * i + nelt2 < nelt)
4626 sel[3 * i + nelt2] = 0;
4627 }
4628 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4629
4630 for (i = 0; i < nelt; i++)
4631 {
4632 if (3 * i + nelt0 < nelt)
4633 sel[3 * i + nelt0] = 3 * i + nelt0;
4634 if (3 * i + nelt1 < nelt)
4635 sel[3 * i + nelt1] = 3 * i + nelt1;
4636 if (3 * i + nelt2 < nelt)
4637 sel[3 * i + nelt2] = nelt + j2++;
4638 }
4639 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4640
4641 vect1 = dr_chain[0];
4642 vect2 = dr_chain[1];
4643
4644 /* Create interleaving stmt:
4645 low = VEC_PERM_EXPR <vect1, vect2,
4646 {j, nelt, *, j + 1, nelt + j + 1, *,
4647 j + 2, nelt + j + 2, *, ...}> */
4648 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4649 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4650 vect2, perm3_mask_low);
4651 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4652
4653 vect1 = data_ref;
4654 vect2 = dr_chain[2];
4655 /* Create interleaving stmt:
4656 low = VEC_PERM_EXPR <vect1, vect2,
4657 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4658 6, 7, nelt + j + 2, ...}> */
4659 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4660 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4661 vect2, perm3_mask_high);
4662 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4663 (*result_chain)[j] = data_ref;
4664 }
4665 }
4666 else
4667 {
4668 /* If length is not equal to 3 then only power of 2 is supported. */
4669 gcc_assert (exact_log2 (length) != -1);
4670
4671 for (i = 0, n = nelt / 2; i < n; i++)
4672 {
4673 sel[i * 2] = i;
4674 sel[i * 2 + 1] = i + nelt;
4675 }
4676 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4677
4678 for (i = 0; i < nelt; i++)
4679 sel[i] += nelt / 2;
4680 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4681
4682 for (i = 0, n = log_length; i < n; i++)
4683 {
4684 for (j = 0; j < length/2; j++)
4685 {
4686 vect1 = dr_chain[j];
4687 vect2 = dr_chain[j+length/2];
4688
4689 /* Create interleaving stmt:
4690 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4691 ...}> */
4692 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4693 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4694 vect2, perm_mask_high);
4695 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4696 (*result_chain)[2*j] = high;
4697
4698 /* Create interleaving stmt:
4699 low = VEC_PERM_EXPR <vect1, vect2,
4700 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4701 ...}> */
4702 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4703 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4704 vect2, perm_mask_low);
4705 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4706 (*result_chain)[2*j+1] = low;
4707 }
4708 memcpy (dr_chain.address (), result_chain->address (),
4709 length * sizeof (tree));
4710 }
4711 }
4712 }
4713
4714 /* Function vect_setup_realignment
4715
4716 This function is called when vectorizing an unaligned load using
4717 the dr_explicit_realign[_optimized] scheme.
4718 This function generates the following code at the loop prolog:
4719
4720 p = initial_addr;
4721 x msq_init = *(floor(p)); # prolog load
4722 realignment_token = call target_builtin;
4723 loop:
4724 x msq = phi (msq_init, ---)
4725
4726 The stmts marked with x are generated only for the case of
4727 dr_explicit_realign_optimized.
4728
4729 The code above sets up a new (vector) pointer, pointing to the first
4730 location accessed by STMT, and a "floor-aligned" load using that pointer.
4731 It also generates code to compute the "realignment-token" (if the relevant
4732 target hook was defined), and creates a phi-node at the loop-header bb
4733 whose arguments are the result of the prolog-load (created by this
4734 function) and the result of a load that takes place in the loop (to be
4735 created by the caller to this function).
4736
4737 For the case of dr_explicit_realign_optimized:
4738 The caller to this function uses the phi-result (msq) to create the
4739 realignment code inside the loop, and sets up the missing phi argument,
4740 as follows:
4741 loop:
4742 msq = phi (msq_init, lsq)
4743 lsq = *(floor(p')); # load in loop
4744 result = realign_load (msq, lsq, realignment_token);
4745
4746 For the case of dr_explicit_realign:
4747 loop:
4748 msq = *(floor(p)); # load in loop
4749 p' = p + (VS-1);
4750 lsq = *(floor(p')); # load in loop
4751 result = realign_load (msq, lsq, realignment_token);
4752
4753 Input:
4754 STMT - (scalar) load stmt to be vectorized. This load accesses
4755 a memory location that may be unaligned.
4756 BSI - place where new code is to be inserted.
4757 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4758 is used.
4759
4760 Output:
4761 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4762 target hook, if defined.
4763 Return value - the result of the loop-header phi node. */
4764
4765 tree
4766 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4767 tree *realignment_token,
4768 enum dr_alignment_support alignment_support_scheme,
4769 tree init_addr,
4770 struct loop **at_loop)
4771 {
4772 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4773 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4774 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4775 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4776 struct loop *loop = NULL;
4777 edge pe = NULL;
4778 tree scalar_dest = gimple_assign_lhs (stmt);
4779 tree vec_dest;
4780 gimple inc;
4781 tree ptr;
4782 tree data_ref;
4783 basic_block new_bb;
4784 tree msq_init = NULL_TREE;
4785 tree new_temp;
4786 gphi *phi_stmt;
4787 tree msq = NULL_TREE;
4788 gimple_seq stmts = NULL;
4789 bool inv_p;
4790 bool compute_in_loop = false;
4791 bool nested_in_vect_loop = false;
4792 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4793 struct loop *loop_for_initial_load = NULL;
4794
4795 if (loop_vinfo)
4796 {
4797 loop = LOOP_VINFO_LOOP (loop_vinfo);
4798 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4799 }
4800
4801 gcc_assert (alignment_support_scheme == dr_explicit_realign
4802 || alignment_support_scheme == dr_explicit_realign_optimized);
4803
4804 /* We need to generate three things:
4805 1. the misalignment computation
4806 2. the extra vector load (for the optimized realignment scheme).
4807 3. the phi node for the two vectors from which the realignment is
4808 done (for the optimized realignment scheme). */
4809
4810 /* 1. Determine where to generate the misalignment computation.
4811
4812 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4813 calculation will be generated by this function, outside the loop (in the
4814 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4815 caller, inside the loop.
4816
4817 Background: If the misalignment remains fixed throughout the iterations of
4818 the loop, then both realignment schemes are applicable, and also the
4819 misalignment computation can be done outside LOOP. This is because we are
4820 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4821 are a multiple of VS (the Vector Size), and therefore the misalignment in
4822 different vectorized LOOP iterations is always the same.
4823 The problem arises only if the memory access is in an inner-loop nested
4824 inside LOOP, which is now being vectorized using outer-loop vectorization.
4825 This is the only case when the misalignment of the memory access may not
4826 remain fixed throughout the iterations of the inner-loop (as explained in
4827 detail in vect_supportable_dr_alignment). In this case, not only is the
4828 optimized realignment scheme not applicable, but also the misalignment
4829 computation (and generation of the realignment token that is passed to
4830 REALIGN_LOAD) have to be done inside the loop.
4831
4832 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4833 or not, which in turn determines if the misalignment is computed inside
4834 the inner-loop, or outside LOOP. */
4835
4836 if (init_addr != NULL_TREE || !loop_vinfo)
4837 {
4838 compute_in_loop = true;
4839 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4840 }
4841
4842
4843 /* 2. Determine where to generate the extra vector load.
4844
4845 For the optimized realignment scheme, instead of generating two vector
4846 loads in each iteration, we generate a single extra vector load in the
4847 preheader of the loop, and in each iteration reuse the result of the
4848 vector load from the previous iteration. In case the memory access is in
4849 an inner-loop nested inside LOOP, which is now being vectorized using
4850 outer-loop vectorization, we need to determine whether this initial vector
4851 load should be generated at the preheader of the inner-loop, or can be
4852 generated at the preheader of LOOP. If the memory access has no evolution
4853 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4854 to be generated inside LOOP (in the preheader of the inner-loop). */
4855
4856 if (nested_in_vect_loop)
4857 {
4858 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4859 bool invariant_in_outerloop =
4860 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4861 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4862 }
4863 else
4864 loop_for_initial_load = loop;
4865 if (at_loop)
4866 *at_loop = loop_for_initial_load;
4867
4868 if (loop_for_initial_load)
4869 pe = loop_preheader_edge (loop_for_initial_load);
4870
4871 /* 3. For the case of the optimized realignment, create the first vector
4872 load at the loop preheader. */
4873
4874 if (alignment_support_scheme == dr_explicit_realign_optimized)
4875 {
4876 /* Create msq_init = *(floor(p1)) in the loop preheader */
4877 gassign *new_stmt;
4878
4879 gcc_assert (!compute_in_loop);
4880 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4881 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4882 NULL_TREE, &init_addr, NULL, &inc,
4883 true, &inv_p);
4884 new_temp = copy_ssa_name (ptr);
4885 new_stmt = gimple_build_assign
4886 (new_temp, BIT_AND_EXPR, ptr,
4887 build_int_cst (TREE_TYPE (ptr),
4888 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4889 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4890 gcc_assert (!new_bb);
4891 data_ref
4892 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4893 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4894 new_stmt = gimple_build_assign (vec_dest, data_ref);
4895 new_temp = make_ssa_name (vec_dest, new_stmt);
4896 gimple_assign_set_lhs (new_stmt, new_temp);
4897 if (pe)
4898 {
4899 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4900 gcc_assert (!new_bb);
4901 }
4902 else
4903 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4904
4905 msq_init = gimple_assign_lhs (new_stmt);
4906 }
4907
4908 /* 4. Create realignment token using a target builtin, if available.
4909 It is done either inside the containing loop, or before LOOP (as
4910 determined above). */
4911
4912 if (targetm.vectorize.builtin_mask_for_load)
4913 {
4914 gcall *new_stmt;
4915 tree builtin_decl;
4916
4917 /* Compute INIT_ADDR - the initial addressed accessed by this memref. */
4918 if (!init_addr)
4919 {
4920 /* Generate the INIT_ADDR computation outside LOOP. */
4921 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4922 NULL_TREE, loop);
4923 if (loop)
4924 {
4925 pe = loop_preheader_edge (loop);
4926 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4927 gcc_assert (!new_bb);
4928 }
4929 else
4930 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4931 }
4932
4933 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4934 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4935 vec_dest =
4936 vect_create_destination_var (scalar_dest,
4937 gimple_call_return_type (new_stmt));
4938 new_temp = make_ssa_name (vec_dest, new_stmt);
4939 gimple_call_set_lhs (new_stmt, new_temp);
4940
4941 if (compute_in_loop)
4942 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4943 else
4944 {
4945 /* Generate the misalignment computation outside LOOP. */
4946 pe = loop_preheader_edge (loop);
4947 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4948 gcc_assert (!new_bb);
4949 }
4950
4951 *realignment_token = gimple_call_lhs (new_stmt);
4952
4953 /* The result of the CALL_EXPR to this builtin is determined from
4954 the value of the parameter and no global variables are touched
4955 which makes the builtin a "const" function. Requiring the
4956 builtin to have the "const" attribute makes it unnecessary
4957 to call mark_call_clobbered. */
4958 gcc_assert (TREE_READONLY (builtin_decl));
4959 }
4960
4961 if (alignment_support_scheme == dr_explicit_realign)
4962 return msq;
4963
4964 gcc_assert (!compute_in_loop);
4965 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4966
4967
4968 /* 5. Create msq = phi <msq_init, lsq> in loop */
4969
4970 pe = loop_preheader_edge (containing_loop);
4971 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4972 msq = make_ssa_name (vec_dest);
4973 phi_stmt = create_phi_node (msq, containing_loop->header);
4974 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4975
4976 return msq;
4977 }
4978
4979
4980 /* Function vect_grouped_load_supported.
4981
4982 Returns TRUE if even and odd permutations are supported,
4983 and FALSE otherwise. */
4984
4985 bool
4986 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4987 {
4988 machine_mode mode = TYPE_MODE (vectype);
4989
4990 /* vect_permute_load_chain requires the group size to be equal to 3 or
4991 be a power of two. */
4992 if (count != 3 && exact_log2 (count) == -1)
4993 {
4994 if (dump_enabled_p ())
4995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4996 "the size of the group of accesses"
4997 " is not a power of 2 or not equal to 3\n");
4998 return false;
4999 }
5000
5001 /* Check that the permutation is supported. */
5002 if (VECTOR_MODE_P (mode))
5003 {
5004 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5005 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5006
5007 if (count == 3)
5008 {
5009 unsigned int k;
5010 for (k = 0; k < 3; k++)
5011 {
5012 for (i = 0; i < nelt; i++)
5013 if (3 * i + k < 2 * nelt)
5014 sel[i] = 3 * i + k;
5015 else
5016 sel[i] = 0;
5017 if (!can_vec_perm_p (mode, false, sel))
5018 {
5019 if (dump_enabled_p ())
5020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5021 "shuffle of 3 loads is not supported by"
5022 " target\n");
5023 return false;
5024 }
5025 for (i = 0, j = 0; i < nelt; i++)
5026 if (3 * i + k < 2 * nelt)
5027 sel[i] = i;
5028 else
5029 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5030 if (!can_vec_perm_p (mode, false, sel))
5031 {
5032 if (dump_enabled_p ())
5033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5034 "shuffle of 3 loads is not supported by"
5035 " target\n");
5036 return false;
5037 }
5038 }
5039 return true;
5040 }
5041 else
5042 {
5043 /* If length is not equal to 3 then only power of 2 is supported. */
5044 gcc_assert (exact_log2 (count) != -1);
5045 for (i = 0; i < nelt; i++)
5046 sel[i] = i * 2;
5047 if (can_vec_perm_p (mode, false, sel))
5048 {
5049 for (i = 0; i < nelt; i++)
5050 sel[i] = i * 2 + 1;
5051 if (can_vec_perm_p (mode, false, sel))
5052 return true;
5053 }
5054 }
5055 }
5056
5057 if (dump_enabled_p ())
5058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5059 "extract even/odd not supported by target\n");
5060 return false;
5061 }
5062
5063 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5064 type VECTYPE. */
5065
5066 bool
5067 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5068 {
5069 return vect_lanes_optab_supported_p ("vec_load_lanes",
5070 vec_load_lanes_optab,
5071 vectype, count);
5072 }
5073
5074 /* Function vect_permute_load_chain.
5075
5076 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5077 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5078 the input data correctly. Return the final references for loads in
5079 RESULT_CHAIN.
5080
5081 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5082 The input is 4 vectors each containing 8 elements. We assign a number to each
5083 element, the input sequence is:
5084
5085 1st vec: 0 1 2 3 4 5 6 7
5086 2nd vec: 8 9 10 11 12 13 14 15
5087 3rd vec: 16 17 18 19 20 21 22 23
5088 4th vec: 24 25 26 27 28 29 30 31
5089
5090 The output sequence should be:
5091
5092 1st vec: 0 4 8 12 16 20 24 28
5093 2nd vec: 1 5 9 13 17 21 25 29
5094 3rd vec: 2 6 10 14 18 22 26 30
5095 4th vec: 3 7 11 15 19 23 27 31
5096
5097 i.e., the first output vector should contain the first elements of each
5098 interleaving group, etc.
5099
5100 We use extract_even/odd instructions to create such output. The input of
5101 each extract_even/odd operation is two vectors
5102 1st vec 2nd vec
5103 0 1 2 3 4 5 6 7
5104
5105 and the output is the vector of extracted even/odd elements. The output of
5106 extract_even will be: 0 2 4 6
5107 and of extract_odd: 1 3 5 7
5108
5109
5110 The permutation is done in log LENGTH stages. In each stage extract_even
5111 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5112 their order. In our example,
5113
5114 E1: extract_even (1st vec, 2nd vec)
5115 E2: extract_odd (1st vec, 2nd vec)
5116 E3: extract_even (3rd vec, 4th vec)
5117 E4: extract_odd (3rd vec, 4th vec)
5118
5119 The output for the first stage will be:
5120
5121 E1: 0 2 4 6 8 10 12 14
5122 E2: 1 3 5 7 9 11 13 15
5123 E3: 16 18 20 22 24 26 28 30
5124 E4: 17 19 21 23 25 27 29 31
5125
5126 In order to proceed and create the correct sequence for the next stage (or
5127 for the correct output, if the second stage is the last one, as in our
5128 example), we first put the output of extract_even operation and then the
5129 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5130 The input for the second stage is:
5131
5132 1st vec (E1): 0 2 4 6 8 10 12 14
5133 2nd vec (E3): 16 18 20 22 24 26 28 30
5134 3rd vec (E2): 1 3 5 7 9 11 13 15
5135 4th vec (E4): 17 19 21 23 25 27 29 31
5136
5137 The output of the second stage:
5138
5139 E1: 0 4 8 12 16 20 24 28
5140 E2: 2 6 10 14 18 22 26 30
5141 E3: 1 5 9 13 17 21 25 29
5142 E4: 3 7 11 15 19 23 27 31
5143
5144 And RESULT_CHAIN after reordering:
5145
5146 1st vec (E1): 0 4 8 12 16 20 24 28
5147 2nd vec (E3): 1 5 9 13 17 21 25 29
5148 3rd vec (E2): 2 6 10 14 18 22 26 30
5149 4th vec (E4): 3 7 11 15 19 23 27 31. */
5150
5151 static void
5152 vect_permute_load_chain (vec<tree> dr_chain,
5153 unsigned int length,
5154 gimple stmt,
5155 gimple_stmt_iterator *gsi,
5156 vec<tree> *result_chain)
5157 {
5158 tree data_ref, first_vect, second_vect;
5159 tree perm_mask_even, perm_mask_odd;
5160 tree perm3_mask_low, perm3_mask_high;
5161 gimple perm_stmt;
5162 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5163 unsigned int i, j, log_length = exact_log2 (length);
5164 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5165 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5166
5167 result_chain->quick_grow (length);
5168 memcpy (result_chain->address (), dr_chain.address (),
5169 length * sizeof (tree));
5170
5171 if (length == 3)
5172 {
5173 unsigned int k;
5174
5175 for (k = 0; k < 3; k++)
5176 {
5177 for (i = 0; i < nelt; i++)
5178 if (3 * i + k < 2 * nelt)
5179 sel[i] = 3 * i + k;
5180 else
5181 sel[i] = 0;
5182 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5183
5184 for (i = 0, j = 0; i < nelt; i++)
5185 if (3 * i + k < 2 * nelt)
5186 sel[i] = i;
5187 else
5188 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5189
5190 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5191
5192 first_vect = dr_chain[0];
5193 second_vect = dr_chain[1];
5194
5195 /* Create interleaving stmt (low part of):
5196 low = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5197 ...}> */
5198 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5199 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5200 second_vect, perm3_mask_low);
5201 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5202
5203 /* Create interleaving stmt (high part of):
5204 high = VEC_PERM_EXPR <first_vect, second_vect2, {k, 3 + k, 6 + k,
5205 ...}> */
5206 first_vect = data_ref;
5207 second_vect = dr_chain[2];
5208 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5209 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5210 second_vect, perm3_mask_high);
5211 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5212 (*result_chain)[k] = data_ref;
5213 }
5214 }
5215 else
5216 {
5217 /* If length is not equal to 3 then only power of 2 is supported. */
5218 gcc_assert (exact_log2 (length) != -1);
5219
5220 for (i = 0; i < nelt; ++i)
5221 sel[i] = i * 2;
5222 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5223
5224 for (i = 0; i < nelt; ++i)
5225 sel[i] = i * 2 + 1;
5226 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5227
5228 for (i = 0; i < log_length; i++)
5229 {
5230 for (j = 0; j < length; j += 2)
5231 {
5232 first_vect = dr_chain[j];
5233 second_vect = dr_chain[j+1];
5234
5235 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5236 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5237 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5238 first_vect, second_vect,
5239 perm_mask_even);
5240 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5241 (*result_chain)[j/2] = data_ref;
5242
5243 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5244 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5245 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5246 first_vect, second_vect,
5247 perm_mask_odd);
5248 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5249 (*result_chain)[j/2+length/2] = data_ref;
5250 }
5251 memcpy (dr_chain.address (), result_chain->address (),
5252 length * sizeof (tree));
5253 }
5254 }
5255 }
5256
5257 /* Function vect_shift_permute_load_chain.
5258
5259 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5260 sequence of stmts to reorder the input data accordingly.
5261 Return the final references for loads in RESULT_CHAIN.
5262 Return true if successed, false otherwise.
5263
5264 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5265 The input is 3 vectors each containing 8 elements. We assign a
5266 number to each element, the input sequence is:
5267
5268 1st vec: 0 1 2 3 4 5 6 7
5269 2nd vec: 8 9 10 11 12 13 14 15
5270 3rd vec: 16 17 18 19 20 21 22 23
5271
5272 The output sequence should be:
5273
5274 1st vec: 0 3 6 9 12 15 18 21
5275 2nd vec: 1 4 7 10 13 16 19 22
5276 3rd vec: 2 5 8 11 14 17 20 23
5277
5278 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5279
5280 First we shuffle all 3 vectors to get correct elements order:
5281
5282 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5283 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5284 3rd vec: (16 19 22) (17 20 23) (18 21)
5285
5286 Next we unite and shift vector 3 times:
5287
5288 1st step:
5289 shift right by 6 the concatenation of:
5290 "1st vec" and "2nd vec"
5291 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5292 "2nd vec" and "3rd vec"
5293 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5294 "3rd vec" and "1st vec"
5295 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5296 | New vectors |
5297
5298 So that now new vectors are:
5299
5300 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5301 2nd vec: (10 13) (16 19 22) (17 20 23)
5302 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5303
5304 2nd step:
5305 shift right by 5 the concatenation of:
5306 "1st vec" and "3rd vec"
5307 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5308 "2nd vec" and "1st vec"
5309 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5310 "3rd vec" and "2nd vec"
5311 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5312 | New vectors |
5313
5314 So that now new vectors are:
5315
5316 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5317 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5318 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5319
5320 3rd step:
5321 shift right by 5 the concatenation of:
5322 "1st vec" and "1st vec"
5323 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5324 shift right by 3 the concatenation of:
5325 "2nd vec" and "2nd vec"
5326 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5327 | New vectors |
5328
5329 So that now all vectors are READY:
5330 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5331 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5332 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5333
5334 This algorithm is faster than one in vect_permute_load_chain if:
5335 1. "shift of a concatination" is faster than general permutation.
5336 This is usually so.
5337 2. The TARGET machine can't execute vector instructions in parallel.
5338 This is because each step of the algorithm depends on previous.
5339 The algorithm in vect_permute_load_chain is much more parallel.
5340
5341 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5342 */
5343
5344 static bool
5345 vect_shift_permute_load_chain (vec<tree> dr_chain,
5346 unsigned int length,
5347 gimple stmt,
5348 gimple_stmt_iterator *gsi,
5349 vec<tree> *result_chain)
5350 {
5351 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5352 tree perm2_mask1, perm2_mask2, perm3_mask;
5353 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5354 gimple perm_stmt;
5355
5356 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5357 unsigned int i;
5358 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5359 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5360 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5361 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5362
5363 result_chain->quick_grow (length);
5364 memcpy (result_chain->address (), dr_chain.address (),
5365 length * sizeof (tree));
5366
5367 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5368 {
5369 unsigned int j, log_length = exact_log2 (length);
5370 for (i = 0; i < nelt / 2; ++i)
5371 sel[i] = i * 2;
5372 for (i = 0; i < nelt / 2; ++i)
5373 sel[nelt / 2 + i] = i * 2 + 1;
5374 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5375 {
5376 if (dump_enabled_p ())
5377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5378 "shuffle of 2 fields structure is not \
5379 supported by target\n");
5380 return false;
5381 }
5382 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5383
5384 for (i = 0; i < nelt / 2; ++i)
5385 sel[i] = i * 2 + 1;
5386 for (i = 0; i < nelt / 2; ++i)
5387 sel[nelt / 2 + i] = i * 2;
5388 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5389 {
5390 if (dump_enabled_p ())
5391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5392 "shuffle of 2 fields structure is not \
5393 supported by target\n");
5394 return false;
5395 }
5396 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5397
5398 /* Generating permutation constant to shift all elements.
5399 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5400 for (i = 0; i < nelt; i++)
5401 sel[i] = nelt / 2 + i;
5402 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5403 {
5404 if (dump_enabled_p ())
5405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5406 "shift permutation is not supported by target\n");
5407 return false;
5408 }
5409 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5410
5411 /* Generating permutation constant to select vector from 2.
5412 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5413 for (i = 0; i < nelt / 2; i++)
5414 sel[i] = i;
5415 for (i = nelt / 2; i < nelt; i++)
5416 sel[i] = nelt + i;
5417 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5418 {
5419 if (dump_enabled_p ())
5420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5421 "select is not supported by target\n");
5422 return false;
5423 }
5424 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5425
5426 for (i = 0; i < log_length; i++)
5427 {
5428 for (j = 0; j < length; j += 2)
5429 {
5430 first_vect = dr_chain[j];
5431 second_vect = dr_chain[j + 1];
5432
5433 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5434 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5435 first_vect, first_vect,
5436 perm2_mask1);
5437 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5438 vect[0] = data_ref;
5439
5440 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5441 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5442 second_vect, second_vect,
5443 perm2_mask2);
5444 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5445 vect[1] = data_ref;
5446
5447 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5448 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5449 vect[0], vect[1], shift1_mask);
5450 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5451 (*result_chain)[j/2 + length/2] = data_ref;
5452
5453 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5454 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5455 vect[0], vect[1], select_mask);
5456 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5457 (*result_chain)[j/2] = data_ref;
5458 }
5459 memcpy (dr_chain.address (), result_chain->address (),
5460 length * sizeof (tree));
5461 }
5462 return true;
5463 }
5464 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5465 {
5466 unsigned int k = 0, l = 0;
5467
5468 /* Generating permutation constant to get all elements in rigth order.
5469 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5470 for (i = 0; i < nelt; i++)
5471 {
5472 if (3 * k + (l % 3) >= nelt)
5473 {
5474 k = 0;
5475 l += (3 - (nelt % 3));
5476 }
5477 sel[i] = 3 * k + (l % 3);
5478 k++;
5479 }
5480 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5481 {
5482 if (dump_enabled_p ())
5483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5484 "shuffle of 3 fields structure is not \
5485 supported by target\n");
5486 return false;
5487 }
5488 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5489
5490 /* Generating permutation constant to shift all elements.
5491 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5492 for (i = 0; i < nelt; i++)
5493 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5494 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5495 {
5496 if (dump_enabled_p ())
5497 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5498 "shift permutation is not supported by target\n");
5499 return false;
5500 }
5501 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5502
5503 /* Generating permutation constant to shift all elements.
5504 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5505 for (i = 0; i < nelt; i++)
5506 sel[i] = 2 * (nelt / 3) + 1 + i;
5507 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5508 {
5509 if (dump_enabled_p ())
5510 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5511 "shift permutation is not supported by target\n");
5512 return false;
5513 }
5514 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5515
5516 /* Generating permutation constant to shift all elements.
5517 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5518 for (i = 0; i < nelt; i++)
5519 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5520 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5521 {
5522 if (dump_enabled_p ())
5523 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5524 "shift permutation is not supported by target\n");
5525 return false;
5526 }
5527 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5528
5529 /* Generating permutation constant to shift all elements.
5530 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5531 for (i = 0; i < nelt; i++)
5532 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5533 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5534 {
5535 if (dump_enabled_p ())
5536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5537 "shift permutation is not supported by target\n");
5538 return false;
5539 }
5540 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5541
5542 for (k = 0; k < 3; k++)
5543 {
5544 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5545 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5546 dr_chain[k], dr_chain[k],
5547 perm3_mask);
5548 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5549 vect[k] = data_ref;
5550 }
5551
5552 for (k = 0; k < 3; k++)
5553 {
5554 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5555 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5556 vect[k % 3], vect[(k + 1) % 3],
5557 shift1_mask);
5558 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5559 vect_shift[k] = data_ref;
5560 }
5561
5562 for (k = 0; k < 3; k++)
5563 {
5564 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5565 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5566 vect_shift[(4 - k) % 3],
5567 vect_shift[(3 - k) % 3],
5568 shift2_mask);
5569 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5570 vect[k] = data_ref;
5571 }
5572
5573 (*result_chain)[3 - (nelt % 3)] = vect[2];
5574
5575 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5576 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5577 vect[0], shift3_mask);
5578 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5579 (*result_chain)[nelt % 3] = data_ref;
5580
5581 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5582 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5583 vect[1], shift4_mask);
5584 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5585 (*result_chain)[0] = data_ref;
5586 return true;
5587 }
5588 return false;
5589 }
5590
5591 /* Function vect_transform_grouped_load.
5592
5593 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5594 to perform their permutation and ascribe the result vectorized statements to
5595 the scalar statements.
5596 */
5597
5598 void
5599 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5600 gimple_stmt_iterator *gsi)
5601 {
5602 machine_mode mode;
5603 vec<tree> result_chain = vNULL;
5604
5605 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5606 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
5607 vectors, that are ready for vector computation. */
5608 result_chain.create (size);
5609
5610 /* If reassociation width for vector type is 2 or greater target machine can
5611 execute 2 or more vector instructions in parallel. Otherwise try to
5612 get chain for loads group using vect_shift_permute_load_chain. */
5613 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5614 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5615 || exact_log2 (size) != -1
5616 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5617 gsi, &result_chain))
5618 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5619 vect_record_grouped_load_vectors (stmt, result_chain);
5620 result_chain.release ();
5621 }
5622
5623 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5624 generated as part of the vectorization of STMT. Assign the statement
5625 for each vector to the associated scalar statement. */
5626
5627 void
5628 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5629 {
5630 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5631 gimple next_stmt, new_stmt;
5632 unsigned int i, gap_count;
5633 tree tmp_data_ref;
5634
5635 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5636 Since we scan the chain starting from it's first node, their order
5637 corresponds the order of data-refs in RESULT_CHAIN. */
5638 next_stmt = first_stmt;
5639 gap_count = 1;
5640 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5641 {
5642 if (!next_stmt)
5643 break;
5644
5645 /* Skip the gaps. Loads created for the gaps will be removed by dead
5646 code elimination pass later. No need to check for the first stmt in
5647 the group, since it always exists.
5648 GROUP_GAP is the number of steps in elements from the previous
5649 access (if there is no gap GROUP_GAP is 1). We skip loads that
5650 correspond to the gaps. */
5651 if (next_stmt != first_stmt
5652 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5653 {
5654 gap_count++;
5655 continue;
5656 }
5657
5658 while (next_stmt)
5659 {
5660 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5661 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5662 copies, and we put the new vector statement in the first available
5663 RELATED_STMT. */
5664 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5665 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5666 else
5667 {
5668 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5669 {
5670 gimple prev_stmt =
5671 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5672 gimple rel_stmt =
5673 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5674 while (rel_stmt)
5675 {
5676 prev_stmt = rel_stmt;
5677 rel_stmt =
5678 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5679 }
5680
5681 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5682 new_stmt;
5683 }
5684 }
5685
5686 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5687 gap_count = 1;
5688 /* If NEXT_STMT accesses the same DR as the previous statement,
5689 put the same TMP_DATA_REF as its vectorized statement; otherwise
5690 get the next data-ref from RESULT_CHAIN. */
5691 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5692 break;
5693 }
5694 }
5695 }
5696
5697 /* Function vect_force_dr_alignment_p.
5698
5699 Returns whether the alignment of a DECL can be forced to be aligned
5700 on ALIGNMENT bit boundary. */
5701
5702 bool
5703 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5704 {
5705 if (TREE_CODE (decl) != VAR_DECL)
5706 return false;
5707
5708 /* With -fno-toplevel-reorder we may have already output the constant. */
5709 if (TREE_ASM_WRITTEN (decl))
5710 return false;
5711
5712 /* Constant pool entries may be shared and not properly merged by LTO. */
5713 if (DECL_IN_CONSTANT_POOL (decl))
5714 return false;
5715
5716 if (TREE_PUBLIC (decl) || DECL_EXTERNAL (decl))
5717 {
5718 symtab_node *snode;
5719
5720 /* We cannot change alignment of symbols that may bind to symbols
5721 in other translation unit that may contain a definition with lower
5722 alignment. */
5723 if (!decl_binds_to_current_def_p (decl))
5724 return false;
5725
5726 /* When compiling partition, be sure the symbol is not output by other
5727 partition. */
5728 snode = symtab_node::get (decl);
5729 if (flag_ltrans
5730 && (snode->in_other_partition
5731 || snode->get_partitioning_class () == SYMBOL_DUPLICATE))
5732 return false;
5733 }
5734
5735 /* Do not override the alignment as specified by the ABI when the used
5736 attribute is set. */
5737 if (DECL_PRESERVE_P (decl))
5738 return false;
5739
5740 /* Do not override explicit alignment set by the user when an explicit
5741 section name is also used. This is a common idiom used by many
5742 software projects. */
5743 if (TREE_STATIC (decl)
5744 && DECL_SECTION_NAME (decl) != NULL
5745 && !symtab_node::get (decl)->implicit_section)
5746 return false;
5747
5748 /* If symbol is an alias, we need to check that target is OK. */
5749 if (TREE_STATIC (decl))
5750 {
5751 tree target = symtab_node::get (decl)->ultimate_alias_target ()->decl;
5752 if (target != decl)
5753 {
5754 if (DECL_PRESERVE_P (target))
5755 return false;
5756 decl = target;
5757 }
5758 }
5759
5760 if (TREE_STATIC (decl))
5761 return (alignment <= MAX_OFILE_ALIGNMENT);
5762 else
5763 return (alignment <= MAX_STACK_ALIGNMENT);
5764 }
5765
5766
5767 /* Return whether the data reference DR is supported with respect to its
5768 alignment.
5769 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5770 it is aligned, i.e., check if it is possible to vectorize it with different
5771 alignment. */
5772
5773 enum dr_alignment_support
5774 vect_supportable_dr_alignment (struct data_reference *dr,
5775 bool check_aligned_accesses)
5776 {
5777 gimple stmt = DR_STMT (dr);
5778 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5779 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5780 machine_mode mode = TYPE_MODE (vectype);
5781 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5782 struct loop *vect_loop = NULL;
5783 bool nested_in_vect_loop = false;
5784
5785 if (aligned_access_p (dr) && !check_aligned_accesses)
5786 return dr_aligned;
5787
5788 /* For now assume all conditional loads/stores support unaligned
5789 access without any special code. */
5790 if (is_gimple_call (stmt)
5791 && gimple_call_internal_p (stmt)
5792 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5793 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5794 return dr_unaligned_supported;
5795
5796 if (loop_vinfo)
5797 {
5798 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5799 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5800 }
5801
5802 /* Possibly unaligned access. */
5803
5804 /* We can choose between using the implicit realignment scheme (generating
5805 a misaligned_move stmt) and the explicit realignment scheme (generating
5806 aligned loads with a REALIGN_LOAD). There are two variants to the
5807 explicit realignment scheme: optimized, and unoptimized.
5808 We can optimize the realignment only if the step between consecutive
5809 vector loads is equal to the vector size. Since the vector memory
5810 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5811 is guaranteed that the misalignment amount remains the same throughout the
5812 execution of the vectorized loop. Therefore, we can create the
5813 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5814 at the loop preheader.
5815
5816 However, in the case of outer-loop vectorization, when vectorizing a
5817 memory access in the inner-loop nested within the LOOP that is now being
5818 vectorized, while it is guaranteed that the misalignment of the
5819 vectorized memory access will remain the same in different outer-loop
5820 iterations, it is *not* guaranteed that is will remain the same throughout
5821 the execution of the inner-loop. This is because the inner-loop advances
5822 with the original scalar step (and not in steps of VS). If the inner-loop
5823 step happens to be a multiple of VS, then the misalignment remains fixed
5824 and we can use the optimized realignment scheme. For example:
5825
5826 for (i=0; i<N; i++)
5827 for (j=0; j<M; j++)
5828 s += a[i+j];
5829
5830 When vectorizing the i-loop in the above example, the step between
5831 consecutive vector loads is 1, and so the misalignment does not remain
5832 fixed across the execution of the inner-loop, and the realignment cannot
5833 be optimized (as illustrated in the following pseudo vectorized loop):
5834
5835 for (i=0; i<N; i+=4)
5836 for (j=0; j<M; j++){
5837 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5838 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5839 // (assuming that we start from an aligned address).
5840 }
5841
5842 We therefore have to use the unoptimized realignment scheme:
5843
5844 for (i=0; i<N; i+=4)
5845 for (j=k; j<M; j+=4)
5846 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5847 // that the misalignment of the initial address is
5848 // 0).
5849
5850 The loop can then be vectorized as follows:
5851
5852 for (k=0; k<4; k++){
5853 rt = get_realignment_token (&vp[k]);
5854 for (i=0; i<N; i+=4){
5855 v1 = vp[i+k];
5856 for (j=k; j<M; j+=4){
5857 v2 = vp[i+j+VS-1];
5858 va = REALIGN_LOAD <v1,v2,rt>;
5859 vs += va;
5860 v1 = v2;
5861 }
5862 }
5863 } */
5864
5865 if (DR_IS_READ (dr))
5866 {
5867 bool is_packed = false;
5868 tree type = (TREE_TYPE (DR_REF (dr)));
5869
5870 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5871 && (!targetm.vectorize.builtin_mask_for_load
5872 || targetm.vectorize.builtin_mask_for_load ()))
5873 {
5874 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5875 if ((nested_in_vect_loop
5876 && (TREE_INT_CST_LOW (DR_STEP (dr))
5877 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5878 || !loop_vinfo)
5879 return dr_explicit_realign;
5880 else
5881 return dr_explicit_realign_optimized;
5882 }
5883 if (!known_alignment_for_access_p (dr))
5884 is_packed = not_size_aligned (DR_REF (dr));
5885
5886 if ((TYPE_USER_ALIGN (type) && !is_packed)
5887 || targetm.vectorize.
5888 support_vector_misalignment (mode, type,
5889 DR_MISALIGNMENT (dr), is_packed))
5890 /* Can't software pipeline the loads, but can at least do them. */
5891 return dr_unaligned_supported;
5892 }
5893 else
5894 {
5895 bool is_packed = false;
5896 tree type = (TREE_TYPE (DR_REF (dr)));
5897
5898 if (!known_alignment_for_access_p (dr))
5899 is_packed = not_size_aligned (DR_REF (dr));
5900
5901 if ((TYPE_USER_ALIGN (type) && !is_packed)
5902 || targetm.vectorize.
5903 support_vector_misalignment (mode, type,
5904 DR_MISALIGNMENT (dr), is_packed))
5905 return dr_unaligned_supported;
5906 }
5907
5908 /* Unsupported. */
5909 return dr_unaligned_unsupported;
5910 }