re PR tree-optimization/65450 (Unaligned access with -O3 -mtune=k8)
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "hash-set.h"
28 #include "machmode.h"
29 #include "vec.h"
30 #include "double-int.h"
31 #include "input.h"
32 #include "alias.h"
33 #include "symtab.h"
34 #include "wide-int.h"
35 #include "inchash.h"
36 #include "tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tm_p.h"
40 #include "target.h"
41 #include "predict.h"
42 #include "hard-reg-set.h"
43 #include "function.h"
44 #include "dominance.h"
45 #include "cfg.h"
46 #include "basic-block.h"
47 #include "gimple-pretty-print.h"
48 #include "tree-ssa-alias.h"
49 #include "internal-fn.h"
50 #include "tree-eh.h"
51 #include "gimple-expr.h"
52 #include "is-a.h"
53 #include "gimple.h"
54 #include "gimplify.h"
55 #include "gimple-iterator.h"
56 #include "gimplify-me.h"
57 #include "gimple-ssa.h"
58 #include "tree-phinodes.h"
59 #include "ssa-iterators.h"
60 #include "stringpool.h"
61 #include "tree-ssanames.h"
62 #include "tree-ssa-loop-ivopts.h"
63 #include "tree-ssa-loop-manip.h"
64 #include "tree-ssa-loop.h"
65 #include "cfgloop.h"
66 #include "tree-chrec.h"
67 #include "tree-scalar-evolution.h"
68 #include "tree-vectorizer.h"
69 #include "diagnostic-core.h"
70 #include "hash-map.h"
71 #include "plugin-api.h"
72 #include "ipa-ref.h"
73 #include "cgraph.h"
74 /* Need to include rtl.h, expr.h, etc. for optabs. */
75 #include "hashtab.h"
76 #include "rtl.h"
77 #include "flags.h"
78 #include "statistics.h"
79 #include "real.h"
80 #include "fixed-value.h"
81 #include "insn-config.h"
82 #include "expmed.h"
83 #include "dojump.h"
84 #include "explow.h"
85 #include "calls.h"
86 #include "emit-rtl.h"
87 #include "varasm.h"
88 #include "stmt.h"
89 #include "expr.h"
90 #include "insn-codes.h"
91 #include "optabs.h"
92 #include "builtins.h"
93
94 /* Return true if load- or store-lanes optab OPTAB is implemented for
95 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
96
97 static bool
98 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
99 tree vectype, unsigned HOST_WIDE_INT count)
100 {
101 machine_mode mode, array_mode;
102 bool limit_p;
103
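  /* Find an integer mode wide enough to hold COUNT vectors of MODE.  Only
     limit the search to MAX_FIXED_MODE_SIZE when the target does not
     provide a dedicated array mode for this vector mode and count.  */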
104 mode = TYPE_MODE (vectype);
105 limit_p = !targetm.array_mode_supported_p (mode, count);
106 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
107 MODE_INT, limit_p);
108
109 if (array_mode == BLKmode)
110 {
111 if (dump_enabled_p ())
112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
113 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
114 GET_MODE_NAME (mode), count);
115 return false;
116 }
117
118 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
119 {
120 if (dump_enabled_p ())
121 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
122 "cannot use %s<%s><%s>\n", name,
123 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
124 return false;
125 }
126
127 if (dump_enabled_p ())
128 dump_printf_loc (MSG_NOTE, vect_location,
129 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
130 GET_MODE_NAME (mode));
131
132 return true;
133 }
134
135
136 /* Return the smallest scalar part of STMT.
137 This is used to determine the vectype of the stmt. We generally set the
138 vectype according to the type of the result (lhs). For stmts whose
139 result-type is different than the type of the arguments (e.g., demotion,
140 promotion), vectype will be reset appropriately (later). Note that we have
141 to visit the smallest datatype in this function, because that determines the
142 VF. If the smallest datatype in the loop is present only as the rhs of a
143 promotion operation - we'd miss it.
144 Such a case, where a variable of this datatype does not appear in the lhs
145 anywhere in the loop, can only occur if it's an invariant: e.g.:
146 'int_x = (int) short_inv', which we'd expect to have been optimized away by
147 invariant motion. However, we cannot rely on invariant motion to always
148 take invariants out of the loop, and so in the case of promotion we also
149 have to check the rhs.
150 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
151 types. */
152
153 tree
154 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
155 HOST_WIDE_INT *rhs_size_unit)
156 {
157 tree scalar_type = gimple_expr_type (stmt);
158 HOST_WIDE_INT lhs, rhs;
159
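  /* Start from the size of the statement's result type; for widening
     operations the right-hand side may use a smaller type, which is
     checked below.  */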
160 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
161
162 if (is_gimple_assign (stmt)
163 && (gimple_assign_cast_p (stmt)
164 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
165 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
166 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
167 {
168 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
169
170 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
171 if (rhs < lhs)
172 scalar_type = rhs_type;
173 }
174
175 *lhs_size_unit = lhs;
176 *rhs_size_unit = rhs;
177 return scalar_type;
178 }
179
180
181 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
182 tested at run-time. Return TRUE if DDR was successfully inserted.
183 Return false if versioning is not supported. */
184
185 static bool
186 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
187 {
188 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
189
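  /* Setting --param vect-max-version-for-alias-checks to zero disables
     run-time alias versioning altogether.  */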
190 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
191 return false;
192
193 if (dump_enabled_p ())
194 {
195 dump_printf_loc (MSG_NOTE, vect_location,
196 "mark for run-time aliasing test between ");
197 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
198 dump_printf (MSG_NOTE, " and ");
199 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
200 dump_printf (MSG_NOTE, "\n");
201 }
202
203 if (optimize_loop_nest_for_size_p (loop))
204 {
205 if (dump_enabled_p ())
206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
207 "versioning not supported when optimizing"
208 " for size.\n");
209 return false;
210 }
211
212 /* FORNOW: We don't support versioning with outer-loop vectorization. */
213 if (loop->inner)
214 {
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
217 "versioning not yet supported for outer-loops.\n");
218 return false;
219 }
220
221 /* FORNOW: We don't support creating runtime alias tests for non-constant
222 step. */
223 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
224 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
225 {
226 if (dump_enabled_p ())
227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
228 "versioning not yet supported for non-constant "
229 "step\n");
230 return false;
231 }
232
233 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
234 return true;
235 }
236
237
238 /* Function vect_analyze_data_ref_dependence.
239
240 Return TRUE if there (might) exist a dependence between a memory-reference
241 DRA and a memory-reference DRB. When versioning for alias may check a
242 dependence at run-time, return FALSE. Adjust *MAX_VF according to
243 the data dependence. */
244
245 static bool
246 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
247 loop_vec_info loop_vinfo, int *max_vf)
248 {
249 unsigned int i;
250 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
251 struct data_reference *dra = DDR_A (ddr);
252 struct data_reference *drb = DDR_B (ddr);
253 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
254 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
255 lambda_vector dist_v;
256 unsigned int loop_depth;
257
258 /* In loop analysis all data references should be vectorizable. */
259 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
260 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
261 gcc_unreachable ();
262
263 /* Independent data accesses. */
264 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
265 return false;
266
267 if (dra == drb
268 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
269 return false;
270
271 /* Even if we have an anti-dependence then, as the vectorized loop covers at
272 least two scalar iterations, there is always also a true dependence.
273 As the vectorizer does not re-order loads and stores we can ignore
274 the anti-dependence if TBAA can disambiguate both DRs similar to the
275 case with known negative distance anti-dependences (positive
276 distance anti-dependences would violate TBAA constraints). */
277 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
278 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
279 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
280 get_alias_set (DR_REF (drb))))
281 return false;
282
283 /* Unknown data dependence. */
284 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
285 {
286 /* If user asserted safelen consecutive iterations can be
287 executed concurrently, assume independence. */
288 if (loop->safelen >= 2)
289 {
290 if (loop->safelen < *max_vf)
291 *max_vf = loop->safelen;
292 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
293 return false;
294 }
295
296 if (STMT_VINFO_GATHER_P (stmtinfo_a)
297 || STMT_VINFO_GATHER_P (stmtinfo_b))
298 {
299 if (dump_enabled_p ())
300 {
301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
302 "versioning for alias not supported for: "
303 "can't determine dependence between ");
304 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
305 DR_REF (dra));
306 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
307 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
308 DR_REF (drb));
309 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
310 }
311 return true;
312 }
313
314 if (dump_enabled_p ())
315 {
316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
317 "versioning for alias required: "
318 "can't determine dependence between ");
319 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
320 DR_REF (dra));
321 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
322 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
323 DR_REF (drb));
324 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
325 }
326
327 /* Add to list of ddrs that need to be tested at run-time. */
328 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
329 }
330
331 /* Known data dependence. */
332 if (DDR_NUM_DIST_VECTS (ddr) == 0)
333 {
334 /* If user asserted safelen consecutive iterations can be
335 executed concurrently, assume independence. */
336 if (loop->safelen >= 2)
337 {
338 if (loop->safelen < *max_vf)
339 *max_vf = loop->safelen;
340 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
341 return false;
342 }
343
344 if (STMT_VINFO_GATHER_P (stmtinfo_a)
345 || STMT_VINFO_GATHER_P (stmtinfo_b))
346 {
347 if (dump_enabled_p ())
348 {
349 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
350 "versioning for alias not supported for: "
351 "bad dist vector for ");
352 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
353 DR_REF (dra));
354 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
355 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
356 DR_REF (drb));
357 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
358 }
359 return true;
360 }
361
362 if (dump_enabled_p ())
363 {
364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
365 "versioning for alias required: "
366 "bad dist vector for ");
367 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
368 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
369 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
370 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
371 }
372 /* Add to list of ddrs that need to be tested at run-time. */
373 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
374 }
375
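  /* Inspect the distance in the dimension of LOOP for every recorded
     distance vector.  */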
376 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
377 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
378 {
379 int dist = dist_v[loop_depth];
380
381 if (dump_enabled_p ())
382 dump_printf_loc (MSG_NOTE, vect_location,
383 "dependence distance = %d.\n", dist);
384
385 if (dist == 0)
386 {
387 if (dump_enabled_p ())
388 {
389 dump_printf_loc (MSG_NOTE, vect_location,
390 "dependence distance == 0 between ");
391 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
392 dump_printf (MSG_NOTE, " and ");
393 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
394 dump_printf (MSG_NOTE, "\n");
395 }
396
397 /* When we perform grouped accesses and perform implicit CSE
398 by detecting equal accesses and doing disambiguation with
399 runtime alias tests like for
400 .. = a[i];
401 .. = a[i+1];
402 a[i] = ..;
403 a[i+1] = ..;
404 *p = ..;
405 .. = a[i];
406 .. = a[i+1];
407 where we will end up loading { a[i], a[i+1] } once, make
408 sure that inserting group loads before the first load and
409 stores after the last store will do the right thing.
410 Similar for groups like
411 a[i] = ...;
412 ... = a[i];
413 a[i+1] = ...;
414 where loads from the group interleave with the store. */
415 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
416 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
417 {
418 gimple earlier_stmt;
419 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
420 if (DR_IS_WRITE
421 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
422 {
423 if (dump_enabled_p ())
424 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
425 "READ_WRITE dependence in interleaving."
426 "\n");
427 return true;
428 }
429 }
430
431 continue;
432 }
433
434 if (dist > 0 && DDR_REVERSED_P (ddr))
435 {
436 /* If DDR_REVERSED_P the order of the data-refs in DDR was
437 reversed (to make distance vector positive), and the actual
438 distance is negative. */
439 if (dump_enabled_p ())
440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
441 "dependence distance negative.\n");
442 /* Record a negative dependence distance to later limit the
443 amount of stmt copying / unrolling we can perform.
444 Only need to handle read-after-write dependence. */
445 if (DR_IS_READ (drb)
446 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
447 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
448 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
449 continue;
450 }
451
452 if (abs (dist) >= 2
453 && abs (dist) < *max_vf)
454 {
455 /* The dependence distance requires reduction of the maximal
456 vectorization factor. */
457 *max_vf = abs (dist);
458 if (dump_enabled_p ())
459 dump_printf_loc (MSG_NOTE, vect_location,
460 "adjusting maximal vectorization factor to %i\n",
461 *max_vf);
462 }
463
464 if (abs (dist) >= *max_vf)
465 {
466 /* Dependence distance does not create dependence, as far as
467 vectorization is concerned, in this case. */
468 if (dump_enabled_p ())
469 dump_printf_loc (MSG_NOTE, vect_location,
470 "dependence distance >= VF.\n");
471 continue;
472 }
473
474 if (dump_enabled_p ())
475 {
476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
477 "not vectorized, possible dependence "
478 "between data-refs ");
479 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
480 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
481 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
482 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
483 }
484
485 return true;
486 }
487
488 return false;
489 }
490
491 /* Function vect_analyze_data_ref_dependences.
492
493 Examine all the data references in the loop, and make sure there do not
494 exist any data dependences between them. Set *MAX_VF according to
495 the maximum vectorization factor the data dependences allow. */
496
497 bool
498 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
499 {
500 unsigned int i;
501 struct data_dependence_relation *ddr;
502
503 if (dump_enabled_p ())
504 dump_printf_loc (MSG_NOTE, vect_location,
505 "=== vect_analyze_data_ref_dependences ===\n");
506
507 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
508 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
509 &LOOP_VINFO_DDRS (loop_vinfo),
510 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
511 return false;
512
513 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
514 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
515 return false;
516
517 return true;
518 }
519
520
521 /* Function vect_slp_analyze_data_ref_dependence.
522
523 Return TRUE if there (might) exist a dependence between a memory-reference
524 DRA and a memory-reference DRB that prohibits SLP vectorization of the
525 containing basic-block; return FALSE if the accesses are independent or
526 the dependence is harmless for basic-block vectorization. */
527
528 static bool
529 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
530 {
531 struct data_reference *dra = DDR_A (ddr);
532 struct data_reference *drb = DDR_B (ddr);
533
534 /* We need to check dependences of statements marked as unvectorizable
535 as well; they can still prohibit vectorization. */
536
537 /* Independent data accesses. */
538 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
539 return false;
540
541 if (dra == drb)
542 return false;
543
544 /* Read-read is OK. */
545 if (DR_IS_READ (dra) && DR_IS_READ (drb))
546 return false;
547
548 /* If dra and drb are part of the same interleaving chain consider
549 them independent. */
550 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
551 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
552 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
553 return false;
554
555 /* Unknown data dependence. */
556 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
557 {
558 if (dump_enabled_p ())
559 {
560 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
561 "can't determine dependence between ");
562 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
563 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
564 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
565 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
566 }
567 }
568 else if (dump_enabled_p ())
569 {
570 dump_printf_loc (MSG_NOTE, vect_location,
571 "determined dependence between ");
572 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
573 dump_printf (MSG_NOTE, " and ");
574 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
575 dump_printf (MSG_NOTE, "\n");
576 }
577
578 /* We do not vectorize basic blocks with write-write dependencies. */
579 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
580 return true;
581
582 /* If we have a read-write dependence, check that the load is before the store.
583 When we vectorize basic blocks, the vector load can only be placed before the
584 corresponding scalar load, and the vector store can only be placed after its
585 corresponding scalar store. So the order of the accesses is preserved in
586 case the load is before the store. */
587 gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
588 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
589 {
590 /* That only holds for load-store pairs taking part in vectorization. */
591 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
592 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
593 return false;
594 }
595
596 return true;
597 }
598
599
600 /* Function vect_slp_analyze_data_ref_dependences.
601 
602 Examine all the data references in the basic-block, and make sure there
603 do not exist any data dependences between them. Return TRUE if that is
604 the case, FALSE otherwise. */
605
606 bool
607 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
608 {
609 struct data_dependence_relation *ddr;
610 unsigned int i;
611
612 if (dump_enabled_p ())
613 dump_printf_loc (MSG_NOTE, vect_location,
614 "=== vect_slp_analyze_data_ref_dependences ===\n");
615
616 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
617 &BB_VINFO_DDRS (bb_vinfo),
618 vNULL, true))
619 return false;
620
621 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
622 if (vect_slp_analyze_data_ref_dependence (ddr))
623 return false;
624
625 return true;
626 }
627
628
629 /* Function vect_compute_data_ref_alignment
630
631 Compute the misalignment of the data reference DR.
632
633 Output:
634 1. If during the misalignment computation it is found that the data reference
635 cannot be vectorized then false is returned.
636 2. DR_MISALIGNMENT (DR) is defined.
637
638 FOR NOW: No analysis is actually performed. Misalignment is calculated
639 only for trivial cases. TODO. */
640
641 static bool
642 vect_compute_data_ref_alignment (struct data_reference *dr)
643 {
644 gimple stmt = DR_STMT (dr);
645 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
646 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
647 struct loop *loop = NULL;
648 tree ref = DR_REF (dr);
649 tree vectype;
650 tree base, base_addr;
651 bool base_aligned;
652 tree misalign;
653 tree aligned_to;
654 unsigned HOST_WIDE_INT alignment;
655
656 if (dump_enabled_p ())
657 dump_printf_loc (MSG_NOTE, vect_location,
658 "vect_compute_data_ref_alignment:\n");
659
660 if (loop_vinfo)
661 loop = LOOP_VINFO_LOOP (loop_vinfo);
662
663 /* Initialize misalignment to unknown. */
664 SET_DR_MISALIGNMENT (dr, -1);
665
666 /* Strided loads perform only component accesses, misalignment information
667 is irrelevant for them. */
668 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
669 return true;
670
671 misalign = DR_INIT (dr);
672 aligned_to = DR_ALIGNED_TO (dr);
673 base_addr = DR_BASE_ADDRESS (dr);
674 vectype = STMT_VINFO_VECTYPE (stmt_info);
675
676 /* In case the dataref is in an inner-loop of the loop that is being
677 vectorized (LOOP), we use the base and misalignment information
678 relative to the outer-loop (LOOP). This is ok only if the misalignment
679 stays the same throughout the execution of the inner-loop, which is why
680 we have to check that the stride of the dataref in the inner-loop is
681 divisible by the vector size. */
682 if (loop && nested_in_vect_loop_p (loop, stmt))
683 {
684 tree step = DR_STEP (dr);
685 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
686
687 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
688 {
689 if (dump_enabled_p ())
690 dump_printf_loc (MSG_NOTE, vect_location,
691 "inner step divides the vector-size.\n");
692 misalign = STMT_VINFO_DR_INIT (stmt_info);
693 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
694 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
695 }
696 else
697 {
698 if (dump_enabled_p ())
699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
700 "inner step doesn't divide the vector-size.\n");
701 misalign = NULL_TREE;
702 }
703 }
704
705 /* Similarly, if we're doing basic-block vectorization, we can only use
706 base and misalignment information relative to an innermost loop if the
707 misalignment stays the same throughout the execution of the loop.
708 As above, this is the case if the stride of the dataref is divisible
709 by the vector size. */
710 if (!loop)
711 {
712 tree step = DR_STEP (dr);
713 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
714
715 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
716 {
717 if (dump_enabled_p ())
718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
719 "SLP: step doesn't divide the vector-size.\n");
720 misalign = NULL_TREE;
721 }
722 }
723
724 alignment = TYPE_ALIGN_UNIT (vectype);
725
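  /* If the access is not known to be aligned to at least the vector
     alignment, or the constant offset part is unknown, leave the
     misalignment as unknown.  */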
726 if ((compare_tree_int (aligned_to, alignment) < 0)
727 || !misalign)
728 {
729 if (dump_enabled_p ())
730 {
731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
732 "Unknown alignment for access: ");
733 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
734 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
735 }
736 return true;
737 }
738
739 /* To look at alignment of the base we have to preserve an inner MEM_REF
740 as that carries alignment information of the actual access. */
741 base = ref;
742 while (handled_component_p (base))
743 base = TREE_OPERAND (base, 0);
744 if (TREE_CODE (base) == MEM_REF)
745 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
746 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
747
748 if (get_object_alignment (base) >= TYPE_ALIGN (vectype))
749 base_aligned = true;
750 else
751 base_aligned = false;
752
753 if (!base_aligned)
754 {
755 /* Strip an inner MEM_REF to a bare decl if possible. */
756 if (TREE_CODE (base) == MEM_REF
757 && integer_zerop (TREE_OPERAND (base, 1))
758 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
759 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
760
761 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
762 {
763 if (dump_enabled_p ())
764 {
765 dump_printf_loc (MSG_NOTE, vect_location,
766 "can't force alignment of ref: ");
767 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
768 dump_printf (MSG_NOTE, "\n");
769 }
770 return true;
771 }
772
773 /* Force the alignment of the decl.
774 NOTE: This is the only change to the code we make during
775 the analysis phase, before deciding to vectorize the loop. */
776 if (dump_enabled_p ())
777 {
778 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
779 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
780 dump_printf (MSG_NOTE, "\n");
781 }
782
783 ((dataref_aux *)dr->aux)->base_decl = base;
784 ((dataref_aux *)dr->aux)->base_misaligned = true;
785 }
786
787 /* If this is a backward running DR then first access in the larger
788 vectype actually is N-1 elements before the address in the DR.
789 Adjust misalign accordingly. */
790 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
791 {
792 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
793 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
794 otherwise we wouldn't be here. */
795 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
796 /* PLUS because DR_STEP was negative. */
797 misalign = size_binop (PLUS_EXPR, misalign, offset);
798 }
799
800 SET_DR_MISALIGNMENT (dr,
801 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
802
803 if (dump_enabled_p ())
804 {
805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
806 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
807 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
808 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
809 }
810
811 return true;
812 }
813
814
815 /* Function vect_compute_data_refs_alignment
816
817 Compute the misalignment of data references in the loop.
818 Return FALSE if a data reference is found that cannot be vectorized. */
819
820 static bool
821 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
822 bb_vec_info bb_vinfo)
823 {
824 vec<data_reference_p> datarefs;
825 struct data_reference *dr;
826 unsigned int i;
827
828 if (loop_vinfo)
829 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
830 else
831 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
832
833 FOR_EACH_VEC_ELT (datarefs, i, dr)
834 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
835 && !vect_compute_data_ref_alignment (dr))
836 {
837 if (bb_vinfo)
838 {
839 /* Mark unsupported statement as unvectorizable. */
840 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
841 continue;
842 }
843 else
844 return false;
845 }
846
847 return true;
848 }
849
850
851 /* Function vect_update_misalignment_for_peel
852
853 DR - the data reference whose misalignment is to be adjusted.
854 DR_PEEL - the data reference whose misalignment is being made
855 zero in the vector loop by the peel.
856 NPEEL - the number of iterations in the peel loop if the misalignment
857 of DR_PEEL is known at compile time. */
858
859 static void
860 vect_update_misalignment_for_peel (struct data_reference *dr,
861 struct data_reference *dr_peel, int npeel)
862 {
863 unsigned int i;
864 vec<dr_p> same_align_drs;
865 struct data_reference *current_dr;
866 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
867 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
868 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
869 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
870
871 /* For interleaved data accesses the step in the loop must be multiplied by
872 the size of the interleaving group. */
873 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
874 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
875 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
876 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
877
878 /* It can be assumed that the data refs with the same alignment as dr_peel
879 are aligned in the vector loop. */
880 same_align_drs
881 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
882 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
883 {
884 if (current_dr != dr)
885 continue;
886 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
887 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
888 SET_DR_MISALIGNMENT (dr, 0);
889 return;
890 }
891
892 if (known_alignment_for_access_p (dr)
893 && known_alignment_for_access_p (dr_peel))
894 {
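      /* Both misalignments are known at compile time: adjust DR's
	 misalignment by the NPEEL peeled scalar iterations, modulo the
	 vector alignment.  */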
895 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
896 int misal = DR_MISALIGNMENT (dr);
897 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
898 misal += negative ? -npeel * dr_size : npeel * dr_size;
899 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
900 SET_DR_MISALIGNMENT (dr, misal);
901 return;
902 }
903
904 if (dump_enabled_p ())
905 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
906 SET_DR_MISALIGNMENT (dr, -1);
907 }
908
909
910 /* Function vect_verify_datarefs_alignment
911
912 Return TRUE if all data references in the loop can be
913 handled with respect to alignment. */
914
915 bool
916 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
917 {
918 vec<data_reference_p> datarefs;
919 struct data_reference *dr;
920 enum dr_alignment_support supportable_dr_alignment;
921 unsigned int i;
922
923 if (loop_vinfo)
924 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
925 else
926 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
927
928 FOR_EACH_VEC_ELT (datarefs, i, dr)
929 {
930 gimple stmt = DR_STMT (dr);
931 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
932
933 if (!STMT_VINFO_RELEVANT_P (stmt_info))
934 continue;
935
936 /* For interleaving, only the alignment of the first access matters.
937 Skip statements marked as not vectorizable. */
938 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
939 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
940 || !STMT_VINFO_VECTORIZABLE (stmt_info))
941 continue;
942
943 /* Strided loads perform only component accesses, alignment is
944 irrelevant for them. */
945 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
946 continue;
947
948 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
949 if (!supportable_dr_alignment)
950 {
951 if (dump_enabled_p ())
952 {
953 if (DR_IS_READ (dr))
954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
955 "not vectorized: unsupported unaligned load.");
956 else
957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
958 "not vectorized: unsupported unaligned "
959 "store.");
960
961 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
962 DR_REF (dr));
963 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
964 }
965 return false;
966 }
967 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
968 dump_printf_loc (MSG_NOTE, vect_location,
969 "Vectorizing an unaligned access.\n");
970 }
971 return true;
972 }
973
974 /* Given a memory reference EXP, return whether its alignment is less
975 than its size. */
976
977 static bool
978 not_size_aligned (tree exp)
979 {
980 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
981 return true;
982
983 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
984 > get_object_alignment (exp));
985 }
986
987 /* Function vector_alignment_reachable_p
988
989 Return true if vector alignment for DR is reachable by peeling
990 a few loop iterations. Return false otherwise. */
991
992 static bool
993 vector_alignment_reachable_p (struct data_reference *dr)
994 {
995 gimple stmt = DR_STMT (dr);
996 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
997 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
998
999 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1000 {
1001 /* For interleaved access we peel only if the number of iterations in
1002 the prolog loop ({VF - misalignment}) is a multiple of the
1003 number of interleaved accesses. */
1004 int elem_size, mis_in_elements;
1005 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1006
1007 /* FORNOW: handle only known alignment. */
1008 if (!known_alignment_for_access_p (dr))
1009 return false;
1010
1011 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1012 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1013
1014 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1015 return false;
1016 }
1017
1018 /* If misalignment is known at compile time then allow peeling
1019 only if natural alignment is reachable through peeling. */
1020 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1021 {
1022 HOST_WIDE_INT elmsize =
1023 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1024 if (dump_enabled_p ())
1025 {
1026 dump_printf_loc (MSG_NOTE, vect_location,
1027 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1028 dump_printf (MSG_NOTE,
1029 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1030 }
1031 if (DR_MISALIGNMENT (dr) % elmsize)
1032 {
1033 if (dump_enabled_p ())
1034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1035 "data size does not divide the misalignment.\n");
1036 return false;
1037 }
1038 }
1039
1040 if (!known_alignment_for_access_p (dr))
1041 {
1042 tree type = TREE_TYPE (DR_REF (dr));
1043 bool is_packed = not_size_aligned (DR_REF (dr));
1044 if (dump_enabled_p ())
1045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1046 "Unknown misalignment, is_packed = %d\n",is_packed);
1047 if ((TYPE_USER_ALIGN (type) && !is_packed)
1048 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1049 return true;
1050 else
1051 return false;
1052 }
1053
1054 return true;
1055 }
1056
1057
1058 /* Calculate the cost of the memory access represented by DR. */
1059
1060 static void
1061 vect_get_data_access_cost (struct data_reference *dr,
1062 unsigned int *inside_cost,
1063 unsigned int *outside_cost,
1064 stmt_vector_for_cost *body_cost_vec)
1065 {
1066 gimple stmt = DR_STMT (dr);
1067 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1068 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1069 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1070 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1071 int ncopies = vf / nunits;
1072
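  /* Each scalar access in the loop becomes VF / NUNITS vector accesses;
     add the corresponding load or store cost.  */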
1073 if (DR_IS_READ (dr))
1074 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1075 NULL, body_cost_vec, false);
1076 else
1077 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1078
1079 if (dump_enabled_p ())
1080 dump_printf_loc (MSG_NOTE, vect_location,
1081 "vect_get_data_access_cost: inside_cost = %d, "
1082 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1083 }
1084
1085
1086 /* Insert DR into peeling hash table with NPEEL as key. */
1087
1088 static void
1089 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1090 int npeel)
1091 {
1092 struct _vect_peel_info elem, *slot;
1093 _vect_peel_info **new_slot;
1094 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1095
1096 elem.npeel = npeel;
1097 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1098 if (slot)
1099 slot->count++;
1100 else
1101 {
1102 slot = XNEW (struct _vect_peel_info);
1103 slot->npeel = npeel;
1104 slot->dr = dr;
1105 slot->count = 1;
1106 new_slot
1107 = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1108 *new_slot = slot;
1109 }
1110
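  /* If this misalignment is unsupported, heavily weight the entry so that
     peeling for this data reference is preferred when no cost model
     limits the choice.  */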
1111 if (!supportable_dr_alignment
1112 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1113 slot->count += VECT_MAX_COST;
1114 }
1115
1116
1117 /* Traverse peeling hash table to find peeling option that aligns maximum
1118 number of data accesses. */
1119
1120 int
1121 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1122 _vect_peel_extended_info *max)
1123 {
1124 vect_peel_info elem = *slot;
1125
1126 if (elem->count > max->peel_info.count
1127 || (elem->count == max->peel_info.count
1128 && max->peel_info.npeel > elem->npeel))
1129 {
1130 max->peel_info.npeel = elem->npeel;
1131 max->peel_info.count = elem->count;
1132 max->peel_info.dr = elem->dr;
1133 }
1134
1135 return 1;
1136 }
1137
1138
1139 /* Traverse peeling hash table and calculate cost for each peeling option.
1140 Find the one with the lowest cost. */
1141
1142 int
1143 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1144 _vect_peel_extended_info *min)
1145 {
1146 vect_peel_info elem = *slot;
1147 int save_misalignment, dummy;
1148 unsigned int inside_cost = 0, outside_cost = 0, i;
1149 gimple stmt = DR_STMT (elem->dr);
1150 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1151 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1152 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1153 struct data_reference *dr;
1154 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1155 int single_iter_cost;
1156
1157 prologue_cost_vec.create (2);
1158 body_cost_vec.create (2);
1159 epilogue_cost_vec.create (2);
1160
1161 FOR_EACH_VEC_ELT (datarefs, i, dr)
1162 {
1163 stmt = DR_STMT (dr);
1164 stmt_info = vinfo_for_stmt (stmt);
1165 /* For interleaving, only the alignment of the first access
1166 matters. */
1167 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1168 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1169 continue;
1170
1171 save_misalignment = DR_MISALIGNMENT (dr);
1172 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1173 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1174 &body_cost_vec);
1175 SET_DR_MISALIGNMENT (dr, save_misalignment);
1176 }
1177
1178 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1179 outside_cost += vect_get_known_peeling_cost
1180 (loop_vinfo, elem->npeel, &dummy,
1181 /* ??? We use this cost as number of stmts with scalar_stmt cost,
1182 thus divide by that. This introduces rounding errors, thus better
1183 introduce a new cost kind (raw_cost? scalar_iter_cost?). */
1184 single_iter_cost / vect_get_stmt_cost (scalar_stmt),
1185 &prologue_cost_vec, &epilogue_cost_vec);
1186
1187 /* Prologue and epilogue costs are added to the target model later.
1188 These costs depend only on the scalar iteration cost, the
1189 number of peeling iterations finally chosen, and the number of
1190 misaligned statements. So discard the information found here. */
1191 prologue_cost_vec.release ();
1192 epilogue_cost_vec.release ();
1193
1194 if (inside_cost < min->inside_cost
1195 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1196 {
1197 min->inside_cost = inside_cost;
1198 min->outside_cost = outside_cost;
1199 min->body_cost_vec.release ();
1200 min->body_cost_vec = body_cost_vec;
1201 min->peel_info.dr = elem->dr;
1202 min->peel_info.npeel = elem->npeel;
1203 }
1204 else
1205 body_cost_vec.release ();
1206
1207 return 1;
1208 }
1209
1210
1211 /* Choose best peeling option by traversing peeling hash table and either
1212 choosing an option with the lowest cost (if cost model is enabled) or the
1213 option that aligns as many accesses as possible. */
1214
1215 static struct data_reference *
1216 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1217 unsigned int *npeel,
1218 stmt_vector_for_cost *body_cost_vec)
1219 {
1220 struct _vect_peel_extended_info res;
1221
1222 res.peel_info.dr = NULL;
1223 res.body_cost_vec = stmt_vector_for_cost ();
1224
1225 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1226 {
1227 res.inside_cost = INT_MAX;
1228 res.outside_cost = INT_MAX;
1229 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1230 ->traverse <_vect_peel_extended_info *,
1231 vect_peeling_hash_get_lowest_cost> (&res);
1232 }
1233 else
1234 {
1235 res.peel_info.count = 0;
1236 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1237 ->traverse <_vect_peel_extended_info *,
1238 vect_peeling_hash_get_most_frequent> (&res);
1239 }
1240
1241 *npeel = res.peel_info.npeel;
1242 *body_cost_vec = res.body_cost_vec;
1243 return res.peel_info.dr;
1244 }
1245
1246
1247 /* Function vect_enhance_data_refs_alignment
1248
1249 This pass will use loop versioning and loop peeling in order to enhance
1250 the alignment of data references in the loop.
1251
1252 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1253 original loop is to be vectorized. Any other loops that are created by
1254 the transformations performed in this pass are not supposed to be
1255 vectorized. This restriction will be relaxed.
1256
1257 This pass will require a cost model to guide it whether to apply peeling
1258 or versioning or a combination of the two. For example, the scheme that
1259 Intel uses when given a loop with several memory accesses is as follows:
1260 choose one memory access ('p') whose alignment you want to force by
1261 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1262 other accesses are not necessarily aligned, or (2) use loop versioning to
1263 generate one loop in which all accesses are aligned, and another loop in
1264 which only 'p' is necessarily aligned.
1265
1266 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1267 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1268 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1269
1270 Devising a cost model is the most critical aspect of this work. It will
1271 guide us on which access to peel for, whether to use loop versioning, how
1272 many versions to create, etc. The cost model will probably consist of
1273 generic considerations as well as target specific considerations (on
1274 powerpc for example, misaligned stores are more painful than misaligned
1275 loads).
1276
1277 Here are the general steps involved in alignment enhancements:
1278
1279 -- original loop, before alignment analysis:
1280 for (i=0; i<N; i++){
1281 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1282 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1283 }
1284
1285 -- After vect_compute_data_refs_alignment:
1286 for (i=0; i<N; i++){
1287 x = q[i]; # DR_MISALIGNMENT(q) = 3
1288 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1289 }
1290
1291 -- Possibility 1: we do loop versioning:
1292 if (p is aligned) {
1293 for (i=0; i<N; i++){ # loop 1A
1294 x = q[i]; # DR_MISALIGNMENT(q) = 3
1295 p[i] = y; # DR_MISALIGNMENT(p) = 0
1296 }
1297 }
1298 else {
1299 for (i=0; i<N; i++){ # loop 1B
1300 x = q[i]; # DR_MISALIGNMENT(q) = 3
1301 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1302 }
1303 }
1304
1305 -- Possibility 2: we do loop peeling:
1306 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1307 x = q[i];
1308 p[i] = y;
1309 }
1310 for (i = 3; i < N; i++){ # loop 2A
1311 x = q[i]; # DR_MISALIGNMENT(q) = 0
1312 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1313 }
1314
1315 -- Possibility 3: combination of loop peeling and versioning:
1316 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1317 x = q[i];
1318 p[i] = y;
1319 }
1320 if (p is aligned) {
1321 for (i = 3; i<N; i++){ # loop 3A
1322 x = q[i]; # DR_MISALIGNMENT(q) = 0
1323 p[i] = y; # DR_MISALIGNMENT(p) = 0
1324 }
1325 }
1326 else {
1327 for (i = 3; i<N; i++){ # loop 3B
1328 x = q[i]; # DR_MISALIGNMENT(q) = 0
1329 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1330 }
1331 }
1332
1333 These loops are later passed to loop_transform to be vectorized. The
1334 vectorizer will use the alignment information to guide the transformation
1335 (whether to generate regular loads/stores, or with special handling for
1336 misalignment). */
1337
1338 bool
1339 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1340 {
1341 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1342 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1343 enum dr_alignment_support supportable_dr_alignment;
1344 struct data_reference *dr0 = NULL, *first_store = NULL;
1345 struct data_reference *dr;
1346 unsigned int i, j;
1347 bool do_peeling = false;
1348 bool do_versioning = false;
1349 bool stat;
1350 gimple stmt;
1351 stmt_vec_info stmt_info;
1352 unsigned int npeel = 0;
1353 bool all_misalignments_unknown = true;
1354 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1355 unsigned possible_npeel_number = 1;
1356 tree vectype;
1357 unsigned int nelements, mis, same_align_drs_max = 0;
1358 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1359
1360 if (dump_enabled_p ())
1361 dump_printf_loc (MSG_NOTE, vect_location,
1362 "=== vect_enhance_data_refs_alignment ===\n");
1363
1364 /* While cost model enhancements are expected in the future, the high level
1365 view of the code at this time is as follows:
1366
1367 A) If there is a misaligned access then see if peeling to align
1368 this access can make all data references satisfy
1369 vect_supportable_dr_alignment. If so, update data structures
1370 as needed and return true.
1371
1372 B) If peeling wasn't possible and there is a data reference with an
1373 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1374 then see if loop versioning checks can be used to make all data
1375 references satisfy vect_supportable_dr_alignment. If so, update
1376 data structures as needed and return true.
1377
1378 C) If neither peeling nor versioning were successful then return false if
1379 any data reference does not satisfy vect_supportable_dr_alignment.
1380
1381 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1382
1383 Note, Possibility 3 above (which is peeling and versioning together) is not
1384 being done at this time. */
1385
1386 /* (1) Peeling to force alignment. */
1387
1388 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1389 Considerations:
1390 + How many accesses will become aligned due to the peeling
1391 - How many accesses will become unaligned due to the peeling,
1392 and the cost of misaligned accesses.
1393 - The cost of peeling (the extra runtime checks, the increase
1394 in code size). */
1395
1396 FOR_EACH_VEC_ELT (datarefs, i, dr)
1397 {
1398 stmt = DR_STMT (dr);
1399 stmt_info = vinfo_for_stmt (stmt);
1400
1401 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1402 continue;
1403
1404 /* For interleaving, only the alignment of the first access
1405 matters. */
1406 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1407 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1408 continue;
1409
1410 /* For invariant accesses there is nothing to enhance. */
1411 if (integer_zerop (DR_STEP (dr)))
1412 continue;
1413
1414 /* Strided loads perform only component accesses, alignment is
1415 irrelevant for them. */
1416 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1417 continue;
1418
1419 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1420 do_peeling = vector_alignment_reachable_p (dr);
1421 if (do_peeling)
1422 {
1423 if (known_alignment_for_access_p (dr))
1424 {
1425 unsigned int npeel_tmp;
1426 bool negative = tree_int_cst_compare (DR_STEP (dr),
1427 size_zero_node) < 0;
1428
1429 /* Save info about DR in the hash table. */
1430 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1431 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1432 = new hash_table<peel_info_hasher> (1);
1433
1434 vectype = STMT_VINFO_VECTYPE (stmt_info);
1435 nelements = TYPE_VECTOR_SUBPARTS (vectype);
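	  /* Express the misalignment in elements and compute how many
	     scalar iterations are needed to reach an aligned address
	     (for a negative step the access walks backwards through
	     memory).  */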
1436 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1437 TREE_TYPE (DR_REF (dr))));
1438 npeel_tmp = (negative
1439 ? (mis - nelements) : (nelements - mis))
1440 & (nelements - 1);
1441
1442 /* For multiple types, it is possible that the bigger type access
1443 will have more than one peeling option. E.g., a loop with two
1444 types: one of size (vector size / 4), and the other one of
1445 size (vector size / 8). The vectorization factor will be 8. If both
1446 accesses are misaligned by 3, the first one needs one scalar
1447 iteration to be aligned, and the second one needs 5. But the
1448 first one will also be aligned by peeling 5 scalar
1449 iterations, and in that case both accesses will be aligned.
1450 Hence, except for the immediate peeling amount, we also want
1451 to try to add a full vector size, as long as we don't exceed the
1452 vectorization factor.
1453 We do this automatically for the cost model, since we calculate the
1454 cost for every peeling option. */
1455 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1456 possible_npeel_number = vf / nelements;
1457
1458 /* Handle the aligned case. We may decide to align some other
1459 access, making DR unaligned. */
1460 if (DR_MISALIGNMENT (dr) == 0)
1461 {
1462 npeel_tmp = 0;
1463 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1464 possible_npeel_number++;
1465 }
1466
1467 for (j = 0; j < possible_npeel_number; j++)
1468 {
1469 gcc_assert (npeel_tmp <= vf);
1470 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1471 npeel_tmp += nelements;
1472 }
1473
1474 all_misalignments_unknown = false;
1475 /* Data-ref that was chosen for the case that all the
1476 misalignments are unknown is not relevant anymore, since we
1477 have a data-ref with known alignment. */
1478 dr0 = NULL;
1479 }
1480 else
1481 {
1482 /* If we don't know any misalignment values, we prefer
1483 peeling for the data-ref that has the maximum number of data-refs
1484 with the same alignment, unless the target prefers to align
1485 stores over loads. */
1486 if (all_misalignments_unknown)
1487 {
1488 unsigned same_align_drs
1489 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1490 if (!dr0
1491 || same_align_drs_max < same_align_drs)
1492 {
1493 same_align_drs_max = same_align_drs;
1494 dr0 = dr;
1495 }
1496 /* For data-refs with the same number of related
1497 accesses prefer the one where the misalign
1498 computation will be invariant in the outermost loop. */
1499 else if (same_align_drs_max == same_align_drs)
1500 {
1501 struct loop *ivloop0, *ivloop;
1502 ivloop0 = outermost_invariant_loop_for_expr
1503 (loop, DR_BASE_ADDRESS (dr0));
1504 ivloop = outermost_invariant_loop_for_expr
1505 (loop, DR_BASE_ADDRESS (dr));
1506 if ((ivloop && !ivloop0)
1507 || (ivloop && ivloop0
1508 && flow_loop_nested_p (ivloop, ivloop0)))
1509 dr0 = dr;
1510 }
1511
1512 if (!first_store && DR_IS_WRITE (dr))
1513 first_store = dr;
1514 }
1515
1516 /* If there are both known and unknown misaligned accesses in the
1517 loop, we choose peeling amount according to the known
1518 accesses. */
1519 if (!supportable_dr_alignment)
1520 {
1521 dr0 = dr;
1522 if (!first_store && DR_IS_WRITE (dr))
1523 first_store = dr;
1524 }
1525 }
1526 }
1527 else
1528 {
1529 if (!aligned_access_p (dr))
1530 {
1531 if (dump_enabled_p ())
1532 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533 "vector alignment may not be reachable\n");
1534 break;
1535 }
1536 }
1537 }
1538
1539 /* Check if we can possibly peel the loop. */
1540 if (!vect_can_advance_ivs_p (loop_vinfo)
1541 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1542 do_peeling = false;
1543
1544 /* If we don't know how many times the peeling loop will run,
1545 assume it will run VF-1 times and disable peeling if the remaining
1546 iters are less than the vectorization factor. */
1547 if (do_peeling
1548 && all_misalignments_unknown
1549 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1550 && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1551 < 2 * (unsigned) LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1))
1552 do_peeling = false;
1553
1554 if (do_peeling
1555 && all_misalignments_unknown
1556 && vect_supportable_dr_alignment (dr0, false))
1557 {
1558 /* Check if the target requires us to prefer stores over loads, i.e., if
1559 misaligned stores are more expensive than misaligned loads (taking
1560 drs with the same alignment into account). */
1561 if (first_store && DR_IS_READ (dr0))
1562 {
1563 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1564 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1565 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1566 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1567 stmt_vector_for_cost dummy;
1568 dummy.create (2);
1569
1570 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1571 &dummy);
1572 vect_get_data_access_cost (first_store, &store_inside_cost,
1573 &store_outside_cost, &dummy);
1574
1575 dummy.release ();
1576
1577 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1578 aligning the load DR0). */
1579 load_inside_penalty = store_inside_cost;
1580 load_outside_penalty = store_outside_cost;
1581 for (i = 0;
1582 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1583 DR_STMT (first_store))).iterate (i, &dr);
1584 i++)
1585 if (DR_IS_READ (dr))
1586 {
1587 load_inside_penalty += load_inside_cost;
1588 load_outside_penalty += load_outside_cost;
1589 }
1590 else
1591 {
1592 load_inside_penalty += store_inside_cost;
1593 load_outside_penalty += store_outside_cost;
1594 }
1595
1596 /* Calculate the penalty for leaving DR0 unaligned (by
1597 aligning the FIRST_STORE). */
1598 store_inside_penalty = load_inside_cost;
1599 store_outside_penalty = load_outside_cost;
1600 for (i = 0;
1601 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1602 DR_STMT (dr0))).iterate (i, &dr);
1603 i++)
1604 if (DR_IS_READ (dr))
1605 {
1606 store_inside_penalty += load_inside_cost;
1607 store_outside_penalty += load_outside_cost;
1608 }
1609 else
1610 {
1611 store_inside_penalty += store_inside_cost;
1612 store_outside_penalty += store_outside_cost;
1613 }
1614
1615 if (load_inside_penalty > store_inside_penalty
1616 || (load_inside_penalty == store_inside_penalty
1617 && load_outside_penalty > store_outside_penalty))
1618 dr0 = first_store;
1619 }
1620
1621 /* In case there are only loads with different unknown misalignments, use
1622 peeling only if it may help to align other accesses in the loop. */
1623 if (!first_store
1624 && !STMT_VINFO_SAME_ALIGN_REFS (
1625 vinfo_for_stmt (DR_STMT (dr0))).length ()
1626 && vect_supportable_dr_alignment (dr0, false)
1627 != dr_unaligned_supported)
1628 do_peeling = false;
1629 }
1630
1631 if (do_peeling && !dr0)
1632 {
1633 /* Peeling is possible, but no data access strictly requires alignment
1634 to be vectorizable. So we try to choose the best possible peeling. */
1635
1636 /* We should get here only if there are drs with known misalignment. */
1637 gcc_assert (!all_misalignments_unknown);
1638
1639 /* Choose the best peeling from the hash table. */
1640 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1641 &body_cost_vec);
1642 if (!dr0 || !npeel)
1643 do_peeling = false;
1644
1645 /* If peeling by npeel will result in a remaining loop not iterating
1646 enough to be vectorized then do not peel. */
1647 if (do_peeling
1648 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1649 && (LOOP_VINFO_INT_NITERS (loop_vinfo)
1650 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + npeel))
1651 do_peeling = false;
1652 }
1653
1654 if (do_peeling)
1655 {
1656 stmt = DR_STMT (dr0);
1657 stmt_info = vinfo_for_stmt (stmt);
1658 vectype = STMT_VINFO_VECTYPE (stmt_info);
1659 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1660
1661 if (known_alignment_for_access_p (dr0))
1662 {
1663 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1664 size_zero_node) < 0;
1665 if (!npeel)
1666 {
1667 /* Since it's known at compile time, compute the number of
1668 iterations in the peeled loop (the peeling factor) for use in
1669 updating DR_MISALIGNMENT values. The peeling factor is the
1670 vectorization factor minus the misalignment as an element
1671 count. */
1672 mis = DR_MISALIGNMENT (dr0);
1673 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1674 npeel = ((negative ? mis - nelements : nelements - mis)
1675 & (nelements - 1));
1676 }
1677
1678 /* For interleaved data access every iteration accesses all the
1679 members of the group, therefore we divide the number of iterations
1680 by the group size. */
1681 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1682 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1683 npeel /= GROUP_SIZE (stmt_info);
1684
1685 if (dump_enabled_p ())
1686 dump_printf_loc (MSG_NOTE, vect_location,
1687 "Try peeling by %d\n", npeel);
1688 }
1689
1690 /* Ensure that all data refs can be vectorized after the peel. */
1691 FOR_EACH_VEC_ELT (datarefs, i, dr)
1692 {
1693 int save_misalignment;
1694
1695 if (dr == dr0)
1696 continue;
1697
1698 stmt = DR_STMT (dr);
1699 stmt_info = vinfo_for_stmt (stmt);
1700 /* For interleaving, only the alignment of the first access
1701 matters. */
1702 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1703 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1704 continue;
1705
1706 /* Strided loads perform only component accesses; alignment is
1707 irrelevant for them. */
1708 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1709 continue;
1710
1711 save_misalignment = DR_MISALIGNMENT (dr);
1712 vect_update_misalignment_for_peel (dr, dr0, npeel);
1713 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1714 SET_DR_MISALIGNMENT (dr, save_misalignment);
1715
1716 if (!supportable_dr_alignment)
1717 {
1718 do_peeling = false;
1719 break;
1720 }
1721 }
1722
1723 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1724 {
1725 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1726 if (!stat)
1727 do_peeling = false;
1728 else
1729 {
1730 body_cost_vec.release ();
1731 return stat;
1732 }
1733 }
1734
1735 if (do_peeling)
1736 {
1737 unsigned max_allowed_peel
1738 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1739 if (max_allowed_peel != (unsigned)-1)
1740 {
1741 unsigned max_peel = npeel;
1742 if (max_peel == 0)
1743 {
1744 gimple dr_stmt = DR_STMT (dr0);
1745 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1746 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1747 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1748 }
1749 if (max_peel > max_allowed_peel)
1750 {
1751 do_peeling = false;
1752 if (dump_enabled_p ())
1753 dump_printf_loc (MSG_NOTE, vect_location,
1754 "Disable peeling, max peels reached: %d\n", max_peel);
1755 }
1756 }
1757 }
1758
1759 if (do_peeling)
1760 {
1761 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1762 If the misalignment of DR_i is identical to that of dr0 then set
1763 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1764 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1765 by the peeling factor times the element size of DR_i (MOD the
1766 vectorization factor times the size). Otherwise, the
1767 misalignment of DR_i must be set to unknown. */
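/* Sketch of the rule above with assumed numbers (not from the source): if
   the vectorization factor times the element size is 16 bytes, npeel == 2
   and DR_i has 4-byte elements with a known misalignment of 4, the updated
   misalignment is (4 + 2 * 4) % 16 == 12.  */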
1768 FOR_EACH_VEC_ELT (datarefs, i, dr)
1769 if (dr != dr0)
1770 vect_update_misalignment_for_peel (dr, dr0, npeel);
1771
1772 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1773 if (npeel)
1774 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1775 else
1776 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1777 = DR_MISALIGNMENT (dr0);
1778 SET_DR_MISALIGNMENT (dr0, 0);
1779 if (dump_enabled_p ())
1780 {
1781 dump_printf_loc (MSG_NOTE, vect_location,
1782 "Alignment of access forced using peeling.\n");
1783 dump_printf_loc (MSG_NOTE, vect_location,
1784 "Peeling for alignment will be applied.\n");
1785 }
1786 /* The inside-loop cost will be accounted for in vectorizable_load
1787 and vectorizable_store correctly with adjusted alignments.
1788 Drop the body_cost_vec on the floor here. */
1789 body_cost_vec.release ();
1790
1791 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1792 gcc_assert (stat);
1793 return stat;
1794 }
1795 }
1796
1797 body_cost_vec.release ();
1798
1799 /* (2) Versioning to force alignment. */
1800
1801 /* Try versioning if:
1802 1) optimize loop for speed
1803 2) there is at least one unsupported misaligned data ref with an unknown
1804 misalignment, and
1805 3) all misaligned data refs with a known misalignment are supported, and
1806 4) the number of runtime alignment checks is within reason. */
1807
1808 do_versioning =
1809 optimize_loop_nest_for_speed_p (loop)
1810 && (!loop->inner); /* FORNOW */
1811
1812 if (do_versioning)
1813 {
1814 FOR_EACH_VEC_ELT (datarefs, i, dr)
1815 {
1816 stmt = DR_STMT (dr);
1817 stmt_info = vinfo_for_stmt (stmt);
1818
1819 /* For interleaving, only the alignment of the first access
1820 matters. */
1821 if (aligned_access_p (dr)
1822 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1823 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1824 continue;
1825
1826 /* Strided loads perform only component accesses; alignment is
1827 irrelevant for them. */
1828 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1829 continue;
1830
1831 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1832
1833 if (!supportable_dr_alignment)
1834 {
1835 gimple stmt;
1836 int mask;
1837 tree vectype;
1838
1839 if (known_alignment_for_access_p (dr)
1840 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1841 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1842 {
1843 do_versioning = false;
1844 break;
1845 }
1846
1847 stmt = DR_STMT (dr);
1848 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1849 gcc_assert (vectype);
1850
1851 /* The rightmost bits of an aligned address must be zeros.
1852 Construct the mask needed for this test. For example,
1853 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1854 mask must be 15 = 0xf. */
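/* Sketch of how the mask is used later (an assumption about the runtime
   guard emitted elsewhere, not code in this function): the versioned loop
   is guarded by a test equivalent to
     ((addr_1 | addr_2 | ... | addr_n) & mask) == 0
   so one check covers all recorded may-misalign references.  */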
1855 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1856
1857 /* FORNOW: use the same mask to test all potentially unaligned
1858 references in the loop. The vectorizer currently supports
1859 a single vector size, see the reference to
1860 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1861 vectorization factor is computed. */
1862 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1863 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1864 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1865 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1866 DR_STMT (dr));
1867 }
1868 }
1869
1870 /* Versioning requires at least one misaligned data reference. */
1871 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1872 do_versioning = false;
1873 else if (!do_versioning)
1874 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1875 }
1876
1877 if (do_versioning)
1878 {
1879 vec<gimple> may_misalign_stmts
1880 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1881 gimple stmt;
1882
1883 /* It can now be assumed that the data references in the statements
1884 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1885 of the loop being vectorized. */
1886 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1887 {
1888 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1889 dr = STMT_VINFO_DATA_REF (stmt_info);
1890 SET_DR_MISALIGNMENT (dr, 0);
1891 if (dump_enabled_p ())
1892 dump_printf_loc (MSG_NOTE, vect_location,
1893 "Alignment of access forced using versioning.\n");
1894 }
1895
1896 if (dump_enabled_p ())
1897 dump_printf_loc (MSG_NOTE, vect_location,
1898 "Versioning for alignment will be applied.\n");
1899
1900 /* Peeling and versioning can't be done together at this time. */
1901 gcc_assert (! (do_peeling && do_versioning));
1902
1903 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1904 gcc_assert (stat);
1905 return stat;
1906 }
1907
1908 /* This point is reached if neither peeling nor versioning is being done. */
1909 gcc_assert (! (do_peeling || do_versioning));
1910
1911 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1912 return stat;
1913 }
1914
1915
1916 /* Function vect_find_same_alignment_drs.
1917
1918 Update group and alignment relations according to the chosen
1919 vectorization factor. */
1920
1921 static void
1922 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1923 loop_vec_info loop_vinfo)
1924 {
1925 unsigned int i;
1926 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1927 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1928 struct data_reference *dra = DDR_A (ddr);
1929 struct data_reference *drb = DDR_B (ddr);
1930 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1931 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1932 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1933 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1934 lambda_vector dist_v;
1935 unsigned int loop_depth;
1936
1937 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1938 return;
1939
1940 if (dra == drb)
1941 return;
1942
1943 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1944 return;
1945
1946 /* Loop-based vectorization and known data dependence. */
1947 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1948 return;
1949
1950 /* Data-dependence analysis reports a distance vector of zero
1951 for data-references that overlap only in the first iteration
1952 but have different sign step (see PR45764).
1953 So as a sanity check require equal DR_STEP. */
1954 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1955 return;
1956
1957 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1958 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1959 {
1960 int dist = dist_v[loop_depth];
1961
1962 if (dump_enabled_p ())
1963 dump_printf_loc (MSG_NOTE, vect_location,
1964 "dependence distance = %d.\n", dist);
1965
1966 /* Same loop iteration. */
1967 if (dist == 0
1968 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1969 {
1970 /* Two references with distance zero have the same alignment. */
1971 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1972 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1973 if (dump_enabled_p ())
1974 {
1975 dump_printf_loc (MSG_NOTE, vect_location,
1976 "accesses have the same alignment.\n");
1977 dump_printf (MSG_NOTE,
1978 "dependence distance modulo vf == 0 between ");
1979 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1980 dump_printf (MSG_NOTE, " and ");
1981 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1982 dump_printf (MSG_NOTE, "\n");
1983 }
1984 }
1985 }
1986 }
1987
1988
1989 /* Function vect_analyze_data_refs_alignment
1990
1991 Analyze the alignment of the data-references in the loop.
1992 Return FALSE if a data reference is found that cannot be vectorized. */
1993
1994 bool
1995 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1996 bb_vec_info bb_vinfo)
1997 {
1998 if (dump_enabled_p ())
1999 dump_printf_loc (MSG_NOTE, vect_location,
2000 "=== vect_analyze_data_refs_alignment ===\n");
2001
2002 /* Mark groups of data references with same alignment using
2003 data dependence information. */
2004 if (loop_vinfo)
2005 {
2006 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
2007 struct data_dependence_relation *ddr;
2008 unsigned int i;
2009
2010 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2011 vect_find_same_alignment_drs (ddr, loop_vinfo);
2012 }
2013
2014 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2015 {
2016 if (dump_enabled_p ())
2017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018 "not vectorized: can't calculate alignment "
2019 "for data ref.\n");
2020 return false;
2021 }
2022
2023 return true;
2024 }
2025
2026
2027 /* Analyze groups of accesses: check that DR belongs to a group of
2028 accesses of legal size, step, etc. Detect gaps, single element
2029 interleaving, and other special cases. Set grouped access info.
2030 Collect groups of strided stores for further use in SLP analysis. */
2031
2032 static bool
2033 vect_analyze_group_access (struct data_reference *dr)
2034 {
2035 tree step = DR_STEP (dr);
2036 tree scalar_type = TREE_TYPE (DR_REF (dr));
2037 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2038 gimple stmt = DR_STMT (dr);
2039 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2040 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2041 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2042 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2043 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2044 bool slp_impossible = false;
2045 struct loop *loop = NULL;
2046
2047 if (loop_vinfo)
2048 loop = LOOP_VINFO_LOOP (loop_vinfo);
2049
2050 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2051 size of the interleaving group (including gaps). */
2052 groupsize = absu_hwi (dr_step) / type_size;
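/* For example (illustrative only): a load of 4-byte elements with
   DR_STEP == 16 gives groupsize == 4, i.e. an interleaving group of four
   elements per scalar iteration, whether or not every slot is accessed.  */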
2053
2054 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2055 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2056 {
2057 /* Check if this DR is part of an interleaving group, and is a single
2058 element of the group that is accessed in the loop. */
2059
2060 /* Gaps are supported only for loads. STEP must be a multiple of the type
2061 size. The size of the group must be a power of 2. */
2062 if (DR_IS_READ (dr)
2063 && (dr_step % type_size) == 0
2064 && groupsize > 0
2065 && exact_log2 (groupsize) != -1)
2066 {
2067 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2068 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2069 if (dump_enabled_p ())
2070 {
2071 dump_printf_loc (MSG_NOTE, vect_location,
2072 "Detected single element interleaving ");
2073 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2074 dump_printf (MSG_NOTE, " step ");
2075 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2076 dump_printf (MSG_NOTE, "\n");
2077 }
2078
2079 if (loop_vinfo)
2080 {
2081 if (dump_enabled_p ())
2082 dump_printf_loc (MSG_NOTE, vect_location,
2083 "Data access with gaps requires scalar "
2084 "epilogue loop\n");
2085 if (loop->inner)
2086 {
2087 if (dump_enabled_p ())
2088 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2089 "Peeling for outer loop is not"
2090 " supported\n");
2091 return false;
2092 }
2093
2094 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2095 }
2096
2097 return true;
2098 }
2099
2100 if (dump_enabled_p ())
2101 {
2102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2103 "not consecutive access ");
2104 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2105 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2106 }
2107
2108 if (bb_vinfo)
2109 {
2110 /* Mark the statement as unvectorizable. */
2111 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2112 return true;
2113 }
2114
2115 return false;
2116 }
2117
2118 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2119 {
2120 /* First stmt in the interleaving chain. Check the chain. */
2121 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2122 struct data_reference *data_ref = dr;
2123 unsigned int count = 1;
2124 tree prev_init = DR_INIT (data_ref);
2125 gimple prev = stmt;
2126 HOST_WIDE_INT diff, gaps = 0;
2127 unsigned HOST_WIDE_INT count_in_bytes;
2128
2129 while (next)
2130 {
2131 /* Skip same data-refs. In case that two or more stmts share
2132 data-ref (supported only for loads), we vectorize only the first
2133 stmt, and the rest get their vectorized loads from the first
2134 one. */
2135 if (!tree_int_cst_compare (DR_INIT (data_ref),
2136 DR_INIT (STMT_VINFO_DATA_REF (
2137 vinfo_for_stmt (next)))))
2138 {
2139 if (DR_IS_WRITE (data_ref))
2140 {
2141 if (dump_enabled_p ())
2142 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2143 "Two store stmts share the same dr.\n");
2144 return false;
2145 }
2146
2147 /* For load use the same data-ref load. */
2148 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2149
2150 prev = next;
2151 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2152 continue;
2153 }
2154
2155 prev = next;
2156 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2157
2158 /* All group members have the same STEP by construction. */
2159 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2160
2161 /* Check that the distance between two accesses is equal to the type
2162 size. Otherwise, we have gaps. */
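/* E.g. (illustrative): with 4-byte elements and DR_INITs 0 and 8, the
   diff computed below is (8 - 0) / 4 == 2, i.e. one element of gap
   between the two group members.  */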
2163 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2164 - TREE_INT_CST_LOW (prev_init)) / type_size;
2165 if (diff != 1)
2166 {
2167 /* FORNOW: SLP of accesses with gaps is not supported. */
2168 slp_impossible = true;
2169 if (DR_IS_WRITE (data_ref))
2170 {
2171 if (dump_enabled_p ())
2172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2173 "interleaved store with gaps\n");
2174 return false;
2175 }
2176
2177 gaps += diff - 1;
2178 }
2179
2180 last_accessed_element += diff;
2181
2182 /* Store the gap from the previous member of the group. If there is no
2183 gap in the access, GROUP_GAP is always 1. */
2184 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2185
2186 prev_init = DR_INIT (data_ref);
2187 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2188 /* Count the number of data-refs in the chain. */
2189 count++;
2190 }
2191
2192 /* COUNT is the number of accesses found, we multiply it by the size of
2193 the type to get COUNT_IN_BYTES. */
2194 count_in_bytes = type_size * count;
2195
2196 /* Check that the size of the interleaving (including gaps) is not
2197 greater than STEP. */
2198 if (dr_step != 0
2199 && absu_hwi (dr_step) < count_in_bytes + gaps * type_size)
2200 {
2201 if (dump_enabled_p ())
2202 {
2203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2204 "interleaving size is greater than step for ");
2205 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2206 DR_REF (dr));
2207 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2208 }
2209 return false;
2210 }
2211
2212 /* Check that the size of the interleaving is equal to STEP for stores,
2213 i.e., that there are no gaps. */
2214 if (dr_step != 0
2215 && absu_hwi (dr_step) != count_in_bytes)
2216 {
2217 if (DR_IS_READ (dr))
2218 {
2219 slp_impossible = true;
2220 /* There is a gap after the last load in the group. This gap is a
2221 difference between the groupsize and the number of elements.
2222 When there is no gap, this difference should be 0. */
2223 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2224 }
2225 else
2226 {
2227 if (dump_enabled_p ())
2228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2229 "interleaved store with gaps\n");
2230 return false;
2231 }
2232 }
2233
2234 /* Check that STEP is a multiple of type size. */
2235 if (dr_step != 0
2236 && (dr_step % type_size) != 0)
2237 {
2238 if (dump_enabled_p ())
2239 {
2240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2241 "step is not a multiple of type size: step ");
2242 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2243 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2244 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2245 TYPE_SIZE_UNIT (scalar_type));
2246 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2247 }
2248 return false;
2249 }
2250
2251 if (groupsize == 0)
2252 groupsize = count;
2253
2254 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2255 if (dump_enabled_p ())
2256 dump_printf_loc (MSG_NOTE, vect_location,
2257 "Detected interleaving of size %d\n", (int)groupsize);
2258
2259 /* SLP: create an SLP data structure for every interleaving group of
2260 stores for further analysis in vect_analyze_slp. */
2261 if (DR_IS_WRITE (dr) && !slp_impossible)
2262 {
2263 if (loop_vinfo)
2264 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2265 if (bb_vinfo)
2266 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2267 }
2268
2269 /* There is a gap at the end of the group. */
2270 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2271 {
2272 if (dump_enabled_p ())
2273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2274 "Data access with gaps requires scalar "
2275 "epilogue loop\n");
2276 if (loop->inner)
2277 {
2278 if (dump_enabled_p ())
2279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2280 "Peeling for outer loop is not supported\n");
2281 return false;
2282 }
2283
2284 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2285 }
2286 }
2287
2288 return true;
2289 }
2290
2291
2292 /* Analyze the access pattern of the data-reference DR.
2293 In case of non-consecutive accesses call vect_analyze_group_access() to
2294 analyze groups of accesses. */
2295
2296 static bool
2297 vect_analyze_data_ref_access (struct data_reference *dr)
2298 {
2299 tree step = DR_STEP (dr);
2300 tree scalar_type = TREE_TYPE (DR_REF (dr));
2301 gimple stmt = DR_STMT (dr);
2302 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2303 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2304 struct loop *loop = NULL;
2305
2306 if (loop_vinfo)
2307 loop = LOOP_VINFO_LOOP (loop_vinfo);
2308
2309 if (loop_vinfo && !step)
2310 {
2311 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2313 "bad data-ref access in loop\n");
2314 return false;
2315 }
2316
2317 /* Allow invariant loads in loops that are not nested. */
2318 if (loop_vinfo && integer_zerop (step))
2319 {
2320 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2321 if (nested_in_vect_loop_p (loop, stmt))
2322 {
2323 if (dump_enabled_p ())
2324 dump_printf_loc (MSG_NOTE, vect_location,
2325 "zero step in inner loop of nest\n");
2326 return false;
2327 }
2328 return DR_IS_READ (dr);
2329 }
2330
2331 if (loop && nested_in_vect_loop_p (loop, stmt))
2332 {
2333 /* Interleaved accesses are not yet supported within outer-loop
2334 vectorization for references in the inner-loop. */
2335 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2336
2337 /* For the rest of the analysis we use the outer-loop step. */
2338 step = STMT_VINFO_DR_STEP (stmt_info);
2339 if (integer_zerop (step))
2340 {
2341 if (dump_enabled_p ())
2342 dump_printf_loc (MSG_NOTE, vect_location,
2343 "zero step in outer loop.\n");
2344 if (DR_IS_READ (dr))
2345 return true;
2346 else
2347 return false;
2348 }
2349 }
2350
2351 /* Consecutive? */
2352 if (TREE_CODE (step) == INTEGER_CST)
2353 {
2354 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2355 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2356 || (dr_step < 0
2357 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2358 {
2359 /* Mark that it is not interleaving. */
2360 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2361 return true;
2362 }
2363 }
2364
2365 if (loop && nested_in_vect_loop_p (loop, stmt))
2366 {
2367 if (dump_enabled_p ())
2368 dump_printf_loc (MSG_NOTE, vect_location,
2369 "grouped access in outer loop.\n");
2370 return false;
2371 }
2372
2373 /* Assume this is a DR handled by the non-constant strided load case. */
2374 if (TREE_CODE (step) != INTEGER_CST)
2375 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2376
2377 /* Non-consecutive access - check whether it is part of an interleaving group. */
2378 return vect_analyze_group_access (dr);
2379 }
2380
2381
2382
2383 /* A helper function used in the comparator function to sort data
2384 references. T1 and T2 are two data references to be compared.
2385 The function returns -1, 0, or 1. */
2386
2387 static int
2388 compare_tree (tree t1, tree t2)
2389 {
2390 int i, cmp;
2391 enum tree_code code;
2392 char tclass;
2393
2394 if (t1 == t2)
2395 return 0;
2396 if (t1 == NULL)
2397 return -1;
2398 if (t2 == NULL)
2399 return 1;
2400
2401
2402 if (TREE_CODE (t1) != TREE_CODE (t2))
2403 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2404
2405 code = TREE_CODE (t1);
2406 switch (code)
2407 {
2408 /* For const values, we can just use hash values for comparisons. */
2409 case INTEGER_CST:
2410 case REAL_CST:
2411 case FIXED_CST:
2412 case STRING_CST:
2413 case COMPLEX_CST:
2414 case VECTOR_CST:
2415 {
2416 hashval_t h1 = iterative_hash_expr (t1, 0);
2417 hashval_t h2 = iterative_hash_expr (t2, 0);
2418 if (h1 != h2)
2419 return h1 < h2 ? -1 : 1;
2420 break;
2421 }
2422
2423 case SSA_NAME:
2424 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2425 if (cmp != 0)
2426 return cmp;
2427
2428 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2429 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2430 break;
2431
2432 default:
2433 tclass = TREE_CODE_CLASS (code);
2434
2435 /* For var-decl, we could compare their UIDs. */
2436 if (tclass == tcc_declaration)
2437 {
2438 if (DECL_UID (t1) != DECL_UID (t2))
2439 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2440 break;
2441 }
2442
2443 /* For expressions with operands, compare their operands recursively. */
2444 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2445 {
2446 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2447 if (cmp != 0)
2448 return cmp;
2449 }
2450 }
2451
2452 return 0;
2453 }
2454
2455
2456 /* Compare two data-references DRA and DRB to sort them into chunks
2457 suitable for grouping. */
2458
2459 static int
2460 dr_group_sort_cmp (const void *dra_, const void *drb_)
2461 {
2462 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2463 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2464 int cmp;
2465
2466 /* Stabilize sort. */
2467 if (dra == drb)
2468 return 0;
2469
2470 /* Ordering of DRs according to base. */
2471 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2472 {
2473 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2474 if (cmp != 0)
2475 return cmp;
2476 }
2477
2478 /* And according to DR_OFFSET. */
2479 if (!dr_equal_offsets_p (dra, drb))
2480 {
2481 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2482 if (cmp != 0)
2483 return cmp;
2484 }
2485
2486 /* Put reads before writes. */
2487 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2488 return DR_IS_READ (dra) ? -1 : 1;
2489
2490 /* Then sort after access size. */
2491 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2492 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2493 {
2494 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2495 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2496 if (cmp != 0)
2497 return cmp;
2498 }
2499
2500 /* And after step. */
2501 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2502 {
2503 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2504 if (cmp != 0)
2505 return cmp;
2506 }
2507
2508 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2509 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2510 if (cmp == 0)
2511 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2512 return cmp;
2513 }
2514
2515 /* Function vect_analyze_data_ref_accesses.
2516
2517 Analyze the access pattern of all the data references in the loop.
2518
2519 FORNOW: the only access pattern that is considered vectorizable is a
2520 simple step 1 (consecutive) access.
2521
2522 FORNOW: handle only arrays and pointer accesses. */
2523
2524 bool
2525 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2526 {
2527 unsigned int i;
2528 vec<data_reference_p> datarefs;
2529 struct data_reference *dr;
2530
2531 if (dump_enabled_p ())
2532 dump_printf_loc (MSG_NOTE, vect_location,
2533 "=== vect_analyze_data_ref_accesses ===\n");
2534
2535 if (loop_vinfo)
2536 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2537 else
2538 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2539
2540 if (datarefs.is_empty ())
2541 return true;
2542
2543 /* Sort the array of datarefs to make building the interleaving chains
2544 linear. Don't modify the original vector's order, it is needed for
2545 determining what dependencies are reversed. */
2546 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2547 datarefs_copy.qsort (dr_group_sort_cmp);
2548
2549 /* Build the interleaving chains. */
2550 for (i = 0; i < datarefs_copy.length () - 1;)
2551 {
2552 data_reference_p dra = datarefs_copy[i];
2553 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2554 stmt_vec_info lastinfo = NULL;
2555 for (i = i + 1; i < datarefs_copy.length (); ++i)
2556 {
2557 data_reference_p drb = datarefs_copy[i];
2558 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2559
2560 /* ??? Imperfect sorting (non-compatible types, non-modulo
2561 accesses, same accesses) can lead to a group being artificially
2562 split here as we don't just skip over those. If it really
2563 matters we can push those to a worklist and re-iterate
2564 over them. Then we can just skip ahead to the next DR here. */
2565
2566 /* Check that the data-refs have the same first location (except init)
2567 and they are both either store or load (not load and store,
2568 not masked loads or stores). */
2569 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2570 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2571 DR_BASE_ADDRESS (drb), 0)
2572 || !dr_equal_offsets_p (dra, drb)
2573 || !gimple_assign_single_p (DR_STMT (dra))
2574 || !gimple_assign_single_p (DR_STMT (drb)))
2575 break;
2576
2577 /* Check that the data-refs have the same constant size and step. */
2578 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2579 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2580 if (!tree_fits_uhwi_p (sza)
2581 || !tree_fits_uhwi_p (szb)
2582 || !tree_int_cst_equal (sza, szb)
2583 || !tree_fits_shwi_p (DR_STEP (dra))
2584 || !tree_fits_shwi_p (DR_STEP (drb))
2585 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2586 break;
2587
2588 /* Do not place the same access in the interleaving chain twice. */
2589 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2590 break;
2591
2592 /* Check the types are compatible.
2593 ??? We don't distinguish this during sorting. */
2594 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2595 TREE_TYPE (DR_REF (drb))))
2596 break;
2597
2598 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2599 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2600 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2601 gcc_assert (init_a < init_b);
2602
2603 /* If init_b == init_a + the size of the type * k, we have an
2604 interleaving, and DRA is accessed before DRB. */
2605 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2606 if ((init_b - init_a) % type_size_a != 0)
2607 break;
2608
2609 /* The step (if not zero) is greater than the difference between
2610 data-refs' inits. This splits groups into suitable sizes. */
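/* Illustrative instance: with DR_STEP == 16 and inits 0 and 16,
   init_b - init_a == 16 is not smaller than the step, so DRB starts a
   new group below instead of joining DRA's chain.  */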
2611 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2612 if (step != 0 && step <= (init_b - init_a))
2613 break;
2614
2615 if (dump_enabled_p ())
2616 {
2617 dump_printf_loc (MSG_NOTE, vect_location,
2618 "Detected interleaving ");
2619 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2620 dump_printf (MSG_NOTE, " and ");
2621 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2622 dump_printf (MSG_NOTE, "\n");
2623 }
2624
2625 /* Link the found element into the group list. */
2626 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2627 {
2628 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2629 lastinfo = stmtinfo_a;
2630 }
2631 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2632 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2633 lastinfo = stmtinfo_b;
2634 }
2635 }
2636
2637 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2638 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2639 && !vect_analyze_data_ref_access (dr))
2640 {
2641 if (dump_enabled_p ())
2642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2643 "not vectorized: complicated access pattern.\n");
2644
2645 if (bb_vinfo)
2646 {
2647 /* Mark the statement as not vectorizable. */
2648 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2649 continue;
2650 }
2651 else
2652 {
2653 datarefs_copy.release ();
2654 return false;
2655 }
2656 }
2657
2658 datarefs_copy.release ();
2659 return true;
2660 }
2661
2662
2663 /* Operator == between two dr_with_seg_len objects.
2664
2665 This equality operator is used to check whether two data refs
2666 are the same, so that we can consider combining the
2667 aliasing checks of those two pairs of data-dependent data
2668 refs. */
2669
2670 static bool
2671 operator == (const dr_with_seg_len& d1,
2672 const dr_with_seg_len& d2)
2673 {
2674 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2675 DR_BASE_ADDRESS (d2.dr), 0)
2676 && compare_tree (d1.offset, d2.offset) == 0
2677 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2678 }
2679
2680 /* Function comp_dr_with_seg_len_pair.
2681
2682 Comparison function for sorting objects of dr_with_seg_len_pair_t
2683 so that we can combine aliasing checks in one scan. */
2684
2685 static int
2686 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2687 {
2688 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2689 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2690
2691 const dr_with_seg_len &p11 = p1->first,
2692 &p12 = p1->second,
2693 &p21 = p2->first,
2694 &p22 = p2->second;
2695
2696 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2697 if a and c have the same base address and step, and b and d have the same
2698 address and step. Therefore, if either a&c or b&d do not have the same address
2699 and step, we do not care about the order of those two pairs after sorting. */
2700 int comp_res;
2701
2702 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2703 DR_BASE_ADDRESS (p21.dr))) != 0)
2704 return comp_res;
2705 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2706 DR_BASE_ADDRESS (p22.dr))) != 0)
2707 return comp_res;
2708 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2709 return comp_res;
2710 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2711 return comp_res;
2712 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2713 return comp_res;
2714 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2715 return comp_res;
2716
2717 return 0;
2718 }
2719
2720 /* Function vect_vfa_segment_size.
2721
2722 Create an expression that computes the size of the segment
2723 that will be accessed for a data reference. The function takes into
2724 account that realignment loads may access one more vector.
2725
2726 Input:
2727 DR: The data reference.
2728 LENGTH_FACTOR: segment length to consider.
2729
2730 Return an expression whose value is the size of segment which will be
2731 accessed by DR. */
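/* For example (a sketch with assumed values): a data reference with
   DR_STEP 4 and a LENGTH_FACTOR of 4 gets a segment length of 16 bytes;
   for a dr_explicit_realign_optimized access one extra vector size is
   added below.  */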
2732
2733 static tree
2734 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2735 {
2736 tree segment_length;
2737
2738 if (integer_zerop (DR_STEP (dr)))
2739 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2740 else
2741 segment_length = size_binop (MULT_EXPR,
2742 fold_convert (sizetype, DR_STEP (dr)),
2743 fold_convert (sizetype, length_factor));
2744
2745 if (vect_supportable_dr_alignment (dr, false)
2746 == dr_explicit_realign_optimized)
2747 {
2748 tree vector_size = TYPE_SIZE_UNIT
2749 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2750
2751 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2752 }
2753 return segment_length;
2754 }
2755
2756 /* Function vect_prune_runtime_alias_test_list.
2757
2758 Prune a list of ddrs to be tested at run-time by versioning for alias.
2759 Merge several alias checks into one if possible.
2760 Return FALSE if the resulting list of ddrs is longer than allowed by
2761 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2762
2763 bool
2764 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2765 {
2766 vec<ddr_p> may_alias_ddrs =
2767 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2768 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2769 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2770 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2771 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2772
2773 ddr_p ddr;
2774 unsigned int i;
2775 tree length_factor;
2776
2777 if (dump_enabled_p ())
2778 dump_printf_loc (MSG_NOTE, vect_location,
2779 "=== vect_prune_runtime_alias_test_list ===\n");
2780
2781 if (may_alias_ddrs.is_empty ())
2782 return true;
2783
2784 /* Basically, for each pair of dependent data refs store_ptr_0
2785 and load_ptr_0, we create an expression:
2786
2787 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2788 || ((load_ptr_0 + load_segment_length_0) <= store_ptr_0)
2789
2790 for aliasing checks. However, in some cases we can decrease
2791 the number of checks by combining two checks into one. For
2792 example, suppose we have another pair of data refs store_ptr_0
2793 and load_ptr_1, and if the following condition is satisfied:
2794
2795 load_ptr_0 < load_ptr_1 &&
2796 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2797
2798 (this condition means that, in each iteration of the vectorized loop,
2799 the accessed memory of store_ptr_0 cannot be between the memory
2800 of load_ptr_0 and load_ptr_1.)
2801
2802 we can then use only the following expression to finish the
2803 aliasing checks between store_ptr_0 & load_ptr_0 and
2804 store_ptr_0 & load_ptr_1:
2805
2806 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2807 || ((load_ptr_1 + load_segment_length_1) <= store_ptr_0)
2808
2809 Note that we only consider that load_ptr_0 and load_ptr_1 have the
2810 same base address. */
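/* Concrete instance of the merge above (illustrative numbers): if
   load_ptr_1 == load_ptr_0 + 16 and load_segment_length_0 == 16, then
   load_ptr_1 - load_ptr_0 - load_segment_length_0 == 0, which is smaller
   than any positive store_segment_length_0, so the two checks can be
   combined into the single expression shown above.  */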
2811
2812 comp_alias_ddrs.create (may_alias_ddrs.length ());
2813
2814 /* First, we collect all data ref pairs for aliasing checks. */
2815 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2816 {
2817 struct data_reference *dr_a, *dr_b;
2818 gimple dr_group_first_a, dr_group_first_b;
2819 tree segment_length_a, segment_length_b;
2820 gimple stmt_a, stmt_b;
2821
2822 dr_a = DDR_A (ddr);
2823 stmt_a = DR_STMT (DDR_A (ddr));
2824 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2825 if (dr_group_first_a)
2826 {
2827 stmt_a = dr_group_first_a;
2828 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2829 }
2830
2831 dr_b = DDR_B (ddr);
2832 stmt_b = DR_STMT (DDR_B (ddr));
2833 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2834 if (dr_group_first_b)
2835 {
2836 stmt_b = dr_group_first_b;
2837 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2838 }
2839
2840 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2841 length_factor = scalar_loop_iters;
2842 else
2843 length_factor = size_int (vect_factor);
2844 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2845 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2846
2847 dr_with_seg_len_pair_t dr_with_seg_len_pair
2848 (dr_with_seg_len (dr_a, segment_length_a),
2849 dr_with_seg_len (dr_b, segment_length_b));
2850
2851 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2852 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2853
2854 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2855 }
2856
2857 /* Second, we sort the collected data ref pairs so that we can scan
2858 them once to combine all possible aliasing checks. */
2859 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2860
2861 /* Third, we scan the sorted dr pairs and check if we can combine
2862 alias checks of two neighbouring dr pairs. */
2863 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2864 {
2865 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2866 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2867 *dr_b1 = &comp_alias_ddrs[i-1].second,
2868 *dr_a2 = &comp_alias_ddrs[i].first,
2869 *dr_b2 = &comp_alias_ddrs[i].second;
2870
2871 /* Remove duplicate data ref pairs. */
2872 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2873 {
2874 if (dump_enabled_p ())
2875 {
2876 dump_printf_loc (MSG_NOTE, vect_location,
2877 "found equal ranges ");
2878 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2879 DR_REF (dr_a1->dr));
2880 dump_printf (MSG_NOTE, ", ");
2881 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2882 DR_REF (dr_b1->dr));
2883 dump_printf (MSG_NOTE, " and ");
2884 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2885 DR_REF (dr_a2->dr));
2886 dump_printf (MSG_NOTE, ", ");
2887 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2888 DR_REF (dr_b2->dr));
2889 dump_printf (MSG_NOTE, "\n");
2890 }
2891
2892 comp_alias_ddrs.ordered_remove (i--);
2893 continue;
2894 }
2895
2896 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2897 {
2898 /* We consider the case that DR_B1 and DR_B2 are the same memrefs,
2899 and DR_A1 and DR_A2 are two consecutive memrefs. */
2900 if (*dr_a1 == *dr_a2)
2901 {
2902 std::swap (dr_a1, dr_b1);
2903 std::swap (dr_a2, dr_b2);
2904 }
2905
2906 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2907 DR_BASE_ADDRESS (dr_a2->dr),
2908 0)
2909 || !tree_fits_shwi_p (dr_a1->offset)
2910 || !tree_fits_shwi_p (dr_a2->offset))
2911 continue;
2912
2913 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2914 - tree_to_shwi (dr_a1->offset));
2915
2916
2917 /* Now we check if the following condition is satisfied:
2918
2919 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2920
2921 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2922 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2923 have to make a best estimation. We can get the minimum value
2924 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2925 then either of the following two conditions can guarantee the
2926 one above:
2927
2928 1: DIFF <= MIN_SEG_LEN_B
2929 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2930
2931 */
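/* Illustrative numbers for the conditions above (assumed values): with
   DR_A1->OFFSET == 0, DR_A2->OFFSET == 16 and MIN_SEG_LEN_B == 16,
   DIFF == 16 satisfies condition 1, so the two checks are merged below
   and DR_A1's segment length becomes DR_A2's segment length plus 16.  */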
2932
2933 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2934 ? tree_to_shwi (dr_b1->seg_len)
2935 : vect_factor);
2936
2937 if (diff <= min_seg_len_b
2938 || (tree_fits_shwi_p (dr_a1->seg_len)
2939 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2940 {
2941 if (dump_enabled_p ())
2942 {
2943 dump_printf_loc (MSG_NOTE, vect_location,
2944 "merging ranges for ");
2945 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2946 DR_REF (dr_a1->dr));
2947 dump_printf (MSG_NOTE, ", ");
2948 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2949 DR_REF (dr_b1->dr));
2950 dump_printf (MSG_NOTE, " and ");
2951 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2952 DR_REF (dr_a2->dr));
2953 dump_printf (MSG_NOTE, ", ");
2954 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2955 DR_REF (dr_b2->dr));
2956 dump_printf (MSG_NOTE, "\n");
2957 }
2958
2959 dr_a1->seg_len = size_binop (PLUS_EXPR,
2960 dr_a2->seg_len, size_int (diff));
2961 comp_alias_ddrs.ordered_remove (i--);
2962 }
2963 }
2964 }
2965
2966 dump_printf_loc (MSG_NOTE, vect_location,
2967 "improved number of alias checks from %d to %d\n",
2968 may_alias_ddrs.length (), comp_alias_ddrs.length ());
2969 if ((int) comp_alias_ddrs.length () >
2970 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2971 return false;
2972
2973 return true;
2974 }
2975
2976 /* Check whether a non-affine read in stmt is suitable for gather load
2977 and if so, return a builtin decl for that operation. */
2978
2979 tree
2980 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2981 tree *offp, int *scalep)
2982 {
2983 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2984 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2985 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2986 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2987 tree offtype = NULL_TREE;
2988 tree decl, base, off;
2989 machine_mode pmode;
2990 int punsignedp, pvolatilep;
2991
2992 base = DR_REF (dr);
2993 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2994 see if we can use the def stmt of the address. */
2995 if (is_gimple_call (stmt)
2996 && gimple_call_internal_p (stmt)
2997 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2998 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2999 && TREE_CODE (base) == MEM_REF
3000 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3001 && integer_zerop (TREE_OPERAND (base, 1))
3002 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3003 {
3004 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3005 if (is_gimple_assign (def_stmt)
3006 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3007 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3008 }
3009
3010 /* The gather builtins need address of the form
3011 loop_invariant + vector * {1, 2, 4, 8}
3012 or
3013 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3014 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3015 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3016 multiplications and additions in it. To get a vector, we need
3017 a single SSA_NAME that will be defined in the loop and will
3018 contain everything that is not loop invariant and that can be
3019 vectorized. The following code attempts to find such a preexisting
3020 SSA_NAME OFF and put the loop invariants into a tree BASE
3021 that can be gimplified before the loop. */
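/* Illustrative decomposition (a sketch; the names p and idx are assumed
   for exposition): for a loop-invariant pointer p and an index idx defined
   in the loop, a read from *(p + (sizetype) idx * 4) would be split by the
   code below into BASE == p, OFF == idx and SCALE == 4, matching the
   gather address form above.  */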
3022 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3023 &pmode, &punsignedp, &pvolatilep, false);
3024 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3025
3026 if (TREE_CODE (base) == MEM_REF)
3027 {
3028 if (!integer_zerop (TREE_OPERAND (base, 1)))
3029 {
3030 if (off == NULL_TREE)
3031 {
3032 offset_int moff = mem_ref_offset (base);
3033 off = wide_int_to_tree (sizetype, moff);
3034 }
3035 else
3036 off = size_binop (PLUS_EXPR, off,
3037 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3038 }
3039 base = TREE_OPERAND (base, 0);
3040 }
3041 else
3042 base = build_fold_addr_expr (base);
3043
3044 if (off == NULL_TREE)
3045 off = size_zero_node;
3046
3047 /* If base is not loop invariant, then if off is 0 we start with just
3048 the constant offset in the loop invariant BASE and continue with base
3049 as OFF; otherwise give up.
3050 We could handle that case by gimplifying the addition of base + off
3051 into some SSA_NAME and use that as off, but for now punt. */
3052 if (!expr_invariant_in_loop_p (loop, base))
3053 {
3054 if (!integer_zerop (off))
3055 return NULL_TREE;
3056 off = base;
3057 base = size_int (pbitpos / BITS_PER_UNIT);
3058 }
3059 /* Otherwise put base + constant offset into the loop invariant BASE
3060 and continue with OFF. */
3061 else
3062 {
3063 base = fold_convert (sizetype, base);
3064 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3065 }
3066
3067 /* OFF at this point may be either a SSA_NAME or some tree expression
3068 from get_inner_reference. Try to peel off loop invariants from it
3069 into BASE as long as possible. */
3070 STRIP_NOPS (off);
3071 while (offtype == NULL_TREE)
3072 {
3073 enum tree_code code;
3074 tree op0, op1, add = NULL_TREE;
3075
3076 if (TREE_CODE (off) == SSA_NAME)
3077 {
3078 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3079
3080 if (expr_invariant_in_loop_p (loop, off))
3081 return NULL_TREE;
3082
3083 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3084 break;
3085
3086 op0 = gimple_assign_rhs1 (def_stmt);
3087 code = gimple_assign_rhs_code (def_stmt);
3088 op1 = gimple_assign_rhs2 (def_stmt);
3089 }
3090 else
3091 {
3092 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3093 return NULL_TREE;
3094 code = TREE_CODE (off);
3095 extract_ops_from_tree (off, &code, &op0, &op1);
3096 }
3097 switch (code)
3098 {
3099 case POINTER_PLUS_EXPR:
3100 case PLUS_EXPR:
3101 if (expr_invariant_in_loop_p (loop, op0))
3102 {
3103 add = op0;
3104 off = op1;
3105 do_add:
3106 add = fold_convert (sizetype, add);
3107 if (scale != 1)
3108 add = size_binop (MULT_EXPR, add, size_int (scale));
3109 base = size_binop (PLUS_EXPR, base, add);
3110 continue;
3111 }
3112 if (expr_invariant_in_loop_p (loop, op1))
3113 {
3114 add = op1;
3115 off = op0;
3116 goto do_add;
3117 }
3118 break;
3119 case MINUS_EXPR:
3120 if (expr_invariant_in_loop_p (loop, op1))
3121 {
3122 add = fold_convert (sizetype, op1);
3123 add = size_binop (MINUS_EXPR, size_zero_node, add);
3124 off = op0;
3125 goto do_add;
3126 }
3127 break;
3128 case MULT_EXPR:
3129 if (scale == 1 && tree_fits_shwi_p (op1))
3130 {
3131 scale = tree_to_shwi (op1);
3132 off = op0;
3133 continue;
3134 }
3135 break;
3136 case SSA_NAME:
3137 off = op0;
3138 continue;
3139 CASE_CONVERT:
3140 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3141 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3142 break;
3143 if (TYPE_PRECISION (TREE_TYPE (op0))
3144 == TYPE_PRECISION (TREE_TYPE (off)))
3145 {
3146 off = op0;
3147 continue;
3148 }
3149 if (TYPE_PRECISION (TREE_TYPE (op0))
3150 < TYPE_PRECISION (TREE_TYPE (off)))
3151 {
3152 off = op0;
3153 offtype = TREE_TYPE (off);
3154 STRIP_NOPS (off);
3155 continue;
3156 }
3157 break;
3158 default:
3159 break;
3160 }
3161 break;
3162 }
3163
3164 /* If at the end OFF still isn't a SSA_NAME or isn't
3165 defined in the loop, punt. */
3166 if (TREE_CODE (off) != SSA_NAME
3167 || expr_invariant_in_loop_p (loop, off))
3168 return NULL_TREE;
3169
3170 if (offtype == NULL_TREE)
3171 offtype = TREE_TYPE (off);
3172
3173 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3174 offtype, scale);
3175 if (decl == NULL_TREE)
3176 return NULL_TREE;
3177
3178 if (basep)
3179 *basep = base;
3180 if (offp)
3181 *offp = off;
3182 if (scalep)
3183 *scalep = scale;
3184 return decl;
3185 }
3186
3187 /* Function vect_analyze_data_refs.
3188
3189 Find all the data references in the loop or basic block.
3190
3191 The general structure of the analysis of data refs in the vectorizer is as
3192 follows:
3193 1- vect_analyze_data_refs(loop/bb): call
3194 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3195 in the loop/bb and their dependences.
3196 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3197 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3198 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3199
3200 */
3201
3202 bool
3203 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3204 bb_vec_info bb_vinfo,
3205 int *min_vf, unsigned *n_stmts)
3206 {
3207 struct loop *loop = NULL;
3208 basic_block bb = NULL;
3209 unsigned int i;
3210 vec<data_reference_p> datarefs;
3211 struct data_reference *dr;
3212 tree scalar_type;
3213
3214 if (dump_enabled_p ())
3215 dump_printf_loc (MSG_NOTE, vect_location,
3216 "=== vect_analyze_data_refs ===\n");
3217
3218 if (loop_vinfo)
3219 {
3220 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3221
3222 loop = LOOP_VINFO_LOOP (loop_vinfo);
3223 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3224 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3225 {
3226 if (dump_enabled_p ())
3227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3228 "not vectorized: loop contains function calls"
3229 " or data references that cannot be analyzed\n");
3230 return false;
3231 }
3232
3233 for (i = 0; i < loop->num_nodes; i++)
3234 {
3235 gimple_stmt_iterator gsi;
3236
3237 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3238 {
3239 gimple stmt = gsi_stmt (gsi);
3240 if (is_gimple_debug (stmt))
3241 continue;
3242 ++*n_stmts;
3243 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3244 {
3245 if (is_gimple_call (stmt) && loop->safelen)
3246 {
3247 tree fndecl = gimple_call_fndecl (stmt), op;
3248 if (fndecl != NULL_TREE)
3249 {
3250 struct cgraph_node *node = cgraph_node::get (fndecl);
3251 if (node != NULL && node->simd_clones != NULL)
3252 {
3253 unsigned int j, n = gimple_call_num_args (stmt);
3254 for (j = 0; j < n; j++)
3255 {
3256 op = gimple_call_arg (stmt, j);
3257 if (DECL_P (op)
3258 || (REFERENCE_CLASS_P (op)
3259 && get_base_address (op)))
3260 break;
3261 }
3262 op = gimple_call_lhs (stmt);
3263 /* Ignore #pragma omp declare simd functions
3264 if they don't have data references in the
3265 call stmt itself. */
3266 if (j == n
3267 && !(op
3268 && (DECL_P (op)
3269 || (REFERENCE_CLASS_P (op)
3270 && get_base_address (op)))))
3271 continue;
3272 }
3273 }
3274 }
3275 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3276 if (dump_enabled_p ())
3277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3278 "not vectorized: loop contains function "
3279 "calls or data references that cannot "
3280 "be analyzed\n");
3281 return false;
3282 }
3283 }
3284 }
3285
3286 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3287 }
3288 else
3289 {
3290 gimple_stmt_iterator gsi;
3291
3292 bb = BB_VINFO_BB (bb_vinfo);
3293 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3294 {
3295 gimple stmt = gsi_stmt (gsi);
3296 if (is_gimple_debug (stmt))
3297 continue;
3298 ++*n_stmts;
3299 if (!find_data_references_in_stmt (NULL, stmt,
3300 &BB_VINFO_DATAREFS (bb_vinfo)))
3301 {
3302 /* Mark the rest of the basic-block as unvectorizable. */
3303 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3304 {
3305 stmt = gsi_stmt (gsi);
3306 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3307 }
3308 break;
3309 }
3310 }
3311
3312 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3313 }
3314
3315 /* Go through the data-refs, check that the analysis succeeded. Update
3316 pointer from stmt_vec_info struct to DR and vectype. */
3317
3318 FOR_EACH_VEC_ELT (datarefs, i, dr)
3319 {
3320 gimple stmt;
3321 stmt_vec_info stmt_info;
3322 tree base, offset, init;
3323 bool gather = false;
3324 bool simd_lane_access = false;
3325 int vf;
3326
3327 again:
3328 if (!dr || !DR_REF (dr))
3329 {
3330 if (dump_enabled_p ())
3331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3332 "not vectorized: unhandled data-ref\n");
3333 return false;
3334 }
3335
3336 stmt = DR_STMT (dr);
3337 stmt_info = vinfo_for_stmt (stmt);
3338
3339 /* Discard clobbers from the dataref vector. We will remove
3340 clobber stmts during vectorization. */
3341 if (gimple_clobber_p (stmt))
3342 {
3343 free_data_ref (dr);
3344 if (i == datarefs.length () - 1)
3345 {
3346 datarefs.pop ();
3347 break;
3348 }
3349 datarefs.ordered_remove (i);
3350 dr = datarefs[i];
3351 goto again;
3352 }
3353
3354 /* Check that analysis of the data-ref succeeded. */
3355 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3356 || !DR_STEP (dr))
3357 {
3358 bool maybe_gather
3359 = DR_IS_READ (dr)
3360 && !TREE_THIS_VOLATILE (DR_REF (dr))
3361 && targetm.vectorize.builtin_gather != NULL;
3362 bool maybe_simd_lane_access
3363 = loop_vinfo && loop->simduid;
3364
3365 /* If the target supports vector gather loads, or if this might be
3366 a SIMD lane access, see if we can use them. */
3367 if (loop_vinfo
3368 && (maybe_gather || maybe_simd_lane_access)
3369 && !nested_in_vect_loop_p (loop, stmt))
3370 {
3371 struct data_reference *newdr
3372 = create_data_ref (NULL, loop_containing_stmt (stmt),
3373 DR_REF (dr), stmt, true);
3374 gcc_assert (newdr != NULL && DR_REF (newdr));
3375 if (DR_BASE_ADDRESS (newdr)
3376 && DR_OFFSET (newdr)
3377 && DR_INIT (newdr)
3378 && DR_STEP (newdr)
3379 && integer_zerop (DR_STEP (newdr)))
3380 {
3381 if (maybe_simd_lane_access)
3382 {
3383 tree off = DR_OFFSET (newdr);
3384 STRIP_NOPS (off);
3385 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3386 && TREE_CODE (off) == MULT_EXPR
3387 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3388 {
3389 tree step = TREE_OPERAND (off, 1);
3390 off = TREE_OPERAND (off, 0);
3391 STRIP_NOPS (off);
3392 if (CONVERT_EXPR_P (off)
3393 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3394 0)))
3395 < TYPE_PRECISION (TREE_TYPE (off)))
3396 off = TREE_OPERAND (off, 0);
3397 if (TREE_CODE (off) == SSA_NAME)
3398 {
3399 gimple def = SSA_NAME_DEF_STMT (off);
3400 tree reft = TREE_TYPE (DR_REF (newdr));
3401 if (is_gimple_call (def)
3402 && gimple_call_internal_p (def)
3403 && (gimple_call_internal_fn (def)
3404 == IFN_GOMP_SIMD_LANE))
3405 {
3406 tree arg = gimple_call_arg (def, 0);
3407 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3408 arg = SSA_NAME_VAR (arg);
3409 if (arg == loop->simduid
3410 /* For now. */
3411 && tree_int_cst_equal
3412 (TYPE_SIZE_UNIT (reft),
3413 step))
3414 {
3415 DR_OFFSET (newdr) = ssize_int (0);
3416 DR_STEP (newdr) = step;
3417 DR_ALIGNED_TO (newdr)
3418 = size_int (BIGGEST_ALIGNMENT);
3419 dr = newdr;
3420 simd_lane_access = true;
3421 }
3422 }
3423 }
3424 }
3425 }
3426 if (!simd_lane_access && maybe_gather)
3427 {
3428 dr = newdr;
3429 gather = true;
3430 }
3431 }
3432 if (!gather && !simd_lane_access)
3433 free_data_ref (newdr);
3434 }
3435
3436 if (!gather && !simd_lane_access)
3437 {
3438 if (dump_enabled_p ())
3439 {
3440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3441 "not vectorized: data ref analysis "
3442 "failed ");
3443 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3444 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3445 }
3446
3447 if (bb_vinfo)
3448 break;
3449
3450 return false;
3451 }
3452 }
3453
3454 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3455 {
3456 if (dump_enabled_p ())
3457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3458 "not vectorized: base addr of dr is a "
3459 "constant\n");
3460
3461 if (bb_vinfo)
3462 break;
3463
3464 if (gather || simd_lane_access)
3465 free_data_ref (dr);
3466 return false;
3467 }
3468
3469 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3470 {
3471 if (dump_enabled_p ())
3472 {
3473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3474 "not vectorized: volatile type ");
3475 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3476 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3477 }
3478
3479 if (bb_vinfo)
3480 break;
3481
3482 return false;
3483 }
3484
3485 if (stmt_can_throw_internal (stmt))
3486 {
3487 if (dump_enabled_p ())
3488 {
3489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3490 "not vectorized: statement can throw an "
3491 "exception ");
3492 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3493 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3494 }
3495
3496 if (bb_vinfo)
3497 break;
3498
3499 if (gather || simd_lane_access)
3500 free_data_ref (dr);
3501 return false;
3502 }
3503
3504 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3505 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3506 {
3507 if (dump_enabled_p ())
3508 {
3509 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3510 "not vectorized: statement is bitfield "
3511 "access ");
3512 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3513 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3514 }
3515
3516 if (bb_vinfo)
3517 break;
3518
3519 if (gather || simd_lane_access)
3520 free_data_ref (dr);
3521 return false;
3522 }
3523
3524 base = unshare_expr (DR_BASE_ADDRESS (dr));
3525 offset = unshare_expr (DR_OFFSET (dr));
3526 init = unshare_expr (DR_INIT (dr));
3527
3528 if (is_gimple_call (stmt)
3529 && (!gimple_call_internal_p (stmt)
3530 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3531 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3532 {
3533 if (dump_enabled_p ())
3534 {
3535 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3536 "not vectorized: dr in a call ");
3537 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3538 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3539 }
3540
3541 if (bb_vinfo)
3542 break;
3543
3544 if (gather || simd_lane_access)
3545 free_data_ref (dr);
3546 return false;
3547 }
3548
3549 /* Update DR field in stmt_vec_info struct. */
3550
3551 /* If the dataref is in an inner-loop of the loop that is considered
3552 for vectorization, we also want to analyze the access relative to
3553 the outer-loop (DR contains information only relative to the
3554 inner-most enclosing loop). We do that by building a reference to the
3555 first location accessed by the inner-loop, and analyze it relative to
3556 the outer-loop. */
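 /* For example (a sketch): for a 'short' access in[i+j] inside the
 j-loop of an i/j loop nest, DR_* describes the access relative to
 the j-loop (step 2 bytes per j iteration), while the STMT_VINFO_DR_*
 fields computed below describe the first location &in[i] relative to
 the i-loop: outer base &in, outer init 0 and outer step 2 bytes per
 i iteration.  */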
3557 if (loop && nested_in_vect_loop_p (loop, stmt))
3558 {
3559 tree outer_step, outer_base, outer_init;
3560 HOST_WIDE_INT pbitsize, pbitpos;
3561 tree poffset;
3562 machine_mode pmode;
3563 int punsignedp, pvolatilep;
3564 affine_iv base_iv, offset_iv;
3565 tree dinit;
3566
3567 /* Build a reference to the first location accessed by the
3568 inner-loop: *(BASE+INIT). (The first location is actually
3569 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3570 tree inner_base = build_fold_indirect_ref
3571 (fold_build_pointer_plus (base, init));
3572
3573 if (dump_enabled_p ())
3574 {
3575 dump_printf_loc (MSG_NOTE, vect_location,
3576 "analyze in outer-loop: ");
3577 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3578 dump_printf (MSG_NOTE, "\n");
3579 }
3580
3581 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3582 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3583 gcc_assert (outer_base != NULL_TREE);
3584
3585 if (pbitpos % BITS_PER_UNIT != 0)
3586 {
3587 if (dump_enabled_p ())
3588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3589 "failed: bit offset alignment.\n");
3590 return false;
3591 }
3592
3593 outer_base = build_fold_addr_expr (outer_base);
3594 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3595 &base_iv, false))
3596 {
3597 if (dump_enabled_p ())
3598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3599 "failed: evolution of base is not affine.\n");
3600 return false;
3601 }
3602
3603 if (offset)
3604 {
3605 if (poffset)
3606 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3607 poffset);
3608 else
3609 poffset = offset;
3610 }
3611
3612 if (!poffset)
3613 {
3614 offset_iv.base = ssize_int (0);
3615 offset_iv.step = ssize_int (0);
3616 }
3617 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3618 &offset_iv, false))
3619 {
3620 if (dump_enabled_p ())
3621 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3622 "evolution of offset is not affine.\n");
3623 return false;
3624 }
3625
3626 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3627 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3628 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3629 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3630 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3631
3632 outer_step = size_binop (PLUS_EXPR,
3633 fold_convert (ssizetype, base_iv.step),
3634 fold_convert (ssizetype, offset_iv.step));
3635
3636 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3637 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3638 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3639 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3640 STMT_VINFO_DR_OFFSET (stmt_info) =
3641 fold_convert (ssizetype, offset_iv.base);
3642 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3643 size_int (highest_pow2_factor (offset_iv.base));
3644
3645 if (dump_enabled_p ())
3646 {
3647 dump_printf_loc (MSG_NOTE, vect_location,
3648 "\touter base_address: ");
3649 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3650 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3651 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3652 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3653 STMT_VINFO_DR_OFFSET (stmt_info));
3654 dump_printf (MSG_NOTE,
3655 "\n\touter constant offset from base address: ");
3656 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3657 STMT_VINFO_DR_INIT (stmt_info));
3658 dump_printf (MSG_NOTE, "\n\touter step: ");
3659 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3660 STMT_VINFO_DR_STEP (stmt_info));
3661 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3662 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3663 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3664 dump_printf (MSG_NOTE, "\n");
3665 }
3666 }
3667
3668 if (STMT_VINFO_DATA_REF (stmt_info))
3669 {
3670 if (dump_enabled_p ())
3671 {
3672 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3673 "not vectorized: more than one data ref "
3674 "in stmt: ");
3675 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3676 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3677 }
3678
3679 if (bb_vinfo)
3680 break;
3681
3682 if (gather || simd_lane_access)
3683 free_data_ref (dr);
3684 return false;
3685 }
3686
3687 STMT_VINFO_DATA_REF (stmt_info) = dr;
3688 if (simd_lane_access)
3689 {
3690 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3691 free_data_ref (datarefs[i]);
3692 datarefs[i] = dr;
3693 }
3694
3695 /* Set vectype for STMT. */
3696 scalar_type = TREE_TYPE (DR_REF (dr));
3697 STMT_VINFO_VECTYPE (stmt_info)
3698 = get_vectype_for_scalar_type (scalar_type);
3699 if (!STMT_VINFO_VECTYPE (stmt_info))
3700 {
3701 if (dump_enabled_p ())
3702 {
3703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3704 "not vectorized: no vectype for stmt: ");
3705 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3706 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3707 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3708 scalar_type);
3709 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3710 }
3711
3712 if (bb_vinfo)
3713 break;
3714
3715 if (gather || simd_lane_access)
3716 {
3717 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3718 if (gather)
3719 free_data_ref (dr);
3720 }
3721 return false;
3722 }
3723 else
3724 {
3725 if (dump_enabled_p ())
3726 {
3727 dump_printf_loc (MSG_NOTE, vect_location,
3728 "got vectype for stmt: ");
3729 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3730 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3731 STMT_VINFO_VECTYPE (stmt_info));
3732 dump_printf (MSG_NOTE, "\n");
3733 }
3734 }
3735
3736 /* Adjust the minimal vectorization factor according to the
3737 vector type. */
3738 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3739 if (vf > *min_vf)
3740 *min_vf = vf;
3741
3742 if (gather)
3743 {
3744 tree off;
3745
3746 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3747 if (gather
3748 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3749 gather = false;
3750 if (!gather)
3751 {
3752 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3753 free_data_ref (dr);
3754 if (dump_enabled_p ())
3755 {
3756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3757 "not vectorized: not suitable for gather "
3758 "load ");
3759 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3760 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3761 }
3762 return false;
3763 }
3764
3765 datarefs[i] = dr;
3766 STMT_VINFO_GATHER_P (stmt_info) = true;
3767 }
3768 else if (loop_vinfo
3769 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3770 {
3771 if (nested_in_vect_loop_p (loop, stmt)
3772 || !DR_IS_READ (dr))
3773 {
3774 if (dump_enabled_p ())
3775 {
3776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3777 "not vectorized: not suitable for strided "
3778 "load ");
3779 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3780 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3781 }
3782 return false;
3783 }
3784 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3785 }
3786 }
3787
3788 /* If we stopped the analysis at the first dataref we could not analyze
3789 when trying to vectorize a basic-block, mark the rest of the datarefs
3790 as not vectorizable and truncate the vector of datarefs. That
3791 avoids spending useless time analyzing their dependences. */
3792 if (i != datarefs.length ())
3793 {
3794 gcc_assert (bb_vinfo != NULL);
3795 for (unsigned j = i; j < datarefs.length (); ++j)
3796 {
3797 data_reference_p dr = datarefs[j];
3798 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3799 free_data_ref (dr);
3800 }
3801 datarefs.truncate (i);
3802 }
3803
3804 return true;
3805 }
3806
3807
3808 /* Function vect_get_new_vect_var.
3809
3810 Returns a name for a new variable. The current naming scheme uses the
3811 prefix "vect", "stmp" or "vectp" (depending on the value of VAR_KIND)
3812 for vectorizer generated variables, and appends "_" and NAME to it if
3813 provided. */
3814
3815 tree
3816 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3817 {
3818 const char *prefix;
3819 tree new_vect_var;
3820
3821 switch (var_kind)
3822 {
3823 case vect_simple_var:
3824 prefix = "vect";
3825 break;
3826 case vect_scalar_var:
3827 prefix = "stmp";
3828 break;
3829 case vect_pointer_var:
3830 prefix = "vectp";
3831 break;
3832 default:
3833 gcc_unreachable ();
3834 }
3835
3836 if (name)
3837 {
3838 char* tmp = concat (prefix, "_", name, NULL);
3839 new_vect_var = create_tmp_reg (type, tmp);
3840 free (tmp);
3841 }
3842 else
3843 new_vect_var = create_tmp_reg (type, prefix);
3844
3845 return new_vect_var;
3846 }
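 /* For example, vect_get_new_vect_var (ptr_type, vect_pointer_var, "in")
 creates a temporary whose name is based on "vectp_in", while a NULL
 NAME yields a name based on the bare prefix.  */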
3847
3848 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3849
3850 static void
3851 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3852 stmt_vec_info stmt_info)
3853 {
3854 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3855 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3856 int misalign = DR_MISALIGNMENT (dr);
3857 if (misalign == -1)
3858 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3859 else
3860 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3861 }
3862
3863 /* Function vect_create_addr_base_for_vector_ref.
3864
3865 Create an expression that computes the address of the first memory location
3866 that will be accessed for a data reference.
3867
3868 Input:
3869 STMT: The statement containing the data reference.
3870 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3871 OFFSET: Optional. If supplied, it will be added to the initial address.
3872 LOOP: Specify relative to which loop-nest the address should be computed.
3873 For example, when the dataref is in an inner-loop nested in an
3874 outer-loop that is now being vectorized, LOOP can be either the
3875 outer-loop, or the inner-loop. The first memory location accessed
3876 by the following dataref ('in' points to short):
3877
3878 for (i=0; i<N; i++)
3879 for (j=0; j<M; j++)
3880 s += in[i+j]
3881
3882 is as follows:
3883 if LOOP=i_loop: &in (relative to i_loop)
3884 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3885 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3886 initial address. Unlike OFFSET, which is a number of elements to
3887 be added, BYTE_OFFSET is measured in bytes.
3888
3889 Output:
3890 1. Return an SSA_NAME whose value is the address of the memory location of
3891 the first vector of the data reference.
3892 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3893 these statement(s) which define the returned SSA_NAME.
3894
3895 FORNOW: We are only handling array accesses with step 1. */
3896
3897 tree
3898 vect_create_addr_base_for_vector_ref (gimple stmt,
3899 gimple_seq *new_stmt_list,
3900 tree offset,
3901 struct loop *loop,
3902 tree byte_offset)
3903 {
3904 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3905 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3906 tree data_ref_base;
3907 const char *base_name;
3908 tree addr_base;
3909 tree dest;
3910 gimple_seq seq = NULL;
3911 tree base_offset;
3912 tree init;
3913 tree vect_ptr_type;
3914 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3915 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3916
3917 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3918 {
3919 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3920
3921 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3922
3923 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3924 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3925 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3926 }
3927 else
3928 {
3929 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3930 base_offset = unshare_expr (DR_OFFSET (dr));
3931 init = unshare_expr (DR_INIT (dr));
3932 }
3933
3934 if (loop_vinfo)
3935 base_name = get_name (data_ref_base);
3936 else
3937 {
3938 base_offset = ssize_int (0);
3939 init = ssize_int (0);
3940 base_name = get_name (DR_REF (dr));
3941 }
3942
3943 /* Create base_offset */
3944 base_offset = size_binop (PLUS_EXPR,
3945 fold_convert (sizetype, base_offset),
3946 fold_convert (sizetype, init));
3947
3948 if (offset)
3949 {
3950 offset = fold_build2 (MULT_EXPR, sizetype,
3951 fold_convert (sizetype, offset), step);
3952 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3953 base_offset, offset);
3954 }
3955 if (byte_offset)
3956 {
3957 byte_offset = fold_convert (sizetype, byte_offset);
3958 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3959 base_offset, byte_offset);
3960 }
3961
3962 /* base + base_offset */
3963 if (loop_vinfo)
3964 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3965 else
3966 {
3967 addr_base = build1 (ADDR_EXPR,
3968 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3969 unshare_expr (DR_REF (dr)));
3970 }
3971
3972 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3973 addr_base = fold_convert (vect_ptr_type, addr_base);
3974 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3975 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3976 gimple_seq_add_seq (new_stmt_list, seq);
3977
3978 if (DR_PTR_INFO (dr)
3979 && TREE_CODE (addr_base) == SSA_NAME)
3980 {
3981 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
3982 if (offset || byte_offset)
3983 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3984 }
3985
3986 if (dump_enabled_p ())
3987 {
3988 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3989 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3990 dump_printf (MSG_NOTE, "\n");
3991 }
3992
3993 return addr_base;
3994 }
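 /* For example (a sketch): for an 'int' data-ref with DR_BASE_ADDRESS &a,
 DR_OFFSET 0, DR_INIT 16 and OFFSET 2, the address built above is

   (vectype *) (&a + (16 + 2 * 4))

 i.e. &a[6], forced into an SSA name with the defining statements
 appended to NEW_STMT_LIST.  */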
3995
3996
3997 /* Function vect_create_data_ref_ptr.
3998
3999 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4000 location accessed in the loop by STMT, along with the def-use update
4001 chain to appropriately advance the pointer through the loop iterations.
4002 Also set aliasing information for the pointer. This pointer is used by
4003 the callers to this function to create a memory reference expression for
4004 vector load/store access.
4005
4006 Input:
4007 1. STMT: a stmt that references memory. Expected to be of the form
4008 GIMPLE_ASSIGN <name, data-ref> or
4009 GIMPLE_ASSIGN <data-ref, name>.
4010 2. AGGR_TYPE: the type of the reference, which should be either a vector
4011 or an array.
4012 3. AT_LOOP: the loop where the vector memref is to be created.
4013 4. OFFSET (optional): an offset to be added to the initial address accessed
4014 by the data-ref in STMT.
4015 5. BSI: location where the new stmts are to be placed if there is no loop
4016 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4017 pointing to the initial address.
4018 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4019 to the initial address accessed by the data-ref in STMT. This is
4020 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4021 in bytes.
4022
4023 Output:
4024 1. Declare a new ptr to vector_type, and have it point to the base of the
4025 data reference (initial address accessed by the data reference).
4026 For example, for vector of type V8HI, the following code is generated:
4027
4028 v8hi *ap;
4029 ap = (v8hi *)initial_address;
4030
4031 if OFFSET is not supplied:
4032 initial_address = &a[init];
4033 if OFFSET is supplied:
4034 initial_address = &a[init + OFFSET];
4035 if BYTE_OFFSET is supplied:
4036 initial_address = &a[init] + BYTE_OFFSET;
4037
4038 Return the initial_address in INITIAL_ADDRESS.
4039
4040 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4041 update the pointer in each iteration of the loop.
4042
4043 Return the increment stmt that updates the pointer in PTR_INCR.
4044
4045 3. Set INV_P to true if the access pattern of the data reference in the
4046 vectorized loop is invariant. Set it to false otherwise.
4047
4048 4. Return the pointer. */
4049
4050 tree
4051 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4052 tree offset, tree *initial_address,
4053 gimple_stmt_iterator *gsi, gimple *ptr_incr,
4054 bool only_init, bool *inv_p, tree byte_offset)
4055 {
4056 const char *base_name;
4057 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4058 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4059 struct loop *loop = NULL;
4060 bool nested_in_vect_loop = false;
4061 struct loop *containing_loop = NULL;
4062 tree aggr_ptr_type;
4063 tree aggr_ptr;
4064 tree new_temp;
4065 gimple vec_stmt;
4066 gimple_seq new_stmt_list = NULL;
4067 edge pe = NULL;
4068 basic_block new_bb;
4069 tree aggr_ptr_init;
4070 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4071 tree aptr;
4072 gimple_stmt_iterator incr_gsi;
4073 bool insert_after;
4074 tree indx_before_incr, indx_after_incr;
4075 gimple incr;
4076 tree step;
4077 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4078
4079 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4080 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4081
4082 if (loop_vinfo)
4083 {
4084 loop = LOOP_VINFO_LOOP (loop_vinfo);
4085 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4086 containing_loop = (gimple_bb (stmt))->loop_father;
4087 pe = loop_preheader_edge (loop);
4088 }
4089 else
4090 {
4091 gcc_assert (bb_vinfo);
4092 only_init = true;
4093 *ptr_incr = NULL;
4094 }
4095
4096 /* Check the step (evolution) of the load in LOOP, and record
4097 whether it's invariant. */
4098 if (nested_in_vect_loop)
4099 step = STMT_VINFO_DR_STEP (stmt_info);
4100 else
4101 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4102
4103 if (integer_zerop (step))
4104 *inv_p = true;
4105 else
4106 *inv_p = false;
4107
4108 /* Create an expression for the first address accessed by this load
4109 in LOOP. */
4110 base_name = get_name (DR_BASE_ADDRESS (dr));
4111
4112 if (dump_enabled_p ())
4113 {
4114 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4115 dump_printf_loc (MSG_NOTE, vect_location,
4116 "create %s-pointer variable to type: ",
4117 get_tree_code_name (TREE_CODE (aggr_type)));
4118 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4119 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4120 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4121 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4122 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4123 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4124 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4125 else
4126 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4127 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4128 dump_printf (MSG_NOTE, "\n");
4129 }
4130
4131 /* (1) Create the new aggregate-pointer variable.
4132 Vector and array types inherit the alias set of their component
4133 type by default, so we need to use a ref-all pointer if the data
4134 reference does not conflict with the created aggregate data
4135 reference because it is not addressable. */
4136 bool need_ref_all = false;
4137 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4138 get_alias_set (DR_REF (dr))))
4139 need_ref_all = true;
4140 /* Likewise for any of the data references in the stmt group. */
4141 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4142 {
4143 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4144 do
4145 {
4146 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4147 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4148 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4149 get_alias_set (DR_REF (sdr))))
4150 {
4151 need_ref_all = true;
4152 break;
4153 }
4154 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4155 }
4156 while (orig_stmt);
4157 }
4158 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4159 need_ref_all);
4160 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4161
4162
4163 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4164 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4165 def-use update cycles for the pointer: one relative to the outer-loop
4166 (LOOP), which is what steps (3) and (4) below do. The other is relative
4167 to the inner-loop (which is the inner-most loop containing the dataref),
4168 and this is done by step (5) below.
4169
4170 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4171 inner-most loop, and so steps (3),(4) work the same, and step (5) is
4172 redundant. Steps (3),(4) create the following:
4173
4174 vp0 = &base_addr;
4175 LOOP: vp1 = phi(vp0,vp2)
4176 ...
4177 ...
4178 vp2 = vp1 + step
4179 goto LOOP
4180
4181 If there is an inner-loop nested in loop, then step (5) will also be
4182 applied, and an additional update in the inner-loop will be created:
4183
4184 vp0 = &base_addr;
4185 LOOP: vp1 = phi(vp0,vp2)
4186 ...
4187 inner: vp3 = phi(vp1,vp4)
4188 vp4 = vp3 + inner_step
4189 if () goto inner
4190 ...
4191 vp2 = vp1 + step
4192 if () goto LOOP */
4193
4194 /* (2) Calculate the initial address of the aggregate-pointer, and set
4195 the aggregate-pointer to point to it before the loop. */
4196
4197 /* Create: &(base[init_val+offset]) + byte_offset in the loop preheader. */
4198
4199 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4200 offset, loop, byte_offset);
4201 if (new_stmt_list)
4202 {
4203 if (pe)
4204 {
4205 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4206 gcc_assert (!new_bb);
4207 }
4208 else
4209 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4210 }
4211
4212 *initial_address = new_temp;
4213
4214 /* Create: p = (aggr_type *) initial_base */
4215 if (TREE_CODE (new_temp) != SSA_NAME
4216 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
4217 {
4218 vec_stmt = gimple_build_assign (aggr_ptr,
4219 fold_convert (aggr_ptr_type, new_temp));
4220 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
4221 /* Copy the points-to information if it exists. */
4222 if (DR_PTR_INFO (dr))
4223 vect_duplicate_ssa_name_ptr_info (aggr_ptr_init, dr, stmt_info);
4224 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
4225 if (pe)
4226 {
4227 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
4228 gcc_assert (!new_bb);
4229 }
4230 else
4231 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
4232 }
4233 else
4234 aggr_ptr_init = new_temp;
4235
4236 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4237 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4238 inner-loop nested in LOOP (during outer-loop vectorization). */
4239
4240 /* No update in loop is required. */
4241 if (only_init && (!loop_vinfo || at_loop == loop))
4242 aptr = aggr_ptr_init;
4243 else
4244 {
4245 /* The step of the aggregate pointer is the type size. */
4246 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4247 /* One exception to the above is when the scalar step of the load in
4248 LOOP is zero. In this case the step here is also zero. */
4249 if (*inv_p)
4250 iv_step = size_zero_node;
4251 else if (tree_int_cst_sgn (step) == -1)
4252 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4253
4254 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4255
4256 create_iv (aggr_ptr_init,
4257 fold_convert (aggr_ptr_type, iv_step),
4258 aggr_ptr, loop, &incr_gsi, insert_after,
4259 &indx_before_incr, &indx_after_incr);
4260 incr = gsi_stmt (incr_gsi);
4261 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4262
4263 /* Copy the points-to information if it exists. */
4264 if (DR_PTR_INFO (dr))
4265 {
4266 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4267 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4268 }
4269 if (ptr_incr)
4270 *ptr_incr = incr;
4271
4272 aptr = indx_before_incr;
4273 }
4274
4275 if (!nested_in_vect_loop || only_init)
4276 return aptr;
4277
4278
4279 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4280 nested in LOOP, if it exists. */
4281
4282 gcc_assert (nested_in_vect_loop);
4283 if (!only_init)
4284 {
4285 standard_iv_increment_position (containing_loop, &incr_gsi,
4286 &insert_after);
4287 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4288 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4289 &indx_after_incr);
4290 incr = gsi_stmt (incr_gsi);
4291 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4292
4293 /* Copy the points-to information if it exists. */
4294 if (DR_PTR_INFO (dr))
4295 {
4296 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4297 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4298 }
4299 if (ptr_incr)
4300 *ptr_incr = incr;
4301
4302 return indx_before_incr;
4303 }
4304 else
4305 gcc_unreachable ();
4306 }
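 /* Typical use (a sketch; the actual callers are in tree-vect-stmts.c):

      dataref_ptr = vect_create_data_ref_ptr (stmt, vectype, loop, NULL_TREE,
					      &dummy, gsi, &ptr_incr,
					      false, &inv_p, NULL_TREE);
      ... build a MEM_REF through dataref_ptr for each vector copy ...
      dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, gsi, stmt,
				     NULL_TREE);
 */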
4307
4308
4309 /* Function bump_vector_ptr
4310
4311 Increment a pointer (to a vector type) by vector-size. If requested,
4312 i.e. if PTR_INCR is given, then also connect the new increment stmt
4313 to the existing def-use update-chain of the pointer, by modifying
4314 the PTR_INCR as illustrated below:
4315
4316 The pointer def-use update-chain before this function:
4317 DATAREF_PTR = phi (p_0, p_2)
4318 ....
4319 PTR_INCR: p_2 = DATAREF_PTR + step
4320
4321 The pointer def-use update-chain after this function:
4322 DATAREF_PTR = phi (p_0, p_2)
4323 ....
4324 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4325 ....
4326 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4327
4328 Input:
4329 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4330 in the loop.
4331 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4332 the loop. The increment amount across iterations is expected
4333 to be vector_size.
4334 BSI - location where the new update stmt is to be placed.
4335 STMT - the original scalar memory-access stmt that is being vectorized.
4336 BUMP - optional. The offset by which to bump the pointer. If not given,
4337 the offset is assumed to be vector_size.
4338
4339 Output: Return NEW_DATAREF_PTR as illustrated above.
4340
4341 */
4342
4343 tree
4344 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4345 gimple stmt, tree bump)
4346 {
4347 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4348 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4349 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4350 tree update = TYPE_SIZE_UNIT (vectype);
4351 gassign *incr_stmt;
4352 ssa_op_iter iter;
4353 use_operand_p use_p;
4354 tree new_dataref_ptr;
4355
4356 if (bump)
4357 update = bump;
4358
4359 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4360 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4361 dataref_ptr, update);
4362 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4363
4364 /* Copy the points-to information if it exists. */
4365 if (DR_PTR_INFO (dr))
4366 {
4367 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4368 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4369 }
4370
4371 if (!ptr_incr)
4372 return new_dataref_ptr;
4373
4374 /* Update the vector-pointer's cross-iteration increment. */
4375 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4376 {
4377 tree use = USE_FROM_PTR (use_p);
4378
4379 if (use == dataref_ptr)
4380 SET_USE (use_p, new_dataref_ptr);
4381 else
4382 gcc_assert (tree_int_cst_compare (use, update) == 0);
4383 }
4384
4385 return new_dataref_ptr;
4386 }
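 /* E.g. for a V8HI data reference and a NULL BUMP the pointer above is
 advanced by TYPE_SIZE_UNIT (V8HI), i.e. 16 bytes, on each call.  */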
4387
4388
4389 /* Function vect_create_destination_var.
4390
4391 Create a new temporary of type VECTYPE. */
4392
4393 tree
4394 vect_create_destination_var (tree scalar_dest, tree vectype)
4395 {
4396 tree vec_dest;
4397 const char *name;
4398 char *new_name;
4399 tree type;
4400 enum vect_var_kind kind;
4401
4402 kind = vectype ? vect_simple_var : vect_scalar_var;
4403 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4404
4405 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4406
4407 name = get_name (scalar_dest);
4408 if (name)
4409 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4410 else
4411 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4412 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4413 free (new_name);
4414
4415 return vec_dest;
4416 }
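 /* E.g. for a scalar destination x_7 and a V4SI VECTYPE this creates a
 V4SI temporary named after "vect_x_7"; with a NULL VECTYPE a scalar
 temporary named after "stmp_x_7" is created instead.  */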
4417
4418 /* Function vect_grouped_store_supported.
4419
4420 Returns TRUE if interleave high and interleave low permutations
4421 are supported, and FALSE otherwise. */
4422
4423 bool
4424 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4425 {
4426 machine_mode mode = TYPE_MODE (vectype);
4427
4428 /* vect_permute_store_chain requires the group size to be equal to 3 or
4429 be a power of two. */
4430 if (count != 3 && exact_log2 (count) == -1)
4431 {
4432 if (dump_enabled_p ())
4433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4434 "the size of the group of accesses"
4435 " is not a power of 2 or not eqaul to 3\n");
4436 return false;
4437 }
4438
4439 /* Check that the permutation is supported. */
4440 if (VECTOR_MODE_P (mode))
4441 {
4442 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4443 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4444
4445 if (count == 3)
4446 {
4447 unsigned int j0 = 0, j1 = 0, j2 = 0;
4448 unsigned int i, j;
4449
4450 for (j = 0; j < 3; j++)
4451 {
4452 int nelt0 = ((3 - j) * nelt) % 3;
4453 int nelt1 = ((3 - j) * nelt + 1) % 3;
4454 int nelt2 = ((3 - j) * nelt + 2) % 3;
4455 for (i = 0; i < nelt; i++)
4456 {
4457 if (3 * i + nelt0 < nelt)
4458 sel[3 * i + nelt0] = j0++;
4459 if (3 * i + nelt1 < nelt)
4460 sel[3 * i + nelt1] = nelt + j1++;
4461 if (3 * i + nelt2 < nelt)
4462 sel[3 * i + nelt2] = 0;
4463 }
4464 if (!can_vec_perm_p (mode, false, sel))
4465 {
4466 if (dump_enabled_p ())
4467 dump_printf (MSG_MISSED_OPTIMIZATION,
4468 "permutaion op not supported by target.\n");
4469 return false;
4470 }
4471
4472 for (i = 0; i < nelt; i++)
4473 {
4474 if (3 * i + nelt0 < nelt)
4475 sel[3 * i + nelt0] = 3 * i + nelt0;
4476 if (3 * i + nelt1 < nelt)
4477 sel[3 * i + nelt1] = 3 * i + nelt1;
4478 if (3 * i + nelt2 < nelt)
4479 sel[3 * i + nelt2] = nelt + j2++;
4480 }
4481 if (!can_vec_perm_p (mode, false, sel))
4482 {
4483 if (dump_enabled_p ())
4484 dump_printf (MSG_MISSED_OPTIMIZATION,
4485 "permutaion op not supported by target.\n");
4486 return false;
4487 }
4488 }
4489 return true;
4490 }
4491 else
4492 {
4493 /* If length is not equal to 3 then only power of 2 is supported. */
4494 gcc_assert (exact_log2 (count) != -1);
4495
4496 for (i = 0; i < nelt / 2; i++)
4497 {
4498 sel[i * 2] = i;
4499 sel[i * 2 + 1] = i + nelt;
4500 }
4501 if (can_vec_perm_p (mode, false, sel))
4502 {
4503 for (i = 0; i < nelt; i++)
4504 sel[i] += nelt / 2;
4505 if (can_vec_perm_p (mode, false, sel))
4506 return true;
4507 }
4508 }
4509 }
4510
4511 if (dump_enabled_p ())
4512 dump_printf (MSG_MISSED_OPTIMIZATION,
4513 "permutaion op not supported by target.\n");
4514 return false;
4515 }
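 /* E.g. for V4SI (nelt == 4) the two power-of-two masks checked above are
 {0, 4, 1, 5} (interleave high) and {2, 6, 3, 7} (interleave low).  */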
4516
4517
4518 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4519 type VECTYPE. */
4520
4521 bool
4522 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4523 {
4524 return vect_lanes_optab_supported_p ("vec_store_lanes",
4525 vec_store_lanes_optab,
4526 vectype, count);
4527 }
4528
4529
4530 /* Function vect_permute_store_chain.
4531
4532 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4533 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4534 the data correctly for the stores. Return the final references for stores
4535 in RESULT_CHAIN.
4536
4537 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4538 The input is 4 vectors each containing 8 elements. We assign a number to
4539 each element, the input sequence is:
4540
4541 1st vec: 0 1 2 3 4 5 6 7
4542 2nd vec: 8 9 10 11 12 13 14 15
4543 3rd vec: 16 17 18 19 20 21 22 23
4544 4th vec: 24 25 26 27 28 29 30 31
4545
4546 The output sequence should be:
4547
4548 1st vec: 0 8 16 24 1 9 17 25
4549 2nd vec: 2 10 18 26 3 11 19 27
4550 3rd vec: 4 12 20 28 5 13 21 29
4551 4th vec: 6 14 22 30 7 15 23 31
4552
4553 i.e., we interleave the contents of the four vectors in their order.
4554
4555 We use interleave_high/low instructions to create such output. The input of
4556 each interleave_high/low operation is two vectors:
4557 1st vec 2nd vec
4558 0 1 2 3 4 5 6 7
4559 the even elements of the result vector are obtained left-to-right from the
4560 high/low elements of the first vector. The odd elements of the result are
4561 obtained left-to-right from the high/low elements of the second vector.
4562 The output of interleave_high will be: 0 4 1 5
4563 and of interleave_low: 2 6 3 7
4564
4565
4566 The permutation is done in log LENGTH stages. In each stage interleave_high
4567 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4568 where the first argument is taken from the first half of DR_CHAIN and the
4569 second argument from its second half.
4570 In our example,
4571
4572 I1: interleave_high (1st vec, 3rd vec)
4573 I2: interleave_low (1st vec, 3rd vec)
4574 I3: interleave_high (2nd vec, 4th vec)
4575 I4: interleave_low (2nd vec, 4th vec)
4576
4577 The output for the first stage is:
4578
4579 I1: 0 16 1 17 2 18 3 19
4580 I2: 4 20 5 21 6 22 7 23
4581 I3: 8 24 9 25 10 26 11 27
4582 I4: 12 28 13 29 14 30 15 31
4583
4584 The output of the second stage, i.e. the final result is:
4585
4586 I1: 0 8 16 24 1 9 17 25
4587 I2: 2 10 18 26 3 11 19 27
4588 I3: 4 12 20 28 5 13 21 29
4589 I4: 6 14 22 30 7 15 23 31. */
4590
4591 void
4592 vect_permute_store_chain (vec<tree> dr_chain,
4593 unsigned int length,
4594 gimple stmt,
4595 gimple_stmt_iterator *gsi,
4596 vec<tree> *result_chain)
4597 {
4598 tree vect1, vect2, high, low;
4599 gimple perm_stmt;
4600 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4601 tree perm_mask_low, perm_mask_high;
4602 tree data_ref;
4603 tree perm3_mask_low, perm3_mask_high;
4604 unsigned int i, n, log_length = exact_log2 (length);
4605 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4606 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4607
4608 result_chain->quick_grow (length);
4609 memcpy (result_chain->address (), dr_chain.address (),
4610 length * sizeof (tree));
4611
4612 if (length == 3)
4613 {
4614 unsigned int j0 = 0, j1 = 0, j2 = 0;
4615
4616 for (j = 0; j < 3; j++)
4617 {
4618 int nelt0 = ((3 - j) * nelt) % 3;
4619 int nelt1 = ((3 - j) * nelt + 1) % 3;
4620 int nelt2 = ((3 - j) * nelt + 2) % 3;
4621
4622 for (i = 0; i < nelt; i++)
4623 {
4624 if (3 * i + nelt0 < nelt)
4625 sel[3 * i + nelt0] = j0++;
4626 if (3 * i + nelt1 < nelt)
4627 sel[3 * i + nelt1] = nelt + j1++;
4628 if (3 * i + nelt2 < nelt)
4629 sel[3 * i + nelt2] = 0;
4630 }
4631 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4632
4633 for (i = 0; i < nelt; i++)
4634 {
4635 if (3 * i + nelt0 < nelt)
4636 sel[3 * i + nelt0] = 3 * i + nelt0;
4637 if (3 * i + nelt1 < nelt)
4638 sel[3 * i + nelt1] = 3 * i + nelt1;
4639 if (3 * i + nelt2 < nelt)
4640 sel[3 * i + nelt2] = nelt + j2++;
4641 }
4642 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4643
4644 vect1 = dr_chain[0];
4645 vect2 = dr_chain[1];
4646
4647 /* Create interleaving stmt:
4648 low = VEC_PERM_EXPR <vect1, vect2,
4649 {j, nelt, *, j + 1, nelt + j + 1, *,
4650 j + 2, nelt + j + 2, *, ...}> */
4651 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4652 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4653 vect2, perm3_mask_low);
4654 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4655
4656 vect1 = data_ref;
4657 vect2 = dr_chain[2];
4658 /* Create interleaving stmt:
4659 high = VEC_PERM_EXPR <vect1, vect2,
4660 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4661 6, 7, nelt + j + 2, ...}> */
4662 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4663 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4664 vect2, perm3_mask_high);
4665 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4666 (*result_chain)[j] = data_ref;
4667 }
4668 }
4669 else
4670 {
4671 /* If length is not equal to 3 then only power of 2 is supported. */
4672 gcc_assert (exact_log2 (length) != -1);
4673
4674 for (i = 0, n = nelt / 2; i < n; i++)
4675 {
4676 sel[i * 2] = i;
4677 sel[i * 2 + 1] = i + nelt;
4678 }
4679 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4680
4681 for (i = 0; i < nelt; i++)
4682 sel[i] += nelt / 2;
4683 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4684
4685 for (i = 0, n = log_length; i < n; i++)
4686 {
4687 for (j = 0; j < length/2; j++)
4688 {
4689 vect1 = dr_chain[j];
4690 vect2 = dr_chain[j+length/2];
4691
4692 /* Create interleaving stmt:
4693 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4694 ...}> */
4695 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4696 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4697 vect2, perm_mask_high);
4698 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4699 (*result_chain)[2*j] = high;
4700
4701 /* Create interleaving stmt:
4702 low = VEC_PERM_EXPR <vect1, vect2,
4703 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4704 ...}> */
4705 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4706 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4707 vect2, perm_mask_low);
4708 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4709 (*result_chain)[2*j+1] = low;
4710 }
4711 memcpy (dr_chain.address (), result_chain->address (),
4712 length * sizeof (tree));
4713 }
4714 }
4715 }
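 /* E.g. for V8HI (nelt == 8) the power-of-two path above uses
 perm_mask_high = {0, 8, 1, 9, 2, 10, 3, 11} and
 perm_mask_low = {4, 12, 5, 13, 6, 14, 7, 15}, which produce the I1/I2
 vectors shown in the function comment.  */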
4716
4717 /* Function vect_setup_realignment
4718
4719 This function is called when vectorizing an unaligned load using
4720 the dr_explicit_realign[_optimized] scheme.
4721 This function generates the following code at the loop prolog:
4722
4723 p = initial_addr;
4724 x msq_init = *(floor(p)); # prolog load
4725 realignment_token = call target_builtin;
4726 loop:
4727 x msq = phi (msq_init, ---)
4728
4729 The stmts marked with x are generated only for the case of
4730 dr_explicit_realign_optimized.
4731
4732 The code above sets up a new (vector) pointer, pointing to the first
4733 location accessed by STMT, and a "floor-aligned" load using that pointer.
4734 It also generates code to compute the "realignment-token" (if the relevant
4735 target hook was defined), and creates a phi-node at the loop-header bb
4736 whose arguments are the result of the prolog-load (created by this
4737 function) and the result of a load that takes place in the loop (to be
4738 created by the caller to this function).
4739
4740 For the case of dr_explicit_realign_optimized:
4741 The caller to this function uses the phi-result (msq) to create the
4742 realignment code inside the loop, and sets up the missing phi argument,
4743 as follows:
4744 loop:
4745 msq = phi (msq_init, lsq)
4746 lsq = *(floor(p')); # load in loop
4747 result = realign_load (msq, lsq, realignment_token);
4748
4749 For the case of dr_explicit_realign:
4750 loop:
4751 msq = *(floor(p)); # load in loop
4752 p' = p + (VS-1);
4753 lsq = *(floor(p')); # load in loop
4754 result = realign_load (msq, lsq, realignment_token);
4755
4756 Input:
4757 STMT - (scalar) load stmt to be vectorized. This load accesses
4758 a memory location that may be unaligned.
4759 BSI - place where new code is to be inserted.
4760 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4761 is used.
4762
4763 Output:
4764 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4765 target hook, if defined.
4766 Return value - the result of the loop-header phi node. */
4767
4768 tree
4769 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4770 tree *realignment_token,
4771 enum dr_alignment_support alignment_support_scheme,
4772 tree init_addr,
4773 struct loop **at_loop)
4774 {
4775 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4776 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4777 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4778 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4779 struct loop *loop = NULL;
4780 edge pe = NULL;
4781 tree scalar_dest = gimple_assign_lhs (stmt);
4782 tree vec_dest;
4783 gimple inc;
4784 tree ptr;
4785 tree data_ref;
4786 basic_block new_bb;
4787 tree msq_init = NULL_TREE;
4788 tree new_temp;
4789 gphi *phi_stmt;
4790 tree msq = NULL_TREE;
4791 gimple_seq stmts = NULL;
4792 bool inv_p;
4793 bool compute_in_loop = false;
4794 bool nested_in_vect_loop = false;
4795 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4796 struct loop *loop_for_initial_load = NULL;
4797
4798 if (loop_vinfo)
4799 {
4800 loop = LOOP_VINFO_LOOP (loop_vinfo);
4801 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4802 }
4803
4804 gcc_assert (alignment_support_scheme == dr_explicit_realign
4805 || alignment_support_scheme == dr_explicit_realign_optimized);
4806
4807 /* We need to generate three things:
4808 1. the misalignment computation
4809 2. the extra vector load (for the optimized realignment scheme).
4810 3. the phi node for the two vectors from which the realignment is
4811 done (for the optimized realignment scheme). */
4812
4813 /* 1. Determine where to generate the misalignment computation.
4814
4815 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4816 calculation will be generated by this function, outside the loop (in the
4817 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4818 caller, inside the loop.
4819
4820 Background: If the misalignment remains fixed throughout the iterations of
4821 the loop, then both realignment schemes are applicable, and also the
4822 misalignment computation can be done outside LOOP. This is because we are
4823 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4824 are a multiple of VS (the Vector Size), and therefore the misalignment in
4825 different vectorized LOOP iterations is always the same.
4826 The problem arises only if the memory access is in an inner-loop nested
4827 inside LOOP, which is now being vectorized using outer-loop vectorization.
4828 This is the only case when the misalignment of the memory access may not
4829 remain fixed throughout the iterations of the inner-loop (as explained in
4830 detail in vect_supportable_dr_alignment). In this case, not only is the
4831 optimized realignment scheme not applicable, but also the misalignment
4832 computation (and generation of the realignment token that is passed to
4833 REALIGN_LOAD) have to be done inside the loop.
4834
4835 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4836 or not, which in turn determines if the misalignment is computed inside
4837 the inner-loop, or outside LOOP. */
4838
4839 if (init_addr != NULL_TREE || !loop_vinfo)
4840 {
4841 compute_in_loop = true;
4842 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4843 }
4844
4845
4846 /* 2. Determine where to generate the extra vector load.
4847
4848 For the optimized realignment scheme, instead of generating two vector
4849 loads in each iteration, we generate a single extra vector load in the
4850 preheader of the loop, and in each iteration reuse the result of the
4851 vector load from the previous iteration. In case the memory access is in
4852 an inner-loop nested inside LOOP, which is now being vectorized using
4853 outer-loop vectorization, we need to determine whether this initial vector
4854 load should be generated at the preheader of the inner-loop, or can be
4855 generated at the preheader of LOOP. If the memory access has no evolution
4856 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4857 to be generated inside LOOP (in the preheader of the inner-loop). */
4858
4859 if (nested_in_vect_loop)
4860 {
4861 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4862 bool invariant_in_outerloop =
4863 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4864 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4865 }
4866 else
4867 loop_for_initial_load = loop;
4868 if (at_loop)
4869 *at_loop = loop_for_initial_load;
4870
4871 if (loop_for_initial_load)
4872 pe = loop_preheader_edge (loop_for_initial_load);
4873
4874 /* 3. For the case of the optimized realignment, create the first vector
4875 load at the loop preheader. */
4876
4877 if (alignment_support_scheme == dr_explicit_realign_optimized)
4878 {
4879 /* Create msq_init = *(floor(p1)) in the loop preheader */
4880 gassign *new_stmt;
4881
4882 gcc_assert (!compute_in_loop);
4883 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4884 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4885 NULL_TREE, &init_addr, NULL, &inc,
4886 true, &inv_p);
4887 new_temp = copy_ssa_name (ptr);
4888 new_stmt = gimple_build_assign
4889 (new_temp, BIT_AND_EXPR, ptr,
4890 build_int_cst (TREE_TYPE (ptr),
4891 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
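 /* E.g. for a vector type with 16-byte alignment the constant built
 above is -16, so the BIT_AND_EXPR clears the low four address bits,
 giving the floor (p1) of the function comment.  */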
4892 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4893 gcc_assert (!new_bb);
4894 data_ref
4895 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4896 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4897 new_stmt = gimple_build_assign (vec_dest, data_ref);
4898 new_temp = make_ssa_name (vec_dest, new_stmt);
4899 gimple_assign_set_lhs (new_stmt, new_temp);
4900 if (pe)
4901 {
4902 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4903 gcc_assert (!new_bb);
4904 }
4905 else
4906 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4907
4908 msq_init = gimple_assign_lhs (new_stmt);
4909 }
4910
4911 /* 4. Create realignment token using a target builtin, if available.
4912 It is done either inside the containing loop, or before LOOP (as
4913 determined above). */
4914
4915 if (targetm.vectorize.builtin_mask_for_load)
4916 {
4917 gcall *new_stmt;
4918 tree builtin_decl;
4919
4920 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4921 if (!init_addr)
4922 {
4923 /* Generate the INIT_ADDR computation outside LOOP. */
4924 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4925 NULL_TREE, loop);
4926 if (loop)
4927 {
4928 pe = loop_preheader_edge (loop);
4929 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4930 gcc_assert (!new_bb);
4931 }
4932 else
4933 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4934 }
4935
4936 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4937 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4938 vec_dest =
4939 vect_create_destination_var (scalar_dest,
4940 gimple_call_return_type (new_stmt));
4941 new_temp = make_ssa_name (vec_dest, new_stmt);
4942 gimple_call_set_lhs (new_stmt, new_temp);
4943
4944 if (compute_in_loop)
4945 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4946 else
4947 {
4948 /* Generate the misalignment computation outside LOOP. */
4949 pe = loop_preheader_edge (loop);
4950 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4951 gcc_assert (!new_bb);
4952 }
4953
4954 *realignment_token = gimple_call_lhs (new_stmt);
4955
4956 /* The result of the CALL_EXPR to this builtin is determined from
4957 the value of the parameter and no global variables are touched
4958 which makes the builtin a "const" function. Requiring the
4959 builtin to have the "const" attribute makes it unnecessary
4960 to call mark_call_clobbered. */
4961 gcc_assert (TREE_READONLY (builtin_decl));
4962 }
4963
4964 if (alignment_support_scheme == dr_explicit_realign)
4965 return msq;
4966
4967 gcc_assert (!compute_in_loop);
4968 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4969
4970
4971 /* 5. Create msq = phi <msq_init, lsq> in loop */
4972
4973 pe = loop_preheader_edge (containing_loop);
4974 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4975 msq = make_ssa_name (vec_dest);
4976 phi_stmt = create_phi_node (msq, containing_loop->header);
4977 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4978
4979 return msq;
4980 }
4981
4982
4983 /* Function vect_grouped_load_supported.
4984
4985 Returns TRUE if even and odd permutations are supported,
4986 and FALSE otherwise. */
4987
4988 bool
4989 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4990 {
4991 machine_mode mode = TYPE_MODE (vectype);
4992
4993 /* vect_permute_load_chain requires the group size to be equal to 3 or
4994 be a power of two. */
4995 if (count != 3 && exact_log2 (count) == -1)
4996 {
4997 if (dump_enabled_p ())
4998 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4999 "the size of the group of accesses"
5000 " is not a power of 2 or not equal to 3\n");
5001 return false;
5002 }
5003
5004 /* Check that the permutation is supported. */
5005 if (VECTOR_MODE_P (mode))
5006 {
5007 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5008 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5009
5010 if (count == 3)
5011 {
5012 unsigned int k;
5013 for (k = 0; k < 3; k++)
5014 {
5015 for (i = 0; i < nelt; i++)
5016 if (3 * i + k < 2 * nelt)
5017 sel[i] = 3 * i + k;
5018 else
5019 sel[i] = 0;
5020 if (!can_vec_perm_p (mode, false, sel))
5021 {
5022 if (dump_enabled_p ())
5023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5024 "shuffle of 3 loads is not supported by"
5025 " target\n");
5026 return false;
5027 }
5028 for (i = 0, j = 0; i < nelt; i++)
5029 if (3 * i + k < 2 * nelt)
5030 sel[i] = i;
5031 else
5032 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5033 if (!can_vec_perm_p (mode, false, sel))
5034 {
5035 if (dump_enabled_p ())
5036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5037 "shuffle of 3 loads is not supported by"
5038 " target\n");
5039 return false;
5040 }
5041 }
5042 return true;
5043 }
5044 else
5045 {
5046 /* If length is not equal to 3 then only power of 2 is supported. */
5047 gcc_assert (exact_log2 (count) != -1);
5048 for (i = 0; i < nelt; i++)
5049 sel[i] = i * 2;
5050 if (can_vec_perm_p (mode, false, sel))
5051 {
5052 for (i = 0; i < nelt; i++)
5053 sel[i] = i * 2 + 1;
5054 if (can_vec_perm_p (mode, false, sel))
5055 return true;
5056 }
5057 }
5058 }
5059
5060 if (dump_enabled_p ())
5061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5062 "extract even/odd not supported by target\n");
5063 return false;
5064 }
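 /* E.g. for V8HI (nelt == 8) and k == 0 the two masks checked above are
 {0, 3, 6, 9, 12, 15, 0, 0} and {0, 1, 2, 3, 4, 5, 10, 13}; applied in
 sequence by vect_permute_load_chain they extract every third element
 starting at element 0.  */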
5065
5066 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5067 type VECTYPE. */
5068
5069 bool
5070 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5071 {
5072 return vect_lanes_optab_supported_p ("vec_load_lanes",
5073 vec_load_lanes_optab,
5074 vectype, count);
5075 }
5076
5077 /* Function vect_permute_load_chain.
5078
5079 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5080 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5081 the input data correctly. Return the final references for loads in
5082 RESULT_CHAIN.
5083
5084 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5085 The input is 4 vectors each containing 8 elements. We assign a number to each
5086 element, the input sequence is:
5087
5088 1st vec: 0 1 2 3 4 5 6 7
5089 2nd vec: 8 9 10 11 12 13 14 15
5090 3rd vec: 16 17 18 19 20 21 22 23
5091 4th vec: 24 25 26 27 28 29 30 31
5092
5093 The output sequence should be:
5094
5095 1st vec: 0 4 8 12 16 20 24 28
5096 2nd vec: 1 5 9 13 17 21 25 29
5097 3rd vec: 2 6 10 14 18 22 26 30
5098 4th vec: 3 7 11 15 19 23 27 31
5099
5100 i.e., the first output vector should contain the first elements of each
5101 interleaving group, etc.
5102
5103 We use extract_even/odd instructions to create such output. The input of
5104 each extract_even/odd operation is two vectors
5105 1st vec 2nd vec
5106 0 1 2 3 4 5 6 7
5107
5108 and the output is the vector of extracted even/odd elements. The output of
5109 extract_even will be: 0 2 4 6
5110 and of extract_odd: 1 3 5 7
5111
5112
5113 The permutation is done in log LENGTH stages. In each stage extract_even
5114 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5115 their order. In our example,
5116
5117 E1: extract_even (1st vec, 2nd vec)
5118 E2: extract_odd (1st vec, 2nd vec)
5119 E3: extract_even (3rd vec, 4th vec)
5120 E4: extract_odd (3rd vec, 4th vec)
5121
5122 The output for the first stage will be:
5123
5124 E1: 0 2 4 6 8 10 12 14
5125 E2: 1 3 5 7 9 11 13 15
5126 E3: 16 18 20 22 24 26 28 30
5127 E4: 17 19 21 23 25 27 29 31
5128
5129 In order to proceed and create the correct sequence for the next stage (or
5130 for the correct output, if the second stage is the last one, as in our
5131 example), we first put the output of extract_even operation and then the
5132 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5133 The input for the second stage is:
5134
5135 1st vec (E1): 0 2 4 6 8 10 12 14
5136 2nd vec (E3): 16 18 20 22 24 26 28 30
5137 3rd vec (E2): 1 3 5 7 9 11 13 15
5138 4th vec (E4): 17 19 21 23 25 27 29 31
5139
5140 The output of the second stage:
5141
5142 E1: 0 4 8 12 16 20 24 28
5143 E2: 2 6 10 14 18 22 26 30
5144 E3: 1 5 9 13 17 21 25 29
5145 E4: 3 7 11 15 19 23 27 31
5146
5147 And RESULT_CHAIN after reordering:
5148
5149 1st vec (E1): 0 4 8 12 16 20 24 28
5150 2nd vec (E3): 1 5 9 13 17 21 25 29
5151 3rd vec (E2): 2 6 10 14 18 22 26 30
5152 4th vec (E4): 3 7 11 15 19 23 27 31. */
5153
5154 static void
5155 vect_permute_load_chain (vec<tree> dr_chain,
5156 unsigned int length,
5157 gimple stmt,
5158 gimple_stmt_iterator *gsi,
5159 vec<tree> *result_chain)
5160 {
5161 tree data_ref, first_vect, second_vect;
5162 tree perm_mask_even, perm_mask_odd;
5163 tree perm3_mask_low, perm3_mask_high;
5164 gimple perm_stmt;
5165 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5166 unsigned int i, j, log_length = exact_log2 (length);
5167 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5168 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5169
5170 result_chain->quick_grow (length);
5171 memcpy (result_chain->address (), dr_chain.address (),
5172 length * sizeof (tree));
5173
5174 if (length == 3)
5175 {
5176 unsigned int k;
5177
5178 for (k = 0; k < 3; k++)
5179 {
5180 for (i = 0; i < nelt; i++)
5181 if (3 * i + k < 2 * nelt)
5182 sel[i] = 3 * i + k;
5183 else
5184 sel[i] = 0;
5185 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5186
5187 for (i = 0, j = 0; i < nelt; i++)
5188 if (3 * i + k < 2 * nelt)
5189 sel[i] = i;
5190 else
5191 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5192
5193 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5194
5195 first_vect = dr_chain[0];
5196 second_vect = dr_chain[1];
5197
5198 /* Create interleaving stmt (low part of):
5199 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5200 ...}> */
5201 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5202 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5203 second_vect, perm3_mask_low);
5204 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5205
5206 /* Create interleaving stmt (high part of):
5207 high = VEC_PERM_EXPR <low, 3rd vec, perm3_mask_high>, i.e. keep the
5208 elements gathered so far and fill the tail from the third vector.  */
5209 first_vect = data_ref;
5210 second_vect = dr_chain[2];
5211 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5212 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5213 second_vect, perm3_mask_high);
5214 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5215 (*result_chain)[k] = data_ref;
5216 }
5217 }
5218 else
5219 {
5220 /* If length is not equal to 3 then only a power of 2 is supported.  */
5221 gcc_assert (exact_log2 (length) != -1);
5222
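 /* The two selectors below implement extract_even and extract_odd: for
    nelt == 8 they are {0, 2, 4, 6, 8, 10, 12, 14} and
    {1, 3, 5, 7, 9, 11, 13, 15}, picking the even/odd elements of the
    concatenation of two input vectors.  */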
5223 for (i = 0; i < nelt; ++i)
5224 sel[i] = i * 2;
5225 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5226
5227 for (i = 0; i < nelt; ++i)
5228 sel[i] = i * 2 + 1;
5229 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5230
5231 for (i = 0; i < log_length; i++)
5232 {
5233 for (j = 0; j < length; j += 2)
5234 {
5235 first_vect = dr_chain[j];
5236 second_vect = dr_chain[j+1];
5237
5238 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5239 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5240 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5241 first_vect, second_vect,
5242 perm_mask_even);
5243 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5244 (*result_chain)[j/2] = data_ref;
5245
5246 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5247 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5248 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5249 first_vect, second_vect,
5250 perm_mask_odd);
5251 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5252 (*result_chain)[j/2+length/2] = data_ref;
5253 }
5254 memcpy (dr_chain.address (), result_chain->address (),
5255 length * sizeof (tree));
5256 }
5257 }
5258 }
5259
5260 /* Function vect_shift_permute_load_chain.
5261
5262 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5263 a sequence of stmts to reorder the input data accordingly.
5264 Return the final references for loads in RESULT_CHAIN.
5265 Return true if successful, false otherwise.
5266
5267 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5268 The input is 3 vectors each containing 8 elements. We assign a
5269 number to each element, the input sequence is:
5270
5271 1st vec: 0 1 2 3 4 5 6 7
5272 2nd vec: 8 9 10 11 12 13 14 15
5273 3rd vec: 16 17 18 19 20 21 22 23
5274
5275 The output sequence should be:
5276
5277 1st vec: 0 3 6 9 12 15 18 21
5278 2nd vec: 1 4 7 10 13 16 19 22
5279 3rd vec: 2 5 8 11 14 17 20 23
5280
5281 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5282
5283 First we shuffle all 3 vectors to get correct elements order:
5284
5285 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5286 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5287 3rd vec: (16 19 22) (17 20 23) (18 21)
5288
5289 Next we unite and shift the vectors 3 times:
5290
5291 1st step:
5292 shift right by 6 the concatenation of:
5293 "1st vec" and "2nd vec"
5294 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5295 "2nd vec" and "3rd vec"
5296 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5297 "3rd vec" and "1st vec"
5298 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5299 | New vectors |
5300
5301 So that now new vectors are:
5302
5303 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5304 2nd vec: (10 13) (16 19 22) (17 20 23)
5305 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5306
5307 2nd step:
5308 shift right by 5 the concatenation of:
5309 "1st vec" and "3rd vec"
5310 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5311 "2nd vec" and "1st vec"
5312 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5313 "3rd vec" and "2nd vec"
5314 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5315 | New vectors |
5316
5317 So that now new vectors are:
5318
5319 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5320 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5321 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5322
5323 3rd step:
5324 shift right by 5 the concatenation of:
5325 "1st vec" and "1st vec"
5326 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5327 shift right by 3 the concatenation of:
5328 "2nd vec" and "2nd vec"
5329 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5330 | New vectors |
5331
5332 So that now all vectors are READY:
5333 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5334 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5335 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5336
5337 This algorithm is faster than the one in vect_permute_load_chain if:
5338 1. "shift of a concatenation" is faster than a general permutation.
5339 This is usually the case.
5340 2. The TARGET machine can't execute vector instructions in parallel.
5341 This matters because each step of this algorithm depends on the previous
5342 one, whereas the algorithm in vect_permute_load_chain is much more parallel.
5343
5344 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5345 */
5346
5347 static bool
5348 vect_shift_permute_load_chain (vec<tree> dr_chain,
5349 unsigned int length,
5350 gimple stmt,
5351 gimple_stmt_iterator *gsi,
5352 vec<tree> *result_chain)
5353 {
5354 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5355 tree perm2_mask1, perm2_mask2, perm3_mask;
5356 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5357 gimple perm_stmt;
5358
5359 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5360 unsigned int i;
5361 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5362 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5363 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5364 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5365
5366 result_chain->quick_grow (length);
5367 memcpy (result_chain->address (), dr_chain.address (),
5368 length * sizeof (tree));
5369
5370 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5371 {
5372 unsigned int j, log_length = exact_log2 (length);
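 /* The two selectors generated below split a vector into its even and odd
    elements.  For vector length 8, perm2_mask1 is {0 2 4 6 1 3 5 7} (even
    elements in the low half, odd elements in the high half) and perm2_mask2
    is {1 3 5 7 0 2 4 6} (odd elements in the low half, even elements in
    the high half).  */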
5373 for (i = 0; i < nelt / 2; ++i)
5374 sel[i] = i * 2;
5375 for (i = 0; i < nelt / 2; ++i)
5376 sel[nelt / 2 + i] = i * 2 + 1;
5377 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5378 {
5379 if (dump_enabled_p ())
5380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5381 "shuffle of 2 fields structure is not \
5382 supported by target\n");
5383 return false;
5384 }
5385 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5386
5387 for (i = 0; i < nelt / 2; ++i)
5388 sel[i] = i * 2 + 1;
5389 for (i = 0; i < nelt / 2; ++i)
5390 sel[nelt / 2 + i] = i * 2;
5391 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5392 {
5393 if (dump_enabled_p ())
5394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5395 "shuffle of 2 fields structure is not \
5396 supported by target\n");
5397 return false;
5398 }
5399 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5400
5401 /* Generating permutation constant to shift all elements.
5402 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5403 for (i = 0; i < nelt; i++)
5404 sel[i] = nelt / 2 + i;
5405 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5406 {
5407 if (dump_enabled_p ())
5408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5409 "shift permutation is not supported by target\n");
5410 return false;
5411 }
5412 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5413
5414 /* Generating permutation constant that selects the low half of the first
5415 vector and the high half of the second.  For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5416 for (i = 0; i < nelt / 2; i++)
5417 sel[i] = i;
5418 for (i = nelt / 2; i < nelt; i++)
5419 sel[i] = nelt + i;
5420 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5421 {
5422 if (dump_enabled_p ())
5423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5424 "select is not supported by target\n");
5425 return false;
5426 }
5427 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5428
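 /* Each iteration of the inner loop below handles one pair of input
    vectors: the two in-vector shuffles split each input into its even and
    odd elements, the shift of their concatenation then collects all the
    odd elements, and the select collects all the even elements, matching
    the extract_even/extract_odd semantics.  */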
5429 for (i = 0; i < log_length; i++)
5430 {
5431 for (j = 0; j < length; j += 2)
5432 {
5433 first_vect = dr_chain[j];
5434 second_vect = dr_chain[j + 1];
5435
5436 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5437 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5438 first_vect, first_vect,
5439 perm2_mask1);
5440 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5441 vect[0] = data_ref;
5442
5443 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5444 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5445 second_vect, second_vect,
5446 perm2_mask2);
5447 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5448 vect[1] = data_ref;
5449
5450 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5451 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5452 vect[0], vect[1], shift1_mask);
5453 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5454 (*result_chain)[j/2 + length/2] = data_ref;
5455
5456 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5457 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5458 vect[0], vect[1], select_mask);
5459 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5460 (*result_chain)[j/2] = data_ref;
5461 }
5462 memcpy (dr_chain.address (), result_chain->address (),
5463 length * sizeof (tree));
5464 }
5465 return true;
5466 }
5467 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5468 {
5469 unsigned int k = 0, l = 0;
5470
5471 /* Generating permutation constant to get all elements in right order.
5472 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5473 for (i = 0; i < nelt; i++)
5474 {
5475 if (3 * k + (l % 3) >= nelt)
5476 {
5477 k = 0;
5478 l += (3 - (nelt % 3));
5479 }
5480 sel[i] = 3 * k + (l % 3);
5481 k++;
5482 }
5483 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5484 {
5485 if (dump_enabled_p ())
5486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5487 "shuffle of 3 fields structure is not \
5488 supported by target\n");
5489 return false;
5490 }
5491 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5492
5493 /* Generating permutation constant to shift all elements.
5494 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5495 for (i = 0; i < nelt; i++)
5496 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5497 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5498 {
5499 if (dump_enabled_p ())
5500 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5501 "shift permutation is not supported by target\n");
5502 return false;
5503 }
5504 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5505
5506 /* Generating permutation constant to shift all elements.
5507 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5508 for (i = 0; i < nelt; i++)
5509 sel[i] = 2 * (nelt / 3) + 1 + i;
5510 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5511 {
5512 if (dump_enabled_p ())
5513 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5514 "shift permutation is not supported by target\n");
5515 return false;
5516 }
5517 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5518
5519 /* Generating permutation constant to shift all elements.
5520 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5521 for (i = 0; i < nelt; i++)
5522 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5523 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5524 {
5525 if (dump_enabled_p ())
5526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5527 "shift permutation is not supported by target\n");
5528 return false;
5529 }
5530 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5531
5532 /* Generating permutation constant to shift all elements.
5533 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5534 for (i = 0; i < nelt; i++)
5535 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5536 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5537 {
5538 if (dump_enabled_p ())
5539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5540 "shift permutation is not supported by target\n");
5541 return false;
5542 }
5543 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5544
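 /* The statements below implement the three steps described above: first
    shuffle each of the three input vectors with perm3_mask, then perform
    two rounds of "shift of a concatenation" with shift1_mask and
    shift2_mask, and finally shift two of the intermediate vectors with
    shift3_mask and shift4_mask to obtain the remaining output vectors.  */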
5545 for (k = 0; k < 3; k++)
5546 {
5547 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5548 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5549 dr_chain[k], dr_chain[k],
5550 perm3_mask);
5551 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5552 vect[k] = data_ref;
5553 }
5554
5555 for (k = 0; k < 3; k++)
5556 {
5557 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5558 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5559 vect[k % 3], vect[(k + 1) % 3],
5560 shift1_mask);
5561 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5562 vect_shift[k] = data_ref;
5563 }
5564
5565 for (k = 0; k < 3; k++)
5566 {
5567 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5568 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5569 vect_shift[(4 - k) % 3],
5570 vect_shift[(3 - k) % 3],
5571 shift2_mask);
5572 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5573 vect[k] = data_ref;
5574 }
5575
5576 (*result_chain)[3 - (nelt % 3)] = vect[2];
5577
5578 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5579 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5580 vect[0], shift3_mask);
5581 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5582 (*result_chain)[nelt % 3] = data_ref;
5583
5584 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5585 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5586 vect[1], shift4_mask);
5587 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5588 (*result_chain)[0] = data_ref;
5589 return true;
5590 }
5591 return false;
5592 }
5593
5594 /* Function vect_transform_grouped_load.
5595
5596 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5597 to perform their permutation and assign the resulting vectorized statements to
5598 the scalar statements.
5599 */
5600
5601 void
5602 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5603 gimple_stmt_iterator *gsi)
5604 {
5605 machine_mode mode;
5606 vec<tree> result_chain = vNULL;
5607
5608 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5609 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the permuted
5610 vectors that are ready for vector computation.  */
5611 result_chain.create (size);
5612
5613 /* If the reassociation width for the vector type is 2 or greater, the target
5614 machine can execute 2 or more vector instructions in parallel.  Otherwise
5615 try to get the chain for the load group using vect_shift_permute_load_chain. */
5616 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5617 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5618 || exact_log2 (size) != -1
5619 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5620 gsi, &result_chain))
5621 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5622 vect_record_grouped_load_vectors (stmt, result_chain);
5623 result_chain.release ();
5624 }
5625
5626 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5627 generated as part of the vectorization of STMT. Assign the statement
5628 for each vector to the associated scalar statement. */
5629
5630 void
5631 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5632 {
5633 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5634 gimple next_stmt, new_stmt;
5635 unsigned int i, gap_count;
5636 tree tmp_data_ref;
5637
5638 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5639 Since we scan the chain starting from its first node, their order
5640 corresponds to the order of data-refs in RESULT_CHAIN.  */
5641 next_stmt = first_stmt;
5642 gap_count = 1;
5643 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5644 {
5645 if (!next_stmt)
5646 break;
5647
5648 /* Skip the gaps.  Loads created for the gaps will be removed later by the
5649 dead code elimination pass.  No need to check for the first stmt in
5650 the group, since it always exists.
5651 GROUP_GAP is the number of steps in elements from the previous
5652 access (if there is no gap GROUP_GAP is 1). We skip loads that
5653 correspond to the gaps. */
5654 if (next_stmt != first_stmt
5655 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5656 {
5657 gap_count++;
5658 continue;
5659 }
5660
5661 while (next_stmt)
5662 {
5663 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5664 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5665 copies, and we put the new vector statement in the first available
5666 RELATED_STMT. */
5667 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5668 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5669 else
5670 {
5671 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5672 {
5673 gimple prev_stmt =
5674 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5675 gimple rel_stmt =
5676 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5677 while (rel_stmt)
5678 {
5679 prev_stmt = rel_stmt;
5680 rel_stmt =
5681 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5682 }
5683
5684 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5685 new_stmt;
5686 }
5687 }
5688
5689 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5690 gap_count = 1;
5691 /* If NEXT_STMT accesses the same DR as the previous statement,
5692 put the same TMP_DATA_REF as its vectorized statement; otherwise
5693 get the next data-ref from RESULT_CHAIN. */
5694 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5695 break;
5696 }
5697 }
5698 }
5699
5700 /* Function vect_can_force_dr_alignment_p.
5701
5702 Returns whether the alignment of DECL can be forced to an
5703 ALIGNMENT-bit boundary.  */
5704
5705 bool
5706 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5707 {
5708 if (TREE_CODE (decl) != VAR_DECL)
5709 return false;
5710
5711 if (decl_in_symtab_p (decl)
5712 && !symtab_node::get (decl)->can_increase_alignment_p ())
5713 return false;
5714
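 /* Decls with static storage are limited by what the object file format
    supports (MAX_OFILE_ALIGNMENT); automatic variables are limited by the
    supported stack alignment (MAX_STACK_ALIGNMENT).  */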
5715 if (TREE_STATIC (decl))
5716 return (alignment <= MAX_OFILE_ALIGNMENT);
5717 else
5718 return (alignment <= MAX_STACK_ALIGNMENT);
5719 }
5720
5721
5722 /* Return whether the data reference DR is supported with respect to its
5723 alignment.
5724 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5725 if it is aligned, i.e., check if it is possible to vectorize it with a different
5726 alignment. */
5727
5728 enum dr_alignment_support
5729 vect_supportable_dr_alignment (struct data_reference *dr,
5730 bool check_aligned_accesses)
5731 {
5732 gimple stmt = DR_STMT (dr);
5733 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5734 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5735 machine_mode mode = TYPE_MODE (vectype);
5736 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5737 struct loop *vect_loop = NULL;
5738 bool nested_in_vect_loop = false;
5739
5740 if (aligned_access_p (dr) && !check_aligned_accesses)
5741 return dr_aligned;
5742
5743 /* For now assume all conditional loads/stores support unaligned
5744 access without any special code. */
5745 if (is_gimple_call (stmt)
5746 && gimple_call_internal_p (stmt)
5747 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5748 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5749 return dr_unaligned_supported;
5750
5751 if (loop_vinfo)
5752 {
5753 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5754 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5755 }
5756
5757 /* Possibly unaligned access. */
5758
5759 /* We can choose between using the implicit realignment scheme (generating
5760 a misaligned_move stmt) and the explicit realignment scheme (generating
5761 aligned loads with a REALIGN_LOAD). There are two variants to the
5762 explicit realignment scheme: optimized, and unoptimized.
5763 We can optimize the realignment only if the step between consecutive
5764 vector loads is equal to the vector size. Since the vector memory
5765 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5766 is guaranteed that the misalignment amount remains the same throughout the
5767 execution of the vectorized loop. Therefore, we can create the
5768 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5769 at the loop preheader.
5770
5771 However, in the case of outer-loop vectorization, when vectorizing a
5772 memory access in the inner-loop nested within the LOOP that is now being
5773 vectorized, while it is guaranteed that the misalignment of the
5774 vectorized memory access will remain the same in different outer-loop
5775 iterations, it is *not* guaranteed that it will remain the same throughout
5776 the execution of the inner-loop. This is because the inner-loop advances
5777 with the original scalar step (and not in steps of VS). If the inner-loop
5778 step happens to be a multiple of VS, then the misalignment remains fixed
5779 and we can use the optimized realignment scheme. For example:
5780
5781 for (i=0; i<N; i++)
5782 for (j=0; j<M; j++)
5783 s += a[i+j];
5784
5785 When vectorizing the i-loop in the above example, the step between
5786 consecutive vector loads is 1, and so the misalignment does not remain
5787 fixed across the execution of the inner-loop, and the realignment cannot
5788 be optimized (as illustrated in the following pseudo vectorized loop):
5789
5790 for (i=0; i<N; i+=4)
5791 for (j=0; j<M; j++){
5792 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5793 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5794 // (assuming that we start from an aligned address).
5795 }
5796
5797 We therefore have to use the unoptimized realignment scheme:
5798
5799 for (i=0; i<N; i+=4)
5800 for (j=k; j<M; j+=4)
5801 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5802 // that the misalignment of the initial address is
5803 // 0).
5804
5805 The loop can then be vectorized as follows:
5806
5807 for (k=0; k<4; k++){
5808 rt = get_realignment_token (&vp[k]);
5809 for (i=0; i<N; i+=4){
5810 v1 = vp[i+k];
5811 for (j=k; j<M; j+=4){
5812 v2 = vp[i+j+VS-1];
5813 va = REALIGN_LOAD <v1,v2,rt>;
5814 vs += va;
5815 v1 = v2;
5816 }
5817 }
5818 } */
5819
5820 if (DR_IS_READ (dr))
5821 {
5822 bool is_packed = false;
5823 tree type = (TREE_TYPE (DR_REF (dr)));
5824
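 /* Prefer the explicit realignment scheme when the target provides a
    vec_realign_load pattern (and, if it implements the mask-for-load hook,
    the hook returns a builtin).  Per the discussion above, the optimized
    variant is only usable when loop information is available and, for
    accesses nested in an inner loop, when the step equals the vector
    size.  */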
5825 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5826 && (!targetm.vectorize.builtin_mask_for_load
5827 || targetm.vectorize.builtin_mask_for_load ()))
5828 {
5829 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5830 if ((nested_in_vect_loop
5831 && (TREE_INT_CST_LOW (DR_STEP (dr))
5832 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5833 || !loop_vinfo)
5834 return dr_explicit_realign;
5835 else
5836 return dr_explicit_realign_optimized;
5837 }
5838 if (!known_alignment_for_access_p (dr))
5839 is_packed = not_size_aligned (DR_REF (dr));
5840
5841 if ((TYPE_USER_ALIGN (type) && !is_packed)
5842 || targetm.vectorize.
5843 support_vector_misalignment (mode, type,
5844 DR_MISALIGNMENT (dr), is_packed))
5845 /* Can't software pipeline the loads, but can at least do them. */
5846 return dr_unaligned_supported;
5847 }
5848 else
5849 {
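 /* A misaligned store: there is no explicit realignment scheme for stores,
    so we can only rely on the target's support for misaligned vector
    stores.  */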
5850 bool is_packed = false;
5851 tree type = (TREE_TYPE (DR_REF (dr)));
5852
5853 if (!known_alignment_for_access_p (dr))
5854 is_packed = not_size_aligned (DR_REF (dr));
5855
5856 if ((TYPE_USER_ALIGN (type) && !is_packed)
5857 || targetm.vectorize.
5858 support_vector_misalignment (mode, type,
5859 DR_MISALIGNMENT (dr), is_packed))
5860 return dr_unaligned_supported;
5861 }
5862
5863 /* Unsupported. */
5864 return dr_unaligned_unsupported;
5865 }