gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "alias.h"
28 #include "symtab.h"
29 #include "tree.h"
30 #include "fold-const.h"
31 #include "stor-layout.h"
32 #include "tm_p.h"
33 #include "target.h"
34 #include "predict.h"
35 #include "hard-reg-set.h"
36 #include "function.h"
37 #include "dominance.h"
38 #include "cfg.h"
39 #include "basic-block.h"
40 #include "gimple-pretty-print.h"
41 #include "tree-ssa-alias.h"
42 #include "internal-fn.h"
43 #include "tree-eh.h"
44 #include "gimple-expr.h"
45 #include "gimple.h"
46 #include "gimplify.h"
47 #include "gimple-iterator.h"
48 #include "gimplify-me.h"
49 #include "gimple-ssa.h"
50 #include "tree-phinodes.h"
51 #include "ssa-iterators.h"
52 #include "stringpool.h"
53 #include "tree-ssanames.h"
54 #include "tree-ssa-loop-ivopts.h"
55 #include "tree-ssa-loop-manip.h"
56 #include "tree-ssa-loop.h"
57 #include "cfgloop.h"
58 #include "tree-chrec.h"
59 #include "tree-scalar-evolution.h"
60 #include "tree-vectorizer.h"
61 #include "diagnostic-core.h"
62 #include "cgraph.h"
63 /* Need to include rtl.h, expr.h, etc. for optabs. */
64 #include "rtl.h"
65 #include "flags.h"
66 #include "insn-config.h"
67 #include "expmed.h"
68 #include "dojump.h"
69 #include "explow.h"
70 #include "calls.h"
71 #include "emit-rtl.h"
72 #include "varasm.h"
73 #include "stmt.h"
74 #include "expr.h"
75 #include "insn-codes.h"
76 #include "optabs.h"
77 #include "builtins.h"
78
79 /* Return true if load- or store-lanes optab OPTAB is implemented for
80 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
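/* For example, on a target with 128-bit V4SImode vectors, COUNT == 2
   asks for a 256-bit integer array mode; if no such mode exists, or the
   target provides no load/store-lanes pattern for it, the optab handler
   is CODE_FOR_nothing and false is returned.  */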
81
82 static bool
83 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
84 tree vectype, unsigned HOST_WIDE_INT count)
85 {
86 machine_mode mode, array_mode;
87 bool limit_p;
88
89 mode = TYPE_MODE (vectype);
90 limit_p = !targetm.array_mode_supported_p (mode, count);
91 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
92 MODE_INT, limit_p);
93
94 if (array_mode == BLKmode)
95 {
96 if (dump_enabled_p ())
97 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
98 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
99 GET_MODE_NAME (mode), count);
100 return false;
101 }
102
103 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
104 {
105 if (dump_enabled_p ())
106 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
107 "cannot use %s<%s><%s>\n", name,
108 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
109 return false;
110 }
111
112 if (dump_enabled_p ())
113 dump_printf_loc (MSG_NOTE, vect_location,
114 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
115 GET_MODE_NAME (mode));
116
117 return true;
118 }
119
120
121 /* Return the smallest scalar part of STMT.
122 This is used to determine the vectype of the stmt. We generally set the
123 vectype according to the type of the result (lhs). For stmts whose
124 result-type is different from the type of the arguments (e.g., demotion,
125 promotion), vectype will be reset appropriately (later). Note that we have
126 to visit the smallest datatype in this function, because that determines the
127 VF. If the smallest datatype in the loop is present only as the rhs of a
128 promotion operation, we'd miss it.
129 Such a case, where a variable of this datatype does not appear in the lhs
130 anywhere in the loop, can only occur if it's an invariant: e.g.:
131 'int_x = (int) short_inv', which we'd expect to have been optimized away by
132 invariant motion. However, we cannot rely on invariant motion to always
133 take invariants out of the loop, and so in the case of promotion we also
134 have to check the rhs.
135 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
136 types. */
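/* For example, in a loop like

     short *a;
     int *b;
     for (i = 0; i < n; i++)
       b[i] = (int) a[i];

   the result (lhs) type is int, but the vectorization factor has to be
   derived from the narrower short type that appears only on the rhs of
   the promotion; this function returns that smaller scalar type.  */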
137
138 tree
139 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
140 HOST_WIDE_INT *rhs_size_unit)
141 {
142 tree scalar_type = gimple_expr_type (stmt);
143 HOST_WIDE_INT lhs, rhs;
144
145 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
146
147 if (is_gimple_assign (stmt)
148 && (gimple_assign_cast_p (stmt)
149 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
150 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
151 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
152 {
153 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
154
155 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
156 if (rhs < lhs)
157 scalar_type = rhs_type;
158 }
159
160 *lhs_size_unit = lhs;
161 *rhs_size_unit = rhs;
162 return scalar_type;
163 }
164
165
166 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
167 tested at run-time. Return TRUE if DDR was successfully inserted.
168 Return false if versioning is not supported. */
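/* For example, in

     void
     f (int *p, int *q, int n)
     {
       for (int i = 0; i < n; i++)
         p[i] = q[i] + 1;
     }

   the compiler cannot prove that p and q do not overlap.  Marking the
   DDR here makes the vectorizer version the loop under a runtime check
   that the two accessed ranges are disjoint, and only the versioned
   copy is vectorized.  */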
169
170 static bool
171 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
172 {
173 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
174
175 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
176 return false;
177
178 if (dump_enabled_p ())
179 {
180 dump_printf_loc (MSG_NOTE, vect_location,
181 "mark for run-time aliasing test between ");
182 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
183 dump_printf (MSG_NOTE, " and ");
184 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
185 dump_printf (MSG_NOTE, "\n");
186 }
187
188 if (optimize_loop_nest_for_size_p (loop))
189 {
190 if (dump_enabled_p ())
191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
192 "versioning not supported when optimizing"
193 " for size.\n");
194 return false;
195 }
196
197 /* FORNOW: We don't support versioning with outer-loop vectorization. */
198 if (loop->inner)
199 {
200 if (dump_enabled_p ())
201 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
202 "versioning not yet supported for outer-loops.\n");
203 return false;
204 }
205
206 /* FORNOW: We don't support creating runtime alias tests for non-constant
207 step. */
208 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
209 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
210 {
211 if (dump_enabled_p ())
212 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
213 "versioning not yet supported for non-constant "
214 "step\n");
215 return false;
216 }
217
218 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
219 return true;
220 }
221
222
223 /* Function vect_analyze_data_ref_dependence.
224
225 Return TRUE if there (might) exist a dependence between a memory-reference
226 DRA and a memory-reference DRB. When versioning for alias may check a
227 dependence at run-time, return FALSE. Adjust *MAX_VF according to
228 the data dependence. */
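/* For example, in

     for (i = 0; i < n; i++)
       a[i + 4] = a[i] + 1;

   the value stored to a[i + 4] is read again 4 iterations later, so the
   dependence distance is 4 and *MAX_VF is capped at 4 below; the loop
   can still be vectorized with a vectorization factor of up to 4.  */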
229
230 static bool
231 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
232 loop_vec_info loop_vinfo, int *max_vf)
233 {
234 unsigned int i;
235 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
236 struct data_reference *dra = DDR_A (ddr);
237 struct data_reference *drb = DDR_B (ddr);
238 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
239 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
240 lambda_vector dist_v;
241 unsigned int loop_depth;
242
243 /* In loop analysis all data references should be vectorizable. */
244 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
245 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
246 gcc_unreachable ();
247
248 /* Independent data accesses. */
249 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
250 return false;
251
252 if (dra == drb
253 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
254 return false;
255
256 /* Even if we have an anti-dependence then, as the vectorized loop covers at
257 least two scalar iterations, there is always also a true dependence.
258 As the vectorizer does not re-order loads and stores we can ignore
259 the anti-dependence if TBAA can disambiguate both DRs similar to the
260 case with known negative distance anti-dependences (positive
261 distance anti-dependences would violate TBAA constraints). */
262 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
263 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
264 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
265 get_alias_set (DR_REF (drb))))
266 return false;
267
268 /* Unknown data dependence. */
269 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
270 {
271 /* If user asserted safelen consecutive iterations can be
272 executed concurrently, assume independence. */
273 if (loop->safelen >= 2)
274 {
275 if (loop->safelen < *max_vf)
276 *max_vf = loop->safelen;
277 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
278 return false;
279 }
280
281 if (STMT_VINFO_GATHER_P (stmtinfo_a)
282 || STMT_VINFO_GATHER_P (stmtinfo_b))
283 {
284 if (dump_enabled_p ())
285 {
286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
287 "versioning for alias not supported for: "
288 "can't determine dependence between ");
289 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
290 DR_REF (dra));
291 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
292 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
293 DR_REF (drb));
294 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
295 }
296 return true;
297 }
298
299 if (dump_enabled_p ())
300 {
301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
302 "versioning for alias required: "
303 "can't determine dependence between ");
304 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
305 DR_REF (dra));
306 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
307 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
308 DR_REF (drb));
309 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
310 }
311
312 /* Add to list of ddrs that need to be tested at run-time. */
313 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
314 }
315
316 /* Known data dependence. */
317 if (DDR_NUM_DIST_VECTS (ddr) == 0)
318 {
319 /* If user asserted safelen consecutive iterations can be
320 executed concurrently, assume independence. */
321 if (loop->safelen >= 2)
322 {
323 if (loop->safelen < *max_vf)
324 *max_vf = loop->safelen;
325 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
326 return false;
327 }
328
329 if (STMT_VINFO_GATHER_P (stmtinfo_a)
330 || STMT_VINFO_GATHER_P (stmtinfo_b))
331 {
332 if (dump_enabled_p ())
333 {
334 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
335 "versioning for alias not supported for: "
336 "bad dist vector for ");
337 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
338 DR_REF (dra));
339 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
340 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
341 DR_REF (drb));
342 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
343 }
344 return true;
345 }
346
347 if (dump_enabled_p ())
348 {
349 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
350 "versioning for alias required: "
351 "bad dist vector for ");
352 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
353 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
354 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
355 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
356 }
357 /* Add to list of ddrs that need to be tested at run-time. */
358 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
359 }
360
361 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
362 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
363 {
364 int dist = dist_v[loop_depth];
365
366 if (dump_enabled_p ())
367 dump_printf_loc (MSG_NOTE, vect_location,
368 "dependence distance = %d.\n", dist);
369
370 if (dist == 0)
371 {
372 if (dump_enabled_p ())
373 {
374 dump_printf_loc (MSG_NOTE, vect_location,
375 "dependence distance == 0 between ");
376 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
377 dump_printf (MSG_NOTE, " and ");
378 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
379 dump_printf (MSG_NOTE, "\n");
380 }
381
382 /* When we perform grouped accesses and perform implicit CSE
383 by detecting equal accesses and doing disambiguation with
384 runtime alias tests like for
385 .. = a[i];
386 .. = a[i+1];
387 a[i] = ..;
388 a[i+1] = ..;
389 *p = ..;
390 .. = a[i];
391 .. = a[i+1];
392 where we will end up loading { a[i], a[i+1] } once, make
393 sure that inserting group loads before the first load and
394 stores after the last store will do the right thing.
395 Similar for groups like
396 a[i] = ...;
397 ... = a[i];
398 a[i+1] = ...;
399 where loads from the group interleave with the store. */
400 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
401 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
402 {
403 gimple earlier_stmt;
404 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
405 if (DR_IS_WRITE
406 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
407 {
408 if (dump_enabled_p ())
409 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
410 "READ_WRITE dependence in interleaving."
411 "\n");
412 return true;
413 }
414 }
415
416 continue;
417 }
418
419 if (dist > 0 && DDR_REVERSED_P (ddr))
420 {
421 /* If DDR_REVERSED_P the order of the data-refs in DDR was
422 reversed (to make distance vector positive), and the actual
423 distance is negative. */
424 if (dump_enabled_p ())
425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
426 "dependence distance negative.\n");
427 /* Record a negative dependence distance to later limit the
428 amount of stmt copying / unrolling we can perform.
429 Only need to handle read-after-write dependence. */
430 if (DR_IS_READ (drb)
431 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
432 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
433 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
434 continue;
435 }
436
437 if (abs (dist) >= 2
438 && abs (dist) < *max_vf)
439 {
440 /* The dependence distance requires reduction of the maximal
441 vectorization factor. */
442 *max_vf = abs (dist);
443 if (dump_enabled_p ())
444 dump_printf_loc (MSG_NOTE, vect_location,
445 "adjusting maximal vectorization factor to %i\n",
446 *max_vf);
447 }
448
449 if (abs (dist) >= *max_vf)
450 {
451 /* Dependence distance does not create dependence, as far as
452 vectorization is concerned, in this case. */
453 if (dump_enabled_p ())
454 dump_printf_loc (MSG_NOTE, vect_location,
455 "dependence distance >= VF.\n");
456 continue;
457 }
458
459 if (dump_enabled_p ())
460 {
461 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
462 "not vectorized, possible dependence "
463 "between data-refs ");
464 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
465 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
466 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
467 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
468 }
469
470 return true;
471 }
472
473 return false;
474 }
475
476 /* Function vect_analyze_data_ref_dependences.
477
478 Examine all the data references in the loop, and make sure there do not
479 exist any data dependences between them. Set *MAX_VF according to
480 the maximum vectorization factor the data dependences allow. */
481
482 bool
483 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
484 {
485 unsigned int i;
486 struct data_dependence_relation *ddr;
487
488 if (dump_enabled_p ())
489 dump_printf_loc (MSG_NOTE, vect_location,
490 "=== vect_analyze_data_ref_dependences ===\n");
491
492 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
493 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
494 &LOOP_VINFO_DDRS (loop_vinfo),
495 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
496 return false;
497
498 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
499 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
500 return false;
501
502 return true;
503 }
504
505
506 /* Function vect_slp_analyze_data_ref_dependence.
507
508 Return TRUE if there (might) exist a dependence between a memory-reference
509 DRA and a memory-reference DRB that would prevent vectorization of the
510 basic block.  Unlike the loop variant there is no *MAX_VF to adjust and
511 no run-time alias versioning here. */
512
513 static bool
514 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
515 {
516 struct data_reference *dra = DDR_A (ddr);
517 struct data_reference *drb = DDR_B (ddr);
518
519 /* We need to check dependences of statements marked as unvectorizable
520 as well, they still can prohibit vectorization. */
521
522 /* Independent data accesses. */
523 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
524 return false;
525
526 if (dra == drb)
527 return false;
528
529 /* Read-read is OK. */
530 if (DR_IS_READ (dra) && DR_IS_READ (drb))
531 return false;
532
533 /* If dra and drb are part of the same interleaving chain consider
534 them independent. */
535 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
536 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
537 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
538 return false;
539
540 /* Unknown data dependence. */
541 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
542 {
543 if (dump_enabled_p ())
544 {
545 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
546 "can't determine dependence between ");
547 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
548 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
549 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
550 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
551 }
552 }
553 else if (dump_enabled_p ())
554 {
555 dump_printf_loc (MSG_NOTE, vect_location,
556 "determined dependence between ");
557 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
558 dump_printf (MSG_NOTE, " and ");
559 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
560 dump_printf (MSG_NOTE, "\n");
561 }
562
563 /* We do not vectorize basic blocks with write-write dependencies. */
564 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
565 return true;
566
567 /* If we have a read-write dependence, check that the load is before the store.
568 When we vectorize basic blocks, a vector load can only be placed before the
569 corresponding scalar load, and a vector store can only be placed after its
570 corresponding scalar store.  So the order of the accesses is preserved in
571 case the load is before the store. */
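/* For example, in a basic block containing

     x = a[0];
     a[0] = y;

   the scalar load precedes the store, so a vector load emitted at the
   position of the scalar load still reads the old value and the
   dependence is harmless; with the opposite statement order the
   dependence is reported and SLP is given up.  */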
572 gimple earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
573 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
574 {
575 /* That only holds for load-store pairs taking part in vectorization. */
576 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
577 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
578 return false;
579 }
580
581 return true;
582 }
583
584
585 /* Function vect_slp_analyze_data_ref_dependences.
586
587 Examine all the data references in the basic-block, and make sure there
588 do not exist any data dependences between them.  Return FALSE as soon as
589 a dependence that prevents basic-block vectorization is found. */
590
591 bool
592 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
593 {
594 struct data_dependence_relation *ddr;
595 unsigned int i;
596
597 if (dump_enabled_p ())
598 dump_printf_loc (MSG_NOTE, vect_location,
599 "=== vect_slp_analyze_data_ref_dependences ===\n");
600
601 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
602 &BB_VINFO_DDRS (bb_vinfo),
603 vNULL, true))
604 return false;
605
606 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
607 if (vect_slp_analyze_data_ref_dependence (ddr))
608 return false;
609
610 return true;
611 }
612
613
614 /* Function vect_compute_data_ref_alignment
615
616 Compute the misalignment of the data reference DR.
617
618 Output:
619 1. If during the misalignment computation it is found that the data reference
620 cannot be vectorized then false is returned.
621 2. DR_MISALIGNMENT (DR) is defined.
622
623 FOR NOW: No analysis is actually performed. Misalignment is calculated
624 only for trivial cases. TODO. */
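/* For example, for a 16-byte aligned array 'int a[N]' accessed as
   a[i + 1] with a V4SImode vectype, DR_BASE_ADDRESS is &a[0], DR_INIT
   is 4, and the computed DR_MISALIGNMENT is 4 bytes.  */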
625
626 static bool
627 vect_compute_data_ref_alignment (struct data_reference *dr)
628 {
629 gimple stmt = DR_STMT (dr);
630 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
631 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
632 struct loop *loop = NULL;
633 tree ref = DR_REF (dr);
634 tree vectype;
635 tree base, base_addr;
636 bool base_aligned;
637 tree misalign = NULL_TREE;
638 tree aligned_to;
639 unsigned HOST_WIDE_INT alignment;
640
641 if (dump_enabled_p ())
642 dump_printf_loc (MSG_NOTE, vect_location,
643 "vect_compute_data_ref_alignment:\n");
644
645 if (loop_vinfo)
646 loop = LOOP_VINFO_LOOP (loop_vinfo);
647
648 /* Initialize misalignment to unknown. */
649 SET_DR_MISALIGNMENT (dr, -1);
650
651 /* Strided accesses perform only component accesses, misalignment information
652 is irrelevant for them. */
653 if (STMT_VINFO_STRIDED_P (stmt_info)
654 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
655 return true;
656
657 if (tree_fits_shwi_p (DR_STEP (dr)))
658 misalign = DR_INIT (dr);
659 aligned_to = DR_ALIGNED_TO (dr);
660 base_addr = DR_BASE_ADDRESS (dr);
661 vectype = STMT_VINFO_VECTYPE (stmt_info);
662
663 /* In case the dataref is in an inner-loop of the loop that is being
664 vectorized (LOOP), we use the base and misalignment information
665 relative to the outer-loop (LOOP). This is ok only if the misalignment
666 stays the same throughout the execution of the inner-loop, which is why
667 we have to check that the stride of the dataref in the inner-loop is
668 divisible by the vector size. */
669 if (loop && nested_in_vect_loop_p (loop, stmt))
670 {
671 tree step = DR_STEP (dr);
672
673 if (tree_fits_shwi_p (step)
674 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
675 {
676 if (dump_enabled_p ())
677 dump_printf_loc (MSG_NOTE, vect_location,
678 "inner step divides the vector-size.\n");
679 misalign = STMT_VINFO_DR_INIT (stmt_info);
680 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
681 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
682 }
683 else
684 {
685 if (dump_enabled_p ())
686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
687 "inner step doesn't divide the vector-size.\n");
688 misalign = NULL_TREE;
689 }
690 }
691
692 /* Similarly we can only use base and misalignment information relative to
693 an innermost loop if the misalignment stays the same throughout the
694 execution of the loop.  As above, this is the case if the stride of the
695 dataref (times the vectorization factor) is divisible by the vector size. */
696 else
697 {
698 tree step = DR_STEP (dr);
699 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
700
701 if (tree_fits_shwi_p (step)
702 && ((tree_to_shwi (step) * vf)
703 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
704 {
705 if (dump_enabled_p ())
706 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
707 "step doesn't divide the vector-size.\n");
708 misalign = NULL_TREE;
709 }
710 }
711
712 alignment = TYPE_ALIGN_UNIT (vectype);
713
714 if ((compare_tree_int (aligned_to, alignment) < 0)
715 || !misalign)
716 {
717 if (dump_enabled_p ())
718 {
719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
720 "Unknown alignment for access: ");
721 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
722 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
723 }
724 return true;
725 }
726
727 /* To look at alignment of the base we have to preserve an inner MEM_REF
728 as that carries alignment information of the actual access. */
729 base = ref;
730 while (handled_component_p (base))
731 base = TREE_OPERAND (base, 0);
732 if (TREE_CODE (base) == MEM_REF)
733 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
734 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
735
736 if (get_object_alignment (base) >= TYPE_ALIGN (vectype))
737 base_aligned = true;
738 else
739 base_aligned = false;
740
741 if (!base_aligned)
742 {
743 /* Strip an inner MEM_REF to a bare decl if possible. */
744 if (TREE_CODE (base) == MEM_REF
745 && integer_zerop (TREE_OPERAND (base, 1))
746 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
747 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
748
749 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
750 {
751 if (dump_enabled_p ())
752 {
753 dump_printf_loc (MSG_NOTE, vect_location,
754 "can't force alignment of ref: ");
755 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
756 dump_printf (MSG_NOTE, "\n");
757 }
758 return true;
759 }
760
761 /* Force the alignment of the decl.
762 NOTE: This is the only change to the code we make during
763 the analysis phase, before deciding to vectorize the loop. */
764 if (dump_enabled_p ())
765 {
766 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
767 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
768 dump_printf (MSG_NOTE, "\n");
769 }
770
771 ((dataref_aux *)dr->aux)->base_decl = base;
772 ((dataref_aux *)dr->aux)->base_misaligned = true;
773 }
774
775 /* If this is a backward running DR then the first access in the larger
776 vectype actually is N-1 elements before the address in the DR.
777 Adjust misalign accordingly. */
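/* For example, for a V4SImode access with DR_STEP == -4 and a recorded
   misalignment of 12 bytes, the first vector element is loaded from
   (4 - 1) * 4 == 12 bytes below DR's address, so the effective
   misalignment computed below is 12 + 3 * -4 == 0.  */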
778 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
779 {
780 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
781 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
782 otherwise we wouldn't be here. */
783 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
784 /* PLUS because DR_STEP was negative. */
785 misalign = size_binop (PLUS_EXPR, misalign, offset);
786 }
787
788 SET_DR_MISALIGNMENT (dr,
789 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
790
791 if (dump_enabled_p ())
792 {
793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
794 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
795 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
796 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
797 }
798
799 return true;
800 }
801
802
803 /* Function vect_compute_data_refs_alignment
804
805 Compute the misalignment of data references in the loop.
806 Return FALSE if a data reference is found that cannot be vectorized. */
807
808 static bool
809 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
810 bb_vec_info bb_vinfo)
811 {
812 vec<data_reference_p> datarefs;
813 struct data_reference *dr;
814 unsigned int i;
815
816 if (loop_vinfo)
817 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
818 else
819 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
820
821 FOR_EACH_VEC_ELT (datarefs, i, dr)
822 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
823 && !vect_compute_data_ref_alignment (dr))
824 {
825 if (bb_vinfo)
826 {
827 /* Mark unsupported statement as unvectorizable. */
828 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
829 continue;
830 }
831 else
832 return false;
833 }
834
835 return true;
836 }
837
838
839 /* Function vect_update_misalignment_for_peel
840
841 DR - the data reference whose misalignment is to be adjusted.
842 DR_PEEL - the data reference whose misalignment is being made
843 zero in the vector loop by the peel.
844 NPEEL - the number of iterations in the peel loop if the misalignment
845 of DR_PEEL is known at compile time. */
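/* For example, peeling NPEEL == 3 iterations for DR_PEEL also advances
   a forward DR with element size 4 and misalignment 8 by 3 * 4 == 12
   bytes; with a 16-byte aligned vectype its new misalignment becomes
   (8 + 12) & 15 == 4.  */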
846
847 static void
848 vect_update_misalignment_for_peel (struct data_reference *dr,
849 struct data_reference *dr_peel, int npeel)
850 {
851 unsigned int i;
852 vec<dr_p> same_align_drs;
853 struct data_reference *current_dr;
854 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
855 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
856 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
857 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
858
859 /* For interleaved data accesses the step in the loop must be multiplied by
860 the size of the interleaving group. */
861 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
862 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
863 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
864 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
865
866 /* It can be assumed that the data refs with the same alignment as dr_peel
867 are aligned in the vector loop. */
868 same_align_drs
869 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
870 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
871 {
872 if (current_dr != dr)
873 continue;
874 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
875 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
876 SET_DR_MISALIGNMENT (dr, 0);
877 return;
878 }
879
880 if (known_alignment_for_access_p (dr)
881 && known_alignment_for_access_p (dr_peel))
882 {
883 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
884 int misal = DR_MISALIGNMENT (dr);
885 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
886 misal += negative ? -npeel * dr_size : npeel * dr_size;
887 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
888 SET_DR_MISALIGNMENT (dr, misal);
889 return;
890 }
891
892 if (dump_enabled_p ())
893 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
894 SET_DR_MISALIGNMENT (dr, -1);
895 }
896
897
898 /* Function vect_verify_datarefs_alignment
899
900 Return TRUE if all data references in the loop can be
901 handled with respect to alignment. */
902
903 bool
904 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
905 {
906 vec<data_reference_p> datarefs;
907 struct data_reference *dr;
908 enum dr_alignment_support supportable_dr_alignment;
909 unsigned int i;
910
911 if (loop_vinfo)
912 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
913 else
914 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
915
916 FOR_EACH_VEC_ELT (datarefs, i, dr)
917 {
918 gimple stmt = DR_STMT (dr);
919 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
920
921 if (!STMT_VINFO_RELEVANT_P (stmt_info))
922 continue;
923
924 /* For interleaving, only the alignment of the first access matters.
925 Skip statements marked as not vectorizable. */
926 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
927 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
928 || !STMT_VINFO_VECTORIZABLE (stmt_info))
929 continue;
930
931 /* Strided accesses perform only component accesses, alignment is
932 irrelevant for them. */
933 if (STMT_VINFO_STRIDED_P (stmt_info)
934 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
935 continue;
936
937 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
938 if (!supportable_dr_alignment)
939 {
940 if (dump_enabled_p ())
941 {
942 if (DR_IS_READ (dr))
943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
944 "not vectorized: unsupported unaligned load.");
945 else
946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
947 "not vectorized: unsupported unaligned "
948 "store.");
949
950 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
951 DR_REF (dr));
952 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
953 }
954 return false;
955 }
956 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
957 dump_printf_loc (MSG_NOTE, vect_location,
958 "Vectorizing an unaligned access.\n");
959 }
960 return true;
961 }
962
963 /* Given a memory reference EXP, return whether its alignment is less
964 than its size. */
965
966 static bool
967 not_size_aligned (tree exp)
968 {
969 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
970 return true;
971
972 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
973 > get_object_alignment (exp));
974 }
975
976 /* Function vector_alignment_reachable_p
977
978 Return true if vector alignment for DR is reachable by peeling
979 a few loop iterations. Return false otherwise. */
980
981 static bool
982 vector_alignment_reachable_p (struct data_reference *dr)
983 {
984 gimple stmt = DR_STMT (dr);
985 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
986 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
987
988 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
989 {
990 /* For interleaved accesses we peel only if the number of iterations in
991 the prolog loop (VF - misalignment) is a multiple of the
992 number of the interleaved accesses. */
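/* E.g. with 4 elements per vector and a misalignment of 1 element,
   peeling would need 4 - 1 == 3 scalar iterations; for a group of
   size 2 that is not a multiple of the group size, so peeling cannot
   align this access.  */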
993 int elem_size, mis_in_elements;
994 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
995
996 /* FORNOW: handle only known alignment. */
997 if (!known_alignment_for_access_p (dr))
998 return false;
999
1000 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1001 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1002
1003 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1004 return false;
1005 }
1006
1007 /* If misalignment is known at the compile time then allow peeling
1008 only if natural alignment is reachable through peeling. */
1009 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1010 {
1011 HOST_WIDE_INT elmsize =
1012 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1013 if (dump_enabled_p ())
1014 {
1015 dump_printf_loc (MSG_NOTE, vect_location,
1016 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1017 dump_printf (MSG_NOTE,
1018 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1019 }
1020 if (DR_MISALIGNMENT (dr) % elmsize)
1021 {
1022 if (dump_enabled_p ())
1023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1024 "data size does not divide the misalignment.\n");
1025 return false;
1026 }
1027 }
1028
1029 if (!known_alignment_for_access_p (dr))
1030 {
1031 tree type = TREE_TYPE (DR_REF (dr));
1032 bool is_packed = not_size_aligned (DR_REF (dr));
1033 if (dump_enabled_p ())
1034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1035 "Unknown misalignment, is_packed = %d\n",is_packed);
1036 if ((TYPE_USER_ALIGN (type) && !is_packed)
1037 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1038 return true;
1039 else
1040 return false;
1041 }
1042
1043 return true;
1044 }
1045
1046
1047 /* Calculate the cost of the memory access represented by DR. */
1048
1049 static void
1050 vect_get_data_access_cost (struct data_reference *dr,
1051 unsigned int *inside_cost,
1052 unsigned int *outside_cost,
1053 stmt_vector_for_cost *body_cost_vec)
1054 {
1055 gimple stmt = DR_STMT (dr);
1056 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1057 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1058 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1059 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1060 int ncopies = vf / nunits;
1061
1062 if (DR_IS_READ (dr))
1063 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1064 NULL, body_cost_vec, false);
1065 else
1066 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1067
1068 if (dump_enabled_p ())
1069 dump_printf_loc (MSG_NOTE, vect_location,
1070 "vect_get_data_access_cost: inside_cost = %d, "
1071 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1072 }
1073
1074
1075 /* Insert DR into peeling hash table with NPEEL as key. */
1076
1077 static void
1078 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1079 int npeel)
1080 {
1081 struct _vect_peel_info elem, *slot;
1082 _vect_peel_info **new_slot;
1083 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1084
1085 elem.npeel = npeel;
1086 slot = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find (&elem);
1087 if (slot)
1088 slot->count++;
1089 else
1090 {
1091 slot = XNEW (struct _vect_peel_info);
1092 slot->npeel = npeel;
1093 slot->dr = dr;
1094 slot->count = 1;
1095 new_slot
1096 = LOOP_VINFO_PEELING_HTAB (loop_vinfo)->find_slot (slot, INSERT);
1097 *new_slot = slot;
1098 }
1099
1100 if (!supportable_dr_alignment
1101 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1102 slot->count += VECT_MAX_COST;
1103 }
1104
1105
1106 /* Traverse the peeling hash table to find the peeling option that aligns
1107 the maximum number of data accesses. */
1108
1109 int
1110 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1111 _vect_peel_extended_info *max)
1112 {
1113 vect_peel_info elem = *slot;
1114
1115 if (elem->count > max->peel_info.count
1116 || (elem->count == max->peel_info.count
1117 && max->peel_info.npeel > elem->npeel))
1118 {
1119 max->peel_info.npeel = elem->npeel;
1120 max->peel_info.count = elem->count;
1121 max->peel_info.dr = elem->dr;
1122 }
1123
1124 return 1;
1125 }
1126
1127
1128 /* Traverse peeling hash table and calculate cost for each peeling option.
1129 Find the one with the lowest cost. */
1130
1131 int
1132 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1133 _vect_peel_extended_info *min)
1134 {
1135 vect_peel_info elem = *slot;
1136 int save_misalignment, dummy;
1137 unsigned int inside_cost = 0, outside_cost = 0, i;
1138 gimple stmt = DR_STMT (elem->dr);
1139 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1140 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1141 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1142 struct data_reference *dr;
1143 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1144
1145 prologue_cost_vec.create (2);
1146 body_cost_vec.create (2);
1147 epilogue_cost_vec.create (2);
1148
1149 FOR_EACH_VEC_ELT (datarefs, i, dr)
1150 {
1151 stmt = DR_STMT (dr);
1152 stmt_info = vinfo_for_stmt (stmt);
1153 /* For interleaving, only the alignment of the first access
1154 matters. */
1155 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1156 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1157 continue;
1158
1159 save_misalignment = DR_MISALIGNMENT (dr);
1160 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1161 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1162 &body_cost_vec);
1163 SET_DR_MISALIGNMENT (dr, save_misalignment);
1164 }
1165
1166 outside_cost += vect_get_known_peeling_cost
1167 (loop_vinfo, elem->npeel, &dummy,
1168 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1169 &prologue_cost_vec, &epilogue_cost_vec);
1170
1171 /* Prologue and epilogue costs are added to the target model later.
1172 These costs depend only on the scalar iteration cost, the
1173 number of peeling iterations finally chosen, and the number of
1174 misaligned statements. So discard the information found here. */
1175 prologue_cost_vec.release ();
1176 epilogue_cost_vec.release ();
1177
1178 if (inside_cost < min->inside_cost
1179 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1180 {
1181 min->inside_cost = inside_cost;
1182 min->outside_cost = outside_cost;
1183 min->body_cost_vec.release ();
1184 min->body_cost_vec = body_cost_vec;
1185 min->peel_info.dr = elem->dr;
1186 min->peel_info.npeel = elem->npeel;
1187 }
1188 else
1189 body_cost_vec.release ();
1190
1191 return 1;
1192 }
1193
1194
1195 /* Choose best peeling option by traversing peeling hash table and either
1196 choosing an option with the lowest cost (if cost model is enabled) or the
1197 option that aligns as many accesses as possible. */
1198
1199 static struct data_reference *
1200 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1201 unsigned int *npeel,
1202 stmt_vector_for_cost *body_cost_vec)
1203 {
1204 struct _vect_peel_extended_info res;
1205
1206 res.peel_info.dr = NULL;
1207 res.body_cost_vec = stmt_vector_for_cost ();
1208
1209 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1210 {
1211 res.inside_cost = INT_MAX;
1212 res.outside_cost = INT_MAX;
1213 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1214 ->traverse <_vect_peel_extended_info *,
1215 vect_peeling_hash_get_lowest_cost> (&res);
1216 }
1217 else
1218 {
1219 res.peel_info.count = 0;
1220 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1221 ->traverse <_vect_peel_extended_info *,
1222 vect_peeling_hash_get_most_frequent> (&res);
1223 }
1224
1225 *npeel = res.peel_info.npeel;
1226 *body_cost_vec = res.body_cost_vec;
1227 return res.peel_info.dr;
1228 }
1229
1230
1231 /* Function vect_enhance_data_refs_alignment
1232
1233 This pass will use loop versioning and loop peeling in order to enhance
1234 the alignment of data references in the loop.
1235
1236 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1237 original loop is to be vectorized. Any other loops that are created by
1238 the transformations performed in this pass - are not supposed to be
1239 vectorized. This restriction will be relaxed.
1240
1241 This pass will require a cost model to guide it whether to apply peeling
1242 or versioning or a combination of the two. For example, the scheme that
1243 Intel uses when given a loop with several memory accesses, is as follows:
1244 choose one memory access ('p') whose alignment you want to force by doing
1245 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1246 other accesses are not necessarily aligned, or (2) use loop versioning to
1247 generate one loop in which all accesses are aligned, and another loop in
1248 which only 'p' is necessarily aligned.
1249
1250 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1251 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1252 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1253
1254 Devising a cost model is the most critical aspect of this work. It will
1255 guide us on which access to peel for, whether to use loop versioning, how
1256 many versions to create, etc. The cost model will probably consist of
1257 generic considerations as well as target specific considerations (on
1258 powerpc for example, misaligned stores are more painful than misaligned
1259 loads).
1260
1261 Here are the general steps involved in alignment enhancements:
1262
1263 -- original loop, before alignment analysis:
1264 for (i=0; i<N; i++){
1265 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1266 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1267 }
1268
1269 -- After vect_compute_data_refs_alignment:
1270 for (i=0; i<N; i++){
1271 x = q[i]; # DR_MISALIGNMENT(q) = 3
1272 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1273 }
1274
1275 -- Possibility 1: we do loop versioning:
1276 if (p is aligned) {
1277 for (i=0; i<N; i++){ # loop 1A
1278 x = q[i]; # DR_MISALIGNMENT(q) = 3
1279 p[i] = y; # DR_MISALIGNMENT(p) = 0
1280 }
1281 }
1282 else {
1283 for (i=0; i<N; i++){ # loop 1B
1284 x = q[i]; # DR_MISALIGNMENT(q) = 3
1285 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1286 }
1287 }
1288
1289 -- Possibility 2: we do loop peeling:
1290 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1291 x = q[i];
1292 p[i] = y;
1293 }
1294 for (i = 3; i < N; i++){ # loop 2A
1295 x = q[i]; # DR_MISALIGNMENT(q) = 0
1296 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1297 }
1298
1299 -- Possibility 3: combination of loop peeling and versioning:
1300 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1301 x = q[i];
1302 p[i] = y;
1303 }
1304 if (p is aligned) {
1305 for (i = 3; i<N; i++){ # loop 3A
1306 x = q[i]; # DR_MISALIGNMENT(q) = 0
1307 p[i] = y; # DR_MISALIGNMENT(p) = 0
1308 }
1309 }
1310 else {
1311 for (i = 3; i<N; i++){ # loop 3B
1312 x = q[i]; # DR_MISALIGNMENT(q) = 0
1313 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1314 }
1315 }
1316
1317 These loops are later passed to loop_transform to be vectorized. The
1318 vectorizer will use the alignment information to guide the transformation
1319 (whether to generate regular loads/stores, or with special handling for
1320 misalignment). */
1321
1322 bool
1323 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1324 {
1325 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1326 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1327 enum dr_alignment_support supportable_dr_alignment;
1328 struct data_reference *dr0 = NULL, *first_store = NULL;
1329 struct data_reference *dr;
1330 unsigned int i, j;
1331 bool do_peeling = false;
1332 bool do_versioning = false;
1333 bool stat;
1334 gimple stmt;
1335 stmt_vec_info stmt_info;
1336 unsigned int npeel = 0;
1337 bool all_misalignments_unknown = true;
1338 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1339 unsigned possible_npeel_number = 1;
1340 tree vectype;
1341 unsigned int nelements, mis, same_align_drs_max = 0;
1342 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1343
1344 if (dump_enabled_p ())
1345 dump_printf_loc (MSG_NOTE, vect_location,
1346 "=== vect_enhance_data_refs_alignment ===\n");
1347
1348 /* While cost model enhancements are expected in the future, the high level
1349 view of the code at this time is as follows:
1350
1351 A) If there is a misaligned access then see if peeling to align
1352 this access can make all data references satisfy
1353 vect_supportable_dr_alignment. If so, update data structures
1354 as needed and return true.
1355
1356 B) If peeling wasn't possible and there is a data reference with an
1357 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1358 then see if loop versioning checks can be used to make all data
1359 references satisfy vect_supportable_dr_alignment. If so, update
1360 data structures as needed and return true.
1361
1362 C) If neither peeling nor versioning were successful then return false if
1363 any data reference does not satisfy vect_supportable_dr_alignment.
1364
1365 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1366
1367 Note, Possibility 3 above (which is peeling and versioning together) is not
1368 being done at this time. */
1369
1370 /* (1) Peeling to force alignment. */
1371
1372 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1373 Considerations:
1374 + How many accesses will become aligned due to the peeling
1375 - How many accesses will become unaligned due to the peeling,
1376 and the cost of misaligned accesses.
1377 - The cost of peeling (the extra runtime checks, the increase
1378 in code size). */
1379
1380 FOR_EACH_VEC_ELT (datarefs, i, dr)
1381 {
1382 stmt = DR_STMT (dr);
1383 stmt_info = vinfo_for_stmt (stmt);
1384
1385 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1386 continue;
1387
1388 /* For interleaving, only the alignment of the first access
1389 matters. */
1390 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1391 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1392 continue;
1393
1394 /* For invariant accesses there is nothing to enhance. */
1395 if (integer_zerop (DR_STEP (dr)))
1396 continue;
1397
1398 /* Strided accesses perform only component accesses, alignment is
1399 irrelevant for them. */
1400 if (STMT_VINFO_STRIDED_P (stmt_info)
1401 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1402 continue;
1403
1404 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1405 do_peeling = vector_alignment_reachable_p (dr);
1406 if (do_peeling)
1407 {
1408 if (known_alignment_for_access_p (dr))
1409 {
1410 unsigned int npeel_tmp;
1411 bool negative = tree_int_cst_compare (DR_STEP (dr),
1412 size_zero_node) < 0;
1413
1414 /* Save info about DR in the hash table. */
1415 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1416 LOOP_VINFO_PEELING_HTAB (loop_vinfo)
1417 = new hash_table<peel_info_hasher> (1);
1418
1419 vectype = STMT_VINFO_VECTYPE (stmt_info);
1420 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1421 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1422 TREE_TYPE (DR_REF (dr))));
1423 npeel_tmp = (negative
1424 ? (mis - nelements) : (nelements - mis))
1425 & (nelements - 1);
1426
1427 /* For multiple types, it is possible that the bigger type access
1428 will have more than one peeling option. E.g., a loop with two
1429 types: one of size (vector size / 4), and the other one of
1430 size (vector size / 8).  The vectorization factor will be 8.  If both
1431 accesses are misaligned by 3, the first one needs one scalar
1432 iteration to be aligned, and the second one needs 5.  But
1433 the first one will also be aligned by peeling 5 scalar
1434 iterations, and in that case both accesses will be aligned.
1435 Hence, except for the immediate peeling amount, we also want
1436 to try to add full vector size, while we don't exceed
1437 vectorization factor.
1438 We do this automatically for the cost model, since we calculate the cost
1439 for every peeling option. */
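/* For example, for a V4SI access misaligned by 3 elements with VF == 8,
   npeel_tmp starts at (4 - 3) & 3 == 1 and possible_npeel_number below
   is 8 / 4 == 2, so the peeling amounts 1 and 5 both get recorded in
   the hash table.  */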
1440 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1441 {
1442 if (STMT_SLP_TYPE (stmt_info))
1443 possible_npeel_number
1444 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1445 else
1446 possible_npeel_number = vf / nelements;
1447 }
1448
1449 /* Handle the aligned case. We may decide to align some other
1450 access, making DR unaligned. */
1451 if (DR_MISALIGNMENT (dr) == 0)
1452 {
1453 npeel_tmp = 0;
1454 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1455 possible_npeel_number++;
1456 }
1457
1458 for (j = 0; j < possible_npeel_number; j++)
1459 {
1460 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1461 npeel_tmp += nelements;
1462 }
1463
1464 all_misalignments_unknown = false;
1465 /* Data-ref that was chosen for the case that all the
1466 misalignments are unknown is not relevant anymore, since we
1467 have a data-ref with known alignment. */
1468 dr0 = NULL;
1469 }
1470 else
1471 {
1472 /* If we don't know any misalignment values, we prefer
1473 peeling for data-ref that has the maximum number of data-refs
1474 with the same alignment, unless the target prefers to align
1475 stores over loads. */
1476 if (all_misalignments_unknown)
1477 {
1478 unsigned same_align_drs
1479 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1480 if (!dr0
1481 || same_align_drs_max < same_align_drs)
1482 {
1483 same_align_drs_max = same_align_drs;
1484 dr0 = dr;
1485 }
1486 /* For data-refs with the same number of related
1487 accesses prefer the one where the misalign
1488 computation will be invariant in the outermost loop. */
1489 else if (same_align_drs_max == same_align_drs)
1490 {
1491 struct loop *ivloop0, *ivloop;
1492 ivloop0 = outermost_invariant_loop_for_expr
1493 (loop, DR_BASE_ADDRESS (dr0));
1494 ivloop = outermost_invariant_loop_for_expr
1495 (loop, DR_BASE_ADDRESS (dr));
1496 if ((ivloop && !ivloop0)
1497 || (ivloop && ivloop0
1498 && flow_loop_nested_p (ivloop, ivloop0)))
1499 dr0 = dr;
1500 }
1501
1502 if (!first_store && DR_IS_WRITE (dr))
1503 first_store = dr;
1504 }
1505
1506 /* If there are both known and unknown misaligned accesses in the
1507 loop, we choose peeling amount according to the known
1508 accesses. */
1509 if (!supportable_dr_alignment)
1510 {
1511 dr0 = dr;
1512 if (!first_store && DR_IS_WRITE (dr))
1513 first_store = dr;
1514 }
1515 }
1516 }
1517 else
1518 {
1519 if (!aligned_access_p (dr))
1520 {
1521 if (dump_enabled_p ())
1522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1523 "vector alignment may not be reachable\n");
1524 break;
1525 }
1526 }
1527 }
1528
1529 /* Check if we can possibly peel the loop. */
1530 if (!vect_can_advance_ivs_p (loop_vinfo)
1531 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1532 do_peeling = false;
1533
1534 if (do_peeling
1535 && all_misalignments_unknown
1536 && vect_supportable_dr_alignment (dr0, false))
1537 {
1538 /* Check if the target requires to prefer stores over loads, i.e., if
1539 misaligned stores are more expensive than misaligned loads (taking
1540 drs with same alignment into account). */
1541 if (first_store && DR_IS_READ (dr0))
1542 {
1543 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1544 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1545 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1546 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1547 stmt_vector_for_cost dummy;
1548 dummy.create (2);
1549
1550 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1551 &dummy);
1552 vect_get_data_access_cost (first_store, &store_inside_cost,
1553 &store_outside_cost, &dummy);
1554
1555 dummy.release ();
1556
1557 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1558 aligning the load DR0). */
1559 load_inside_penalty = store_inside_cost;
1560 load_outside_penalty = store_outside_cost;
1561 for (i = 0;
1562 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1563 DR_STMT (first_store))).iterate (i, &dr);
1564 i++)
1565 if (DR_IS_READ (dr))
1566 {
1567 load_inside_penalty += load_inside_cost;
1568 load_outside_penalty += load_outside_cost;
1569 }
1570 else
1571 {
1572 load_inside_penalty += store_inside_cost;
1573 load_outside_penalty += store_outside_cost;
1574 }
1575
1576 /* Calculate the penalty for leaving DR0 unaligned (by
1577 aligning the FIRST_STORE). */
1578 store_inside_penalty = load_inside_cost;
1579 store_outside_penalty = load_outside_cost;
1580 for (i = 0;
1581 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1582 DR_STMT (dr0))).iterate (i, &dr);
1583 i++)
1584 if (DR_IS_READ (dr))
1585 {
1586 store_inside_penalty += load_inside_cost;
1587 store_outside_penalty += load_outside_cost;
1588 }
1589 else
1590 {
1591 store_inside_penalty += store_inside_cost;
1592 store_outside_penalty += store_outside_cost;
1593 }
1594
1595 if (load_inside_penalty > store_inside_penalty
1596 || (load_inside_penalty == store_inside_penalty
1597 && load_outside_penalty > store_outside_penalty))
1598 dr0 = first_store;
1599 }
1600
1601 /* In case there are only loads with different unknown misalignments, use
1602 peeling only if it may help to align other accesses in the loop or
1603 if it may help to improve load bandwidth when we'd end up using
1604 unaligned loads. */
1605 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1606 if (!first_store
1607 && !STMT_VINFO_SAME_ALIGN_REFS (
1608 vinfo_for_stmt (DR_STMT (dr0))).length ()
1609 && (vect_supportable_dr_alignment (dr0, false)
1610 != dr_unaligned_supported
1611 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1612 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1613 do_peeling = false;
1614 }
1615
1616 if (do_peeling && !dr0)
1617 {
1618 /* Peeling is possible, but no data access strictly requires alignment to
1619 be supported.  So we try to choose the best possible peeling. */
1620
1621 /* We should get here only if there are drs with known misalignment. */
1622 gcc_assert (!all_misalignments_unknown);
1623
1624 /* Choose the best peeling from the hash table. */
1625 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1626 &body_cost_vec);
1627 if (!dr0 || !npeel)
1628 do_peeling = false;
1629 }
1630
1631 if (do_peeling)
1632 {
1633 stmt = DR_STMT (dr0);
1634 stmt_info = vinfo_for_stmt (stmt);
1635 vectype = STMT_VINFO_VECTYPE (stmt_info);
1636 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1637
1638 if (known_alignment_for_access_p (dr0))
1639 {
1640 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1641 size_zero_node) < 0;
1642 if (!npeel)
1643 {
1644 /* Since it's known at compile time, compute the number of
1645 iterations in the peeled loop (the peeling factor) for use in
1646 updating DR_MISALIGNMENT values. The peeling factor is the
1647 vectorization factor minus the misalignment as an element
1648 count. */
1649 mis = DR_MISALIGNMENT (dr0);
1650 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1651 npeel = ((negative ? mis - nelements : nelements - mis)
1652 & (nelements - 1));
1653 }
1654
1655 /* For interleaved data access every iteration accesses all the
1656 members of the group, therefore we divide the number of iterations
1657 by the group size. */
1658 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1659 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1660 npeel /= GROUP_SIZE (stmt_info);
1661
1662 if (dump_enabled_p ())
1663 dump_printf_loc (MSG_NOTE, vect_location,
1664 "Try peeling by %d\n", npeel);
1665 }
1666
1667 /* Ensure that all data refs can be vectorized after the peel. */
1668 FOR_EACH_VEC_ELT (datarefs, i, dr)
1669 {
1670 int save_misalignment;
1671
1672 if (dr == dr0)
1673 continue;
1674
1675 stmt = DR_STMT (dr);
1676 stmt_info = vinfo_for_stmt (stmt);
1677 /* For interleaving, only the alignment of the first access
1678 matters. */
1679 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1680 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1681 continue;
1682
1683 /* Strided accesses perform only component accesses, alignment is
1684 irrelevant for them. */
1685 if (STMT_VINFO_STRIDED_P (stmt_info)
1686 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1687 continue;
1688
1689 save_misalignment = DR_MISALIGNMENT (dr);
1690 vect_update_misalignment_for_peel (dr, dr0, npeel);
1691 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1692 SET_DR_MISALIGNMENT (dr, save_misalignment);
1693
1694 if (!supportable_dr_alignment)
1695 {
1696 do_peeling = false;
1697 break;
1698 }
1699 }
1700
1701 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1702 {
1703 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1704 if (!stat)
1705 do_peeling = false;
1706 else
1707 {
1708 body_cost_vec.release ();
1709 return stat;
1710 }
1711 }
1712
1713 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1714 if (do_peeling)
1715 {
1716 unsigned max_allowed_peel
1717 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1718 if (max_allowed_peel != (unsigned)-1)
1719 {
1720 unsigned max_peel = npeel;
1721 if (max_peel == 0)
1722 {
1723 gimple dr_stmt = DR_STMT (dr0);
1724 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1725 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1726 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1727 }
1728 if (max_peel > max_allowed_peel)
1729 {
1730 do_peeling = false;
1731 if (dump_enabled_p ())
1732 dump_printf_loc (MSG_NOTE, vect_location,
1733 "Disable peeling, max peels reached: %d\n", max_peel);
1734 }
1735 }
1736 }
1737
1738 /* Cost model #2 - if peeling may result in a remaining loop not
1739 iterating enough to be vectorized then do not peel. */
1740 if (do_peeling
1741 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1742 {
1743 unsigned max_peel
1744 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1745 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1746 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1747 do_peeling = false;
1748 }
1749
1750 if (do_peeling)
1751 {
1752 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1753 If the misalignment of DR_i is identical to that of dr0 then set
1754 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1755 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1756 by the peeling factor times the element size of DR_i (MOD the
1757 vectorization factor times the size). Otherwise, the
1758 misalignment of DR_i must be set to unknown. */
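         /* For instance (illustrative numbers only): with 4-byte elements,
            16-byte vectors and npeel == 2, a DR_i whose known misalignment
            was 12 bytes ends up with (12 + 2 * 4) % 16 == 4, while dr0
            itself (misalignment 8) becomes (8 + 2 * 4) % 16 == 0.  */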
1759 FOR_EACH_VEC_ELT (datarefs, i, dr)
1760 if (dr != dr0)
1761 vect_update_misalignment_for_peel (dr, dr0, npeel);
1762
1763 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1764 if (npeel)
1765 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1766 else
1767 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1768 = DR_MISALIGNMENT (dr0);
1769 SET_DR_MISALIGNMENT (dr0, 0);
1770 if (dump_enabled_p ())
1771 {
1772 dump_printf_loc (MSG_NOTE, vect_location,
1773 "Alignment of access forced using peeling.\n");
1774 dump_printf_loc (MSG_NOTE, vect_location,
1775 "Peeling for alignment will be applied.\n");
1776 }
1777 /* The inside-loop cost will be accounted for in vectorizable_load
1778 and vectorizable_store correctly with adjusted alignments.
1779          Drop the body_cost_vec on the floor here.  */
1780 body_cost_vec.release ();
1781
1782 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1783 gcc_assert (stat);
1784 return stat;
1785 }
1786 }
1787
1788 body_cost_vec.release ();
1789
1790 /* (2) Versioning to force alignment. */
1791
1792 /* Try versioning if:
1793 1) optimize loop for speed
1794 2) there is at least one unsupported misaligned data ref with an unknown
1795 misalignment, and
1796 3) all misaligned data refs with a known misalignment are supported, and
1797 4) the number of runtime alignment checks is within reason. */
1798
1799 do_versioning =
1800 optimize_loop_nest_for_speed_p (loop)
1801 && (!loop->inner); /* FORNOW */
1802
1803 if (do_versioning)
1804 {
1805 FOR_EACH_VEC_ELT (datarefs, i, dr)
1806 {
1807 stmt = DR_STMT (dr);
1808 stmt_info = vinfo_for_stmt (stmt);
1809
1810 /* For interleaving, only the alignment of the first access
1811 matters. */
1812 if (aligned_access_p (dr)
1813 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1814 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1815 continue;
1816
1817 if (STMT_VINFO_STRIDED_P (stmt_info))
1818 {
1819 /* Strided loads perform only component accesses, alignment is
1820 irrelevant for them. */
1821 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1822 continue;
1823 do_versioning = false;
1824 break;
1825 }
1826
1827 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1828
1829 if (!supportable_dr_alignment)
1830 {
1831 gimple stmt;
1832 int mask;
1833 tree vectype;
1834
1835 if (known_alignment_for_access_p (dr)
1836 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1837 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1838 {
1839 do_versioning = false;
1840 break;
1841 }
1842
1843 stmt = DR_STMT (dr);
1844 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1845 gcc_assert (vectype);
1846
1847 /* The rightmost bits of an aligned address must be zeros.
1848 Construct the mask needed for this test. For example,
1849 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1850 mask must be 15 = 0xf. */
1851 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
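              /* With this mask an address is vector-aligned iff
                 (addr & mask) == 0; the versioning guard built later checks
                 this for every access recorded in
                 LOOP_VINFO_MAY_MISALIGN_STMTS.  */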
1852
1853 /* FORNOW: use the same mask to test all potentially unaligned
1854 references in the loop. The vectorizer currently supports
1855 a single vector size, see the reference to
1856 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1857 vectorization factor is computed. */
1858 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1859 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1860 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1861 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1862 DR_STMT (dr));
1863 }
1864 }
1865
1866 /* Versioning requires at least one misaligned data reference. */
1867 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1868 do_versioning = false;
1869 else if (!do_versioning)
1870 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1871 }
1872
1873 if (do_versioning)
1874 {
1875 vec<gimple> may_misalign_stmts
1876 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1877 gimple stmt;
1878
1879 /* It can now be assumed that the data references in the statements
1880 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1881 of the loop being vectorized. */
1882 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1883 {
1884 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1885 dr = STMT_VINFO_DATA_REF (stmt_info);
1886 SET_DR_MISALIGNMENT (dr, 0);
1887 if (dump_enabled_p ())
1888 dump_printf_loc (MSG_NOTE, vect_location,
1889 "Alignment of access forced using versioning.\n");
1890 }
1891
1892 if (dump_enabled_p ())
1893 dump_printf_loc (MSG_NOTE, vect_location,
1894 "Versioning for alignment will be applied.\n");
1895
1896 /* Peeling and versioning can't be done together at this time. */
1897 gcc_assert (! (do_peeling && do_versioning));
1898
1899 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1900 gcc_assert (stat);
1901 return stat;
1902 }
1903
1904 /* This point is reached if neither peeling nor versioning is being done. */
1905 gcc_assert (! (do_peeling || do_versioning));
1906
1907 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1908 return stat;
1909 }
1910
1911
1912 /* Function vect_find_same_alignment_drs.
1913
1914 Update group and alignment relations according to the chosen
1915 vectorization factor. */
1916
1917 static void
1918 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1919 loop_vec_info loop_vinfo)
1920 {
1921 unsigned int i;
1922 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1923 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1924 struct data_reference *dra = DDR_A (ddr);
1925 struct data_reference *drb = DDR_B (ddr);
1926 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1927 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1928 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1929 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1930 lambda_vector dist_v;
1931 unsigned int loop_depth;
1932
1933 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1934 return;
1935
1936 if (dra == drb)
1937 return;
1938
1939 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1940 return;
1941
1942 /* Loop-based vectorization and known data dependence. */
1943 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1944 return;
1945
1946 /* Data-dependence analysis reports a distance vector of zero
1947 for data-references that overlap only in the first iteration
1948      but whose steps have different sign (see PR45764).
1949 So as a sanity check require equal DR_STEP. */
1950 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1951 return;
1952
1953 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1954 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1955 {
1956 int dist = dist_v[loop_depth];
1957
1958 if (dump_enabled_p ())
1959 dump_printf_loc (MSG_NOTE, vect_location,
1960 "dependence distance = %d.\n", dist);
1961
1962 /* Same loop iteration. */
1963 if (dist == 0
1964 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1965 {
1966 	  /* References at distance zero, or at a multiple of VF with equal element size, have the same alignment.  */
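	  /* E.g. (illustrative): for a unit-stride 4-byte access with VF == 4,
	     a dependence distance of 4 iterations means the two addresses
	     differ by 16 bytes, i.e. one vector width, so they fall into the
	     same alignment class.  */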
1967 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1968 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1969 if (dump_enabled_p ())
1970 {
1971 dump_printf_loc (MSG_NOTE, vect_location,
1972 "accesses have the same alignment.\n");
1973 dump_printf (MSG_NOTE,
1974 "dependence distance modulo vf == 0 between ");
1975 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1976 dump_printf (MSG_NOTE, " and ");
1977 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1978 dump_printf (MSG_NOTE, "\n");
1979 }
1980 }
1981 }
1982 }
1983
1984
1985 /* Function vect_analyze_data_refs_alignment
1986
1987 Analyze the alignment of the data-references in the loop.
1988 Return FALSE if a data reference is found that cannot be vectorized. */
1989
1990 bool
1991 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1992 bb_vec_info bb_vinfo)
1993 {
1994 if (dump_enabled_p ())
1995 dump_printf_loc (MSG_NOTE, vect_location,
1996 "=== vect_analyze_data_refs_alignment ===\n");
1997
1998 /* Mark groups of data references with same alignment using
1999 data dependence information. */
2000 if (loop_vinfo)
2001 {
2002 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
2003 struct data_dependence_relation *ddr;
2004 unsigned int i;
2005
2006 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2007 vect_find_same_alignment_drs (ddr, loop_vinfo);
2008 }
2009
2010 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2011 {
2012 if (dump_enabled_p ())
2013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2014 "not vectorized: can't calculate alignment "
2015 "for data ref.\n");
2016 return false;
2017 }
2018
2019 return true;
2020 }
2021
2022
2023 /* Analyze groups of accesses: check that DR belongs to a group of
2024 accesses of legal size, step, etc. Detect gaps, single element
2025 interleaving, and other special cases. Set grouped access info.
2026 Collect groups of strided stores for further use in SLP analysis. */
2027
2028 static bool
2029 vect_analyze_group_access (struct data_reference *dr)
2030 {
2031 tree step = DR_STEP (dr);
2032 tree scalar_type = TREE_TYPE (DR_REF (dr));
2033 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2034 gimple stmt = DR_STMT (dr);
2035 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2036 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2037 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2038 HOST_WIDE_INT dr_step = -1;
2039 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2040 bool slp_impossible = false;
2041 struct loop *loop = NULL;
2042
2043 if (loop_vinfo)
2044 loop = LOOP_VINFO_LOOP (loop_vinfo);
2045
2046 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2047 size of the interleaving group (including gaps). */
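   /* E.g. (illustrative): 4-byte elements with DR_STEP 16 give a groupsize
      of 4, i.e. four interleaved elements per scalar iteration.  */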
2048 if (tree_fits_shwi_p (step))
2049 {
2050 dr_step = tree_to_shwi (step);
2051 groupsize = absu_hwi (dr_step) / type_size;
2052 }
2053 else
2054 groupsize = 0;
2055
2056   /* A non-consecutive access is possible only if it is part of an interleaving group.  */
2057 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2058 {
2059       /* Check whether this DR is part of an interleaving group of which it
2060          is the only element accessed in the loop.  */
2061
2062 /* Gaps are supported only for loads. STEP must be a multiple of the type
2063 size. The size of the group must be a power of 2. */
2064 if (DR_IS_READ (dr)
2065 && (dr_step % type_size) == 0
2066 && groupsize > 0
2067 && exact_log2 (groupsize) != -1)
2068 {
2069 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2070 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2071 if (dump_enabled_p ())
2072 {
2073 dump_printf_loc (MSG_NOTE, vect_location,
2074 "Detected single element interleaving ");
2075 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2076 dump_printf (MSG_NOTE, " step ");
2077 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2078 dump_printf (MSG_NOTE, "\n");
2079 }
2080
2081 if (loop_vinfo)
2082 {
2083 if (dump_enabled_p ())
2084 dump_printf_loc (MSG_NOTE, vect_location,
2085 "Data access with gaps requires scalar "
2086 "epilogue loop\n");
2087 if (loop->inner)
2088 {
2089 if (dump_enabled_p ())
2090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2091 "Peeling for outer loop is not"
2092 " supported\n");
2093 return false;
2094 }
2095
2096 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2097 }
2098
2099 return true;
2100 }
2101
2102 if (dump_enabled_p ())
2103 {
2104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2105 "not consecutive access ");
2106 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2107 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2108 }
2109
2110 if (bb_vinfo)
2111 {
2112 /* Mark the statement as unvectorizable. */
2113 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2114 return true;
2115 }
2116
2117 return false;
2118 }
2119
2120 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2121 {
2122 /* First stmt in the interleaving chain. Check the chain. */
2123 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2124 struct data_reference *data_ref = dr;
2125 unsigned int count = 1;
2126 tree prev_init = DR_INIT (data_ref);
2127 gimple prev = stmt;
2128 HOST_WIDE_INT diff, gaps = 0;
2129
2130 while (next)
2131 {
2132           /* Skip same data-refs.  If two or more stmts share a data-ref
2133              (supported only for loads), we vectorize only the first stmt,
2134              and the rest get their vectorized loads from the first
2135              one.  */
2136 if (!tree_int_cst_compare (DR_INIT (data_ref),
2137 DR_INIT (STMT_VINFO_DATA_REF (
2138 vinfo_for_stmt (next)))))
2139 {
2140 if (DR_IS_WRITE (data_ref))
2141 {
2142 if (dump_enabled_p ())
2143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2144 "Two store stmts share the same dr.\n");
2145 return false;
2146 }
2147
2148 /* For load use the same data-ref load. */
2149 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2150
2151 prev = next;
2152 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2153 continue;
2154 }
2155
2156 prev = next;
2157 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2158
2159 /* All group members have the same STEP by construction. */
2160 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2161
2162 /* Check that the distance between two accesses is equal to the type
2163 size. Otherwise, we have gaps. */
2164 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2165 - TREE_INT_CST_LOW (prev_init)) / type_size;
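	      /* E.g. (illustrative): DR_INITs of 0 and 8 with 4-byte elements
	         give diff == 2, i.e. one element of the group is skipped
	         between these two members.  */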
2166 if (diff != 1)
2167 {
2168 /* FORNOW: SLP of accesses with gaps is not supported. */
2169 slp_impossible = true;
2170 if (DR_IS_WRITE (data_ref))
2171 {
2172 if (dump_enabled_p ())
2173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2174 "interleaved store with gaps\n");
2175 return false;
2176 }
2177
2178 gaps += diff - 1;
2179 }
2180
2181 last_accessed_element += diff;
2182
2183 /* Store the gap from the previous member of the group. If there is no
2184 gap in the access, GROUP_GAP is always 1. */
2185 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2186
2187 prev_init = DR_INIT (data_ref);
2188 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2189 /* Count the number of data-refs in the chain. */
2190 count++;
2191 }
2192
2193 if (groupsize == 0)
2194 groupsize = count + gaps;
2195
2196 /* Check that the size of the interleaving is equal to count for stores,
2197 i.e., that there are no gaps. */
2198 if (groupsize != count
2199 && !DR_IS_READ (dr))
2200 {
2201 if (dump_enabled_p ())
2202 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2203 "interleaved store with gaps\n");
2204 return false;
2205 }
2206
2207 /* If there is a gap after the last load in the group it is the
2208 difference between the groupsize and the last accessed
2209 element.
2210 When there is no gap, this difference should be 0. */
2211 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
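       /* E.g. (illustrative): groupsize 4 with last_accessed_element 3
          leaves a trailing gap of one element after the group.  */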
2212
2213 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2214 if (dump_enabled_p ())
2215 {
2216 dump_printf_loc (MSG_NOTE, vect_location,
2217 "Detected interleaving of size %d starting with ",
2218 (int)groupsize);
2219 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2220 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2221 dump_printf_loc (MSG_NOTE, vect_location,
2222 "There is a gap of %d elements after the group\n",
2223 (int)GROUP_GAP (vinfo_for_stmt (stmt)));
2224 }
2225
2226 /* SLP: create an SLP data structure for every interleaving group of
2227          stores for further analysis in vect_analyze_slp.  */
2228 if (DR_IS_WRITE (dr) && !slp_impossible)
2229 {
2230 if (loop_vinfo)
2231 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2232 if (bb_vinfo)
2233 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2234 }
2235
2236       /* If there is a gap at the end of the group or the group size cannot
2237 be made a multiple of the vector element count then we access excess
2238 elements in the last iteration and thus need to peel that off. */
2239 if (loop_vinfo
2240 && (groupsize - last_accessed_element > 0
2241 || exact_log2 (groupsize) == -1))
2242
2243 {
2244 if (dump_enabled_p ())
2245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2246 "Data access with gaps requires scalar "
2247 "epilogue loop\n");
2248 if (loop->inner)
2249 {
2250 if (dump_enabled_p ())
2251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2252 "Peeling for outer loop is not supported\n");
2253 return false;
2254 }
2255
2256 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2257 }
2258 }
2259
2260 return true;
2261 }
2262
2263
2264 /* Analyze the access pattern of the data-reference DR.
2265 In case of non-consecutive accesses call vect_analyze_group_access() to
2266 analyze groups of accesses. */
2267
2268 static bool
2269 vect_analyze_data_ref_access (struct data_reference *dr)
2270 {
2271 tree step = DR_STEP (dr);
2272 tree scalar_type = TREE_TYPE (DR_REF (dr));
2273 gimple stmt = DR_STMT (dr);
2274 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2275 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2276 struct loop *loop = NULL;
2277
2278 if (loop_vinfo)
2279 loop = LOOP_VINFO_LOOP (loop_vinfo);
2280
2281 if (loop_vinfo && !step)
2282 {
2283 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285 "bad data-ref access in loop\n");
2286 return false;
2287 }
2288
2289 /* Allow loads with zero step in inner-loop vectorization. */
2290 if (loop_vinfo && integer_zerop (step))
2291 {
2292 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2293 if (!nested_in_vect_loop_p (loop, stmt))
2294 return DR_IS_READ (dr);
2295 /* Allow references with zero step for outer loops marked
2296 with pragma omp simd only - it guarantees absence of
2297 loop-carried dependencies between inner loop iterations. */
2298 if (!loop->force_vectorize)
2299 {
2300 if (dump_enabled_p ())
2301 dump_printf_loc (MSG_NOTE, vect_location,
2302 "zero step in inner loop of nest\n");
2303 return false;
2304 }
2305 }
2306
2307 if (loop && nested_in_vect_loop_p (loop, stmt))
2308 {
2309 /* Interleaved accesses are not yet supported within outer-loop
2310 vectorization for references in the inner-loop. */
2311 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2312
2313 /* For the rest of the analysis we use the outer-loop step. */
2314 step = STMT_VINFO_DR_STEP (stmt_info);
2315 if (integer_zerop (step))
2316 {
2317 if (dump_enabled_p ())
2318 dump_printf_loc (MSG_NOTE, vect_location,
2319 "zero step in outer loop.\n");
2320 if (DR_IS_READ (dr))
2321 return true;
2322 else
2323 return false;
2324 }
2325 }
2326
2327 /* Consecutive? */
2328 if (TREE_CODE (step) == INTEGER_CST)
2329 {
2330 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2331 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2332 || (dr_step < 0
2333 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2334 {
2335 /* Mark that it is not interleaving. */
2336 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2337 return true;
2338 }
2339 }
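      /* E.g. (illustrative): a 4-byte access with DR_STEP 4, or with
	 DR_STEP -4 for a backward walk, is consecutive and needs no
	 interleaving analysis.  */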
2340
2341 if (loop && nested_in_vect_loop_p (loop, stmt))
2342 {
2343 if (dump_enabled_p ())
2344 dump_printf_loc (MSG_NOTE, vect_location,
2345 "grouped access in outer loop.\n");
2346 return false;
2347 }
2348
2349
2350   /* Assume this is a DR handled by the non-constant strided load case.  */
2351 if (TREE_CODE (step) != INTEGER_CST)
2352 return (STMT_VINFO_STRIDED_P (stmt_info)
2353 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2354 || vect_analyze_group_access (dr)));
2355
2356   /* Not a consecutive access - check if it is part of an interleaving group.  */
2357 return vect_analyze_group_access (dr);
2358 }
2359
2360
2361
2362 /* A helper function used in the comparator function to sort data
2363    references.  T1 and T2 are the two trees to be compared.  The function
2364    returns -1, 0, or 1.  */
2365
2366 static int
2367 compare_tree (tree t1, tree t2)
2368 {
2369 int i, cmp;
2370 enum tree_code code;
2371 char tclass;
2372
2373 if (t1 == t2)
2374 return 0;
2375 if (t1 == NULL)
2376 return -1;
2377 if (t2 == NULL)
2378 return 1;
2379
2380
2381 if (TREE_CODE (t1) != TREE_CODE (t2))
2382 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2383
2384 code = TREE_CODE (t1);
2385 switch (code)
2386 {
2387 /* For const values, we can just use hash values for comparisons. */
2388 case INTEGER_CST:
2389 case REAL_CST:
2390 case FIXED_CST:
2391 case STRING_CST:
2392 case COMPLEX_CST:
2393 case VECTOR_CST:
2394 {
2395 hashval_t h1 = iterative_hash_expr (t1, 0);
2396 hashval_t h2 = iterative_hash_expr (t2, 0);
2397 if (h1 != h2)
2398 return h1 < h2 ? -1 : 1;
2399 break;
2400 }
2401
2402 case SSA_NAME:
2403 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2404 if (cmp != 0)
2405 return cmp;
2406
2407 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2408 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2409 break;
2410
2411 default:
2412 tclass = TREE_CODE_CLASS (code);
2413
2414 /* For var-decl, we could compare their UIDs. */
2415 if (tclass == tcc_declaration)
2416 {
2417 if (DECL_UID (t1) != DECL_UID (t2))
2418 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2419 break;
2420 }
2421
2422 /* For expressions with operands, compare their operands recursively. */
2423 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2424 {
2425 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2426 if (cmp != 0)
2427 return cmp;
2428 }
2429 }
2430
2431 return 0;
2432 }
2433
2434
2435 /* Compare two data-references DRA and DRB so that related accesses end
2436    up adjacent after sorting, in chunks suitable for grouping.  */
2437
2438 static int
2439 dr_group_sort_cmp (const void *dra_, const void *drb_)
2440 {
2441 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2442 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2443 int cmp;
2444
2445 /* Stabilize sort. */
2446 if (dra == drb)
2447 return 0;
2448
2449 /* Ordering of DRs according to base. */
2450 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2451 {
2452 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2453 if (cmp != 0)
2454 return cmp;
2455 }
2456
2457 /* And according to DR_OFFSET. */
2458 if (!dr_equal_offsets_p (dra, drb))
2459 {
2460 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2461 if (cmp != 0)
2462 return cmp;
2463 }
2464
2465 /* Put reads before writes. */
2466 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2467 return DR_IS_READ (dra) ? -1 : 1;
2468
2469   /* Then sort by access size.  */
2470 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2471 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2472 {
2473 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2474 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2475 if (cmp != 0)
2476 return cmp;
2477 }
2478
2479   /* And by step.  */
2480 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2481 {
2482 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2483 if (cmp != 0)
2484 return cmp;
2485 }
2486
2487   /* Then sort by DR_INIT.  In case of identical DRs sort by stmt UID.  */
2488 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2489 if (cmp == 0)
2490 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2491 return cmp;
2492 }
2493
2494 /* Function vect_analyze_data_ref_accesses.
2495
2496 Analyze the access pattern of all the data references in the loop.
2497
2498 FORNOW: the only access pattern that is considered vectorizable is a
2499 simple step 1 (consecutive) access.
2500
2501 FORNOW: handle only arrays and pointer accesses. */
2502
2503 bool
2504 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2505 {
2506 unsigned int i;
2507 vec<data_reference_p> datarefs;
2508 struct data_reference *dr;
2509
2510 if (dump_enabled_p ())
2511 dump_printf_loc (MSG_NOTE, vect_location,
2512 "=== vect_analyze_data_ref_accesses ===\n");
2513
2514 if (loop_vinfo)
2515 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2516 else
2517 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2518
2519 if (datarefs.is_empty ())
2520 return true;
2521
2522 /* Sort the array of datarefs to make building the interleaving chains
2523 linear. Don't modify the original vector's order, it is needed for
2524 determining what dependencies are reversed. */
2525 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2526 datarefs_copy.qsort (dr_group_sort_cmp);
2527
2528 /* Build the interleaving chains. */
2529 for (i = 0; i < datarefs_copy.length () - 1;)
2530 {
2531 data_reference_p dra = datarefs_copy[i];
2532 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2533 stmt_vec_info lastinfo = NULL;
2534 for (i = i + 1; i < datarefs_copy.length (); ++i)
2535 {
2536 data_reference_p drb = datarefs_copy[i];
2537 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2538
2539           /* ??? Imperfect sorting (non-compatible types, non-modulo
2540              accesses, same accesses) can lead to a group being artificially
2541              split here as we don't just skip over those.  If it really
2542              matters we can push those to a worklist and re-iterate
2543              over them.  Then we can just skip ahead to the next DR here.  */
2544
2545           /* Check that the data-refs have the same first location (except init)
2546              and that they are both either stores or loads (not one load and one
2547              store, and not masked loads or stores).  */
2548 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2549 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2550 DR_BASE_ADDRESS (drb), 0)
2551 || !dr_equal_offsets_p (dra, drb)
2552 || !gimple_assign_single_p (DR_STMT (dra))
2553 || !gimple_assign_single_p (DR_STMT (drb)))
2554 break;
2555
2556 /* Check that the data-refs have the same constant size. */
2557 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2558 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2559 if (!tree_fits_uhwi_p (sza)
2560 || !tree_fits_uhwi_p (szb)
2561 || !tree_int_cst_equal (sza, szb))
2562 break;
2563
2564 /* Check that the data-refs have the same step. */
2565 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2566 break;
2567
2568 /* Do not place the same access in the interleaving chain twice. */
2569 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2570 break;
2571
2572 /* Check the types are compatible.
2573 ??? We don't distinguish this during sorting. */
2574 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2575 TREE_TYPE (DR_REF (drb))))
2576 break;
2577
2578 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2579 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2580 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2581 gcc_assert (init_a < init_b);
2582
2583 /* If init_b == init_a + the size of the type * k, we have an
2584 interleaving, and DRA is accessed before DRB. */
2585 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2586 if ((init_b - init_a) % type_size_a != 0)
2587 break;
2588
2589           /* For a store, require that the accesses be adjacent.  This splits
2590              groups into chunks we support (we don't support vectorization
2591              of stores with gaps).  */
2592 if (!DR_IS_READ (dra)
2593 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2594 (DR_INIT (datarefs_copy[i-1]))
2595 != type_size_a))
2596 break;
2597
2598           /* If the step (when non-zero and constant) is not greater than the
2599              difference between the data-refs' inits, split the group here;
2600              this keeps group sizes suitable.  */
2601 if (tree_fits_shwi_p (DR_STEP (dra)))
2602 {
2603 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2604 if (step != 0 && step <= (init_b - init_a))
2605 break;
2606 }
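	  /* A small illustrative example (hypothetical refs): two 4-byte loads
	     a[2*i] and a[2*i+1] have inits 0 and 4 and step 8; (4 - 0) % 4 == 0
	     and 8 > 4, so none of the checks above splits them and they end up
	     linked into one interleaving group of size 2.  */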
2607
2608 if (dump_enabled_p ())
2609 {
2610 dump_printf_loc (MSG_NOTE, vect_location,
2611 "Detected interleaving ");
2612 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2613 dump_printf (MSG_NOTE, " and ");
2614 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2615 dump_printf (MSG_NOTE, "\n");
2616 }
2617
2618 /* Link the found element into the group list. */
2619 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2620 {
2621 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2622 lastinfo = stmtinfo_a;
2623 }
2624 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2625 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2626 lastinfo = stmtinfo_b;
2627 }
2628 }
2629
2630 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2631 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2632 && !vect_analyze_data_ref_access (dr))
2633 {
2634 if (dump_enabled_p ())
2635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2636 "not vectorized: complicated access pattern.\n");
2637
2638 if (bb_vinfo)
2639 {
2640 /* Mark the statement as not vectorizable. */
2641 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2642 continue;
2643 }
2644 else
2645 {
2646 datarefs_copy.release ();
2647 return false;
2648 }
2649 }
2650
2651 datarefs_copy.release ();
2652 return true;
2653 }
2654
2655
2656 /* Operator == between two dr_with_seg_len objects.
2657
2658    This equality operator is used to make sure two data refs
2659    are the same one, so that we can consider combining the
2660    aliasing checks of two pairs of data-dependent data
2661    refs.  */
2662
2663 static bool
2664 operator == (const dr_with_seg_len& d1,
2665 const dr_with_seg_len& d2)
2666 {
2667 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2668 DR_BASE_ADDRESS (d2.dr), 0)
2669 && compare_tree (d1.offset, d2.offset) == 0
2670 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2671 }
2672
2673 /* Function comp_dr_with_seg_len_pair.
2674
2675 Comparison function for sorting objects of dr_with_seg_len_pair_t
2676 so that we can combine aliasing checks in one scan. */
2677
2678 static int
2679 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2680 {
2681 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2682 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2683
2684 const dr_with_seg_len &p11 = p1->first,
2685 &p12 = p1->second,
2686 &p21 = p2->first,
2687 &p22 = p2->second;
2688
2689   /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2690      if a and c have the same base address and step, and b and d have the same
2691      address and step.  Therefore, if either a&c or b&d don't have the same
2692      address and step, we don't care about the order of those two pairs after sorting.  */
2693 int comp_res;
2694
2695 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2696 DR_BASE_ADDRESS (p21.dr))) != 0)
2697 return comp_res;
2698 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2699 DR_BASE_ADDRESS (p22.dr))) != 0)
2700 return comp_res;
2701 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2702 return comp_res;
2703 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2704 return comp_res;
2705 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2706 return comp_res;
2707 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2708 return comp_res;
2709
2710 return 0;
2711 }
2712
2713 /* Function vect_vfa_segment_size.
2714
2715    Create an expression that computes the size of the segment
2716    that will be accessed for a data reference.  The function takes into
2717    account that realignment loads may access one more vector.
2718
2719 Input:
2720 DR: The data reference.
2721 LENGTH_FACTOR: segment length to consider.
2722
2723 Return an expression whose value is the size of segment which will be
2724 accessed by DR. */
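/* For example (illustrative): a data reference with DR_STEP 4 and a
   LENGTH_FACTOR of 8 yields a 32-byte segment; when the optimized
   explicit-realignment scheme is used, one extra vector's worth of bytes
   is added on top of that.  */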
2725
2726 static tree
2727 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2728 {
2729 tree segment_length;
2730
2731 if (integer_zerop (DR_STEP (dr)))
2732 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2733 else
2734 segment_length = size_binop (MULT_EXPR,
2735 fold_convert (sizetype, DR_STEP (dr)),
2736 fold_convert (sizetype, length_factor));
2737
2738 if (vect_supportable_dr_alignment (dr, false)
2739 == dr_explicit_realign_optimized)
2740 {
2741 tree vector_size = TYPE_SIZE_UNIT
2742 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2743
2744 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2745 }
2746 return segment_length;
2747 }
2748
2749 /* Function vect_prune_runtime_alias_test_list.
2750
2751 Prune a list of ddrs to be tested at run-time by versioning for alias.
2752 Merge several alias checks into one if possible.
2753    Return FALSE if the resulting list of ddrs is longer than allowed by
2754    PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE.  */
2755
2756 bool
2757 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2758 {
2759 vec<ddr_p> may_alias_ddrs =
2760 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2761 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2762 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2763 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2764 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2765
2766 ddr_p ddr;
2767 unsigned int i;
2768 tree length_factor;
2769
2770 if (dump_enabled_p ())
2771 dump_printf_loc (MSG_NOTE, vect_location,
2772 "=== vect_prune_runtime_alias_test_list ===\n");
2773
2774 if (may_alias_ddrs.is_empty ())
2775 return true;
2776
2777 /* Basically, for each pair of dependent data refs store_ptr_0
2778 and load_ptr_0, we create an expression:
2779
2780 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2781 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2782
2783 for aliasing checks. However, in some cases we can decrease
2784 the number of checks by combining two checks into one. For
2785 example, suppose we have another pair of data refs store_ptr_0
2786 and load_ptr_1, and if the following condition is satisfied:
2787
2788 load_ptr_0 < load_ptr_1 &&
2789 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2790
2791      (this condition means that, in each iteration of the vectorized loop,
2792      the memory accessed through store_ptr_0 cannot lie between the memory
2793      of load_ptr_0 and load_ptr_1.)
2794 
2795      then we can use only the following expression to finish the
2796      aliasing checks between store_ptr_0 & load_ptr_0 and
2797      store_ptr_0 & load_ptr_1:
2798
2799 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2800 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2801
2802      Note that we only consider the case where load_ptr_0 and load_ptr_1
2803      have the same base address.  */
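
  /* A purely illustrative instance: if load_ptr_1 - load_ptr_0 == 4 and
     load_segment_length_0 == store_segment_length_0 == 16, then
     4 - 16 < 16 holds, so the two checks against store_ptr_0 collapse
     into a single one.  */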
2804
2805 comp_alias_ddrs.create (may_alias_ddrs.length ());
2806
2807 /* First, we collect all data ref pairs for aliasing checks. */
2808 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2809 {
2810 struct data_reference *dr_a, *dr_b;
2811 gimple dr_group_first_a, dr_group_first_b;
2812 tree segment_length_a, segment_length_b;
2813 gimple stmt_a, stmt_b;
2814
2815 dr_a = DDR_A (ddr);
2816 stmt_a = DR_STMT (DDR_A (ddr));
2817 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2818 if (dr_group_first_a)
2819 {
2820 stmt_a = dr_group_first_a;
2821 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2822 }
2823
2824 dr_b = DDR_B (ddr);
2825 stmt_b = DR_STMT (DDR_B (ddr));
2826 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2827 if (dr_group_first_b)
2828 {
2829 stmt_b = dr_group_first_b;
2830 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2831 }
2832
2833 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2834 length_factor = scalar_loop_iters;
2835 else
2836 length_factor = size_int (vect_factor);
2837 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2838 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2839
2840 dr_with_seg_len_pair_t dr_with_seg_len_pair
2841 (dr_with_seg_len (dr_a, segment_length_a),
2842 dr_with_seg_len (dr_b, segment_length_b));
2843
2844 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2845 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2846
2847 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2848 }
2849
2850 /* Second, we sort the collected data ref pairs so that we can scan
2851 them once to combine all possible aliasing checks. */
2852 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2853
2854 /* Third, we scan the sorted dr pairs and check if we can combine
2855 alias checks of two neighbouring dr pairs. */
2856 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2857 {
2858 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2859 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2860 *dr_b1 = &comp_alias_ddrs[i-1].second,
2861 *dr_a2 = &comp_alias_ddrs[i].first,
2862 *dr_b2 = &comp_alias_ddrs[i].second;
2863
2864 /* Remove duplicate data ref pairs. */
2865 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2866 {
2867 if (dump_enabled_p ())
2868 {
2869 dump_printf_loc (MSG_NOTE, vect_location,
2870 "found equal ranges ");
2871 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2872 DR_REF (dr_a1->dr));
2873 dump_printf (MSG_NOTE, ", ");
2874 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2875 DR_REF (dr_b1->dr));
2876 dump_printf (MSG_NOTE, " and ");
2877 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2878 DR_REF (dr_a2->dr));
2879 dump_printf (MSG_NOTE, ", ");
2880 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2881 DR_REF (dr_b2->dr));
2882 dump_printf (MSG_NOTE, "\n");
2883 }
2884
2885 comp_alias_ddrs.ordered_remove (i--);
2886 continue;
2887 }
2888
2889 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
2890 {
2891 	  /* We consider the case where DR_B1 and DR_B2 are the same memref,
2892 	     and DR_A1 and DR_A2 are two consecutive memrefs.  */
2893 if (*dr_a1 == *dr_a2)
2894 {
2895 std::swap (dr_a1, dr_b1);
2896 std::swap (dr_a2, dr_b2);
2897 }
2898
2899 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
2900 DR_BASE_ADDRESS (dr_a2->dr),
2901 0)
2902 || !tree_fits_shwi_p (dr_a1->offset)
2903 || !tree_fits_shwi_p (dr_a2->offset))
2904 continue;
2905
2906 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
2907 - tree_to_shwi (dr_a1->offset));
2908
2909
2910 /* Now we check if the following condition is satisfied:
2911
2912 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
2913
2914 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
2915 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
2916 have to make a best estimation. We can get the minimum value
2917 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
2918 then either of the following two conditions can guarantee the
2919 one above:
2920
2921 1: DIFF <= MIN_SEG_LEN_B
2922 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
2923
2924 */
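	  /* Illustrative check (hypothetical values): DIFF == 4 with
	     MIN_SEG_LEN_B == 16 already satisfies condition 1, so the two
	     pairs below would be merged and dr_a1's segment length extended
	     by DIFF to cover dr_a2 as well.  */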
2925
2926 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
2927 ? tree_to_shwi (dr_b1->seg_len)
2928 : vect_factor);
2929
2930 if (diff <= min_seg_len_b
2931 || (tree_fits_shwi_p (dr_a1->seg_len)
2932 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
2933 {
2934 if (dump_enabled_p ())
2935 {
2936 dump_printf_loc (MSG_NOTE, vect_location,
2937 "merging ranges for ");
2938 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2939 DR_REF (dr_a1->dr));
2940 dump_printf (MSG_NOTE, ", ");
2941 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2942 DR_REF (dr_b1->dr));
2943 dump_printf (MSG_NOTE, " and ");
2944 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2945 DR_REF (dr_a2->dr));
2946 dump_printf (MSG_NOTE, ", ");
2947 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2948 DR_REF (dr_b2->dr));
2949 dump_printf (MSG_NOTE, "\n");
2950 }
2951
2952 dr_a1->seg_len = size_binop (PLUS_EXPR,
2953 dr_a2->seg_len, size_int (diff));
2954 comp_alias_ddrs.ordered_remove (i--);
2955 }
2956 }
2957 }
2958
2959 dump_printf_loc (MSG_NOTE, vect_location,
2960 "improved number of alias checks from %d to %d\n",
2961 may_alias_ddrs.length (), comp_alias_ddrs.length ());
2962 if ((int) comp_alias_ddrs.length () >
2963 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2964 return false;
2965
2966 return true;
2967 }
2968
2969 /* Check whether a non-affine read in STMT is suitable for a gather load
2970    and if so, return a builtin decl for that operation.  */
2971
2972 tree
2973 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2974 tree *offp, int *scalep)
2975 {
2976 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2977 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2978 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2979 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2980 tree offtype = NULL_TREE;
2981 tree decl, base, off;
2982 machine_mode pmode;
2983 int punsignedp, pvolatilep;
2984
2985 base = DR_REF (dr);
2986 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
2987 see if we can use the def stmt of the address. */
2988 if (is_gimple_call (stmt)
2989 && gimple_call_internal_p (stmt)
2990 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
2991 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
2992 && TREE_CODE (base) == MEM_REF
2993 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
2994 && integer_zerop (TREE_OPERAND (base, 1))
2995 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
2996 {
2997 gimple def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
2998 if (is_gimple_assign (def_stmt)
2999 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3000 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3001 }
3002
3003 /* The gather builtins need address of the form
3004 loop_invariant + vector * {1, 2, 4, 8}
3005 or
3006 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3007 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3008 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3009 multiplications and additions in it. To get a vector, we need
3010 a single SSA_NAME that will be defined in the loop and will
3011 contain everything that is not loop invariant and that can be
3012    vectorized.  The following code attempts to find such a preexisting
3013 SSA_NAME OFF and put the loop invariants into a tree BASE
3014 that can be gimplified before the loop. */
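   /* For instance (a hypothetical example): for a read a[b[i]] with 4-byte
      elements, the decomposition below would be expected to produce
      BASE == &a, OFF == the SSA_NAME holding b[i] (possibly sign-extended)
      and SCALE == 4, matching the gather builtin's addressing form.  */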
3015 base = get_inner_reference (base, &pbitsize, &pbitpos, &off,
3016 &pmode, &punsignedp, &pvolatilep, false);
3017 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
3018
3019 if (TREE_CODE (base) == MEM_REF)
3020 {
3021 if (!integer_zerop (TREE_OPERAND (base, 1)))
3022 {
3023 if (off == NULL_TREE)
3024 {
3025 offset_int moff = mem_ref_offset (base);
3026 off = wide_int_to_tree (sizetype, moff);
3027 }
3028 else
3029 off = size_binop (PLUS_EXPR, off,
3030 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3031 }
3032 base = TREE_OPERAND (base, 0);
3033 }
3034 else
3035 base = build_fold_addr_expr (base);
3036
3037 if (off == NULL_TREE)
3038 off = size_zero_node;
3039
3040   /* If base is not loop invariant, then if off is 0 we start with just the
3041      constant offset in the loop invariant BASE and continue with base
3042      as OFF; otherwise give up.
3043      We could handle that case by gimplifying the addition of base + off
3044      into some SSA_NAME and using that as off, but for now punt.  */
3045 if (!expr_invariant_in_loop_p (loop, base))
3046 {
3047 if (!integer_zerop (off))
3048 return NULL_TREE;
3049 off = base;
3050 base = size_int (pbitpos / BITS_PER_UNIT);
3051 }
3052 /* Otherwise put base + constant offset into the loop invariant BASE
3053 and continue with OFF. */
3054 else
3055 {
3056 base = fold_convert (sizetype, base);
3057 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3058 }
3059
3060 /* OFF at this point may be either a SSA_NAME or some tree expression
3061 from get_inner_reference. Try to peel off loop invariants from it
3062 into BASE as long as possible. */
3063 STRIP_NOPS (off);
3064 while (offtype == NULL_TREE)
3065 {
3066 enum tree_code code;
3067 tree op0, op1, add = NULL_TREE;
3068
3069 if (TREE_CODE (off) == SSA_NAME)
3070 {
3071 gimple def_stmt = SSA_NAME_DEF_STMT (off);
3072
3073 if (expr_invariant_in_loop_p (loop, off))
3074 return NULL_TREE;
3075
3076 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3077 break;
3078
3079 op0 = gimple_assign_rhs1 (def_stmt);
3080 code = gimple_assign_rhs_code (def_stmt);
3081 op1 = gimple_assign_rhs2 (def_stmt);
3082 }
3083 else
3084 {
3085 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3086 return NULL_TREE;
3087 code = TREE_CODE (off);
3088 extract_ops_from_tree (off, &code, &op0, &op1);
3089 }
3090 switch (code)
3091 {
3092 case POINTER_PLUS_EXPR:
3093 case PLUS_EXPR:
3094 if (expr_invariant_in_loop_p (loop, op0))
3095 {
3096 add = op0;
3097 off = op1;
3098 do_add:
3099 add = fold_convert (sizetype, add);
3100 if (scale != 1)
3101 add = size_binop (MULT_EXPR, add, size_int (scale));
3102 base = size_binop (PLUS_EXPR, base, add);
3103 continue;
3104 }
3105 if (expr_invariant_in_loop_p (loop, op1))
3106 {
3107 add = op1;
3108 off = op0;
3109 goto do_add;
3110 }
3111 break;
3112 case MINUS_EXPR:
3113 if (expr_invariant_in_loop_p (loop, op1))
3114 {
3115 add = fold_convert (sizetype, op1);
3116 add = size_binop (MINUS_EXPR, size_zero_node, add);
3117 off = op0;
3118 goto do_add;
3119 }
3120 break;
3121 case MULT_EXPR:
3122 if (scale == 1 && tree_fits_shwi_p (op1))
3123 {
3124 scale = tree_to_shwi (op1);
3125 off = op0;
3126 continue;
3127 }
3128 break;
3129 case SSA_NAME:
3130 off = op0;
3131 continue;
3132 CASE_CONVERT:
3133 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3134 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3135 break;
3136 if (TYPE_PRECISION (TREE_TYPE (op0))
3137 == TYPE_PRECISION (TREE_TYPE (off)))
3138 {
3139 off = op0;
3140 continue;
3141 }
3142 if (TYPE_PRECISION (TREE_TYPE (op0))
3143 < TYPE_PRECISION (TREE_TYPE (off)))
3144 {
3145 off = op0;
3146 offtype = TREE_TYPE (off);
3147 STRIP_NOPS (off);
3148 continue;
3149 }
3150 break;
3151 default:
3152 break;
3153 }
3154 break;
3155 }
3156
3157 /* If at the end OFF still isn't a SSA_NAME or isn't
3158 defined in the loop, punt. */
3159 if (TREE_CODE (off) != SSA_NAME
3160 || expr_invariant_in_loop_p (loop, off))
3161 return NULL_TREE;
3162
3163 if (offtype == NULL_TREE)
3164 offtype = TREE_TYPE (off);
3165
3166 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3167 offtype, scale);
3168 if (decl == NULL_TREE)
3169 return NULL_TREE;
3170
3171 if (basep)
3172 *basep = base;
3173 if (offp)
3174 *offp = off;
3175 if (scalep)
3176 *scalep = scale;
3177 return decl;
3178 }
3179
3180 /* Function vect_analyze_data_refs.
3181
3182 Find all the data references in the loop or basic block.
3183
3184 The general structure of the analysis of data refs in the vectorizer is as
3185 follows:
3186 1- vect_analyze_data_refs(loop/bb): call
3187 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3188 in the loop/bb and their dependences.
3189 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3190 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3191 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3192
3193 */
3194
3195 bool
3196 vect_analyze_data_refs (loop_vec_info loop_vinfo,
3197 bb_vec_info bb_vinfo,
3198 int *min_vf, unsigned *n_stmts)
3199 {
3200 struct loop *loop = NULL;
3201 basic_block bb = NULL;
3202 unsigned int i;
3203 vec<data_reference_p> datarefs;
3204 struct data_reference *dr;
3205 tree scalar_type;
3206
3207 if (dump_enabled_p ())
3208 dump_printf_loc (MSG_NOTE, vect_location,
3209 "=== vect_analyze_data_refs ===\n");
3210
3211 if (loop_vinfo)
3212 {
3213 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
3214
3215 loop = LOOP_VINFO_LOOP (loop_vinfo);
3216 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
3217 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
3218 {
3219 if (dump_enabled_p ())
3220 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3221 "not vectorized: loop contains function calls"
3222 " or data references that cannot be analyzed\n");
3223 return false;
3224 }
3225
3226 for (i = 0; i < loop->num_nodes; i++)
3227 {
3228 gimple_stmt_iterator gsi;
3229
3230 for (gsi = gsi_start_bb (bbs[i]); !gsi_end_p (gsi); gsi_next (&gsi))
3231 {
3232 gimple stmt = gsi_stmt (gsi);
3233 if (is_gimple_debug (stmt))
3234 continue;
3235 ++*n_stmts;
3236 if (!find_data_references_in_stmt (loop, stmt, &datarefs))
3237 {
3238 if (is_gimple_call (stmt) && loop->safelen)
3239 {
3240 tree fndecl = gimple_call_fndecl (stmt), op;
3241 if (fndecl != NULL_TREE)
3242 {
3243 struct cgraph_node *node = cgraph_node::get (fndecl);
3244 if (node != NULL && node->simd_clones != NULL)
3245 {
3246 unsigned int j, n = gimple_call_num_args (stmt);
3247 for (j = 0; j < n; j++)
3248 {
3249 op = gimple_call_arg (stmt, j);
3250 if (DECL_P (op)
3251 || (REFERENCE_CLASS_P (op)
3252 && get_base_address (op)))
3253 break;
3254 }
3255 op = gimple_call_lhs (stmt);
3256 /* Ignore #pragma omp declare simd functions
3257 if they don't have data references in the
3258 call stmt itself. */
3259 if (j == n
3260 && !(op
3261 && (DECL_P (op)
3262 || (REFERENCE_CLASS_P (op)
3263 && get_base_address (op)))))
3264 continue;
3265 }
3266 }
3267 }
3268 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3269 if (dump_enabled_p ())
3270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3271 "not vectorized: loop contains function "
3272 "calls or data references that cannot "
3273 "be analyzed\n");
3274 return false;
3275 }
3276 }
3277 }
3278
3279 LOOP_VINFO_DATAREFS (loop_vinfo) = datarefs;
3280 }
3281 else
3282 {
3283 gimple_stmt_iterator gsi;
3284
3285 bb = BB_VINFO_BB (bb_vinfo);
3286 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
3287 {
3288 gimple stmt = gsi_stmt (gsi);
3289 if (is_gimple_debug (stmt))
3290 continue;
3291 ++*n_stmts;
3292 if (!find_data_references_in_stmt (NULL, stmt,
3293 &BB_VINFO_DATAREFS (bb_vinfo)))
3294 {
3295 /* Mark the rest of the basic-block as unvectorizable. */
3296 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3297 {
3298 stmt = gsi_stmt (gsi);
3299 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3300 }
3301 break;
3302 }
3303 }
3304
3305 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3306 }
3307
3308 /* Go through the data-refs, check that the analysis succeeded. Update
3309 pointer from stmt_vec_info struct to DR and vectype. */
3310
3311 FOR_EACH_VEC_ELT (datarefs, i, dr)
3312 {
3313 gimple stmt;
3314 stmt_vec_info stmt_info;
3315 tree base, offset, init;
3316 bool gather = false;
3317 bool simd_lane_access = false;
3318 int vf;
3319
3320 again:
3321 if (!dr || !DR_REF (dr))
3322 {
3323 if (dump_enabled_p ())
3324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3325 "not vectorized: unhandled data-ref\n");
3326 return false;
3327 }
3328
3329 stmt = DR_STMT (dr);
3330 stmt_info = vinfo_for_stmt (stmt);
3331
3332 /* Discard clobbers from the dataref vector. We will remove
3333 clobber stmts during vectorization. */
3334 if (gimple_clobber_p (stmt))
3335 {
3336 free_data_ref (dr);
3337 if (i == datarefs.length () - 1)
3338 {
3339 datarefs.pop ();
3340 break;
3341 }
3342 datarefs.ordered_remove (i);
3343 dr = datarefs[i];
3344 goto again;
3345 }
3346
3347 /* Check that analysis of the data-ref succeeded. */
3348 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3349 || !DR_STEP (dr))
3350 {
3351 bool maybe_gather
3352 = DR_IS_READ (dr)
3353 && !TREE_THIS_VOLATILE (DR_REF (dr))
3354 && targetm.vectorize.builtin_gather != NULL;
3355 bool maybe_simd_lane_access
3356 = loop_vinfo && loop->simduid;
3357
3358 	  /* If the target supports vector gather loads, or if this might be
3359 	     a SIMD lane access, see if they can be used.  */
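	  /* A SIMD lane access here means a reference whose offset is
	     GOMP_SIMD_LANE (loop->simduid) scaled by the element size, as
	     typically created for variables privatized in #pragma omp simd
	     loops; the pattern matching below checks exactly that shape.  */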
3360 if (loop_vinfo
3361 && (maybe_gather || maybe_simd_lane_access)
3362 && !nested_in_vect_loop_p (loop, stmt))
3363 {
3364 struct data_reference *newdr
3365 = create_data_ref (NULL, loop_containing_stmt (stmt),
3366 DR_REF (dr), stmt, true);
3367 gcc_assert (newdr != NULL && DR_REF (newdr));
3368 if (DR_BASE_ADDRESS (newdr)
3369 && DR_OFFSET (newdr)
3370 && DR_INIT (newdr)
3371 && DR_STEP (newdr)
3372 && integer_zerop (DR_STEP (newdr)))
3373 {
3374 if (maybe_simd_lane_access)
3375 {
3376 tree off = DR_OFFSET (newdr);
3377 STRIP_NOPS (off);
3378 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3379 && TREE_CODE (off) == MULT_EXPR
3380 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3381 {
3382 tree step = TREE_OPERAND (off, 1);
3383 off = TREE_OPERAND (off, 0);
3384 STRIP_NOPS (off);
3385 if (CONVERT_EXPR_P (off)
3386 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3387 0)))
3388 < TYPE_PRECISION (TREE_TYPE (off)))
3389 off = TREE_OPERAND (off, 0);
3390 if (TREE_CODE (off) == SSA_NAME)
3391 {
3392 gimple def = SSA_NAME_DEF_STMT (off);
3393 tree reft = TREE_TYPE (DR_REF (newdr));
3394 if (is_gimple_call (def)
3395 && gimple_call_internal_p (def)
3396 && (gimple_call_internal_fn (def)
3397 == IFN_GOMP_SIMD_LANE))
3398 {
3399 tree arg = gimple_call_arg (def, 0);
3400 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3401 arg = SSA_NAME_VAR (arg);
3402 if (arg == loop->simduid
3403 /* For now. */
3404 && tree_int_cst_equal
3405 (TYPE_SIZE_UNIT (reft),
3406 step))
3407 {
3408 DR_OFFSET (newdr) = ssize_int (0);
3409 DR_STEP (newdr) = step;
3410 DR_ALIGNED_TO (newdr)
3411 = size_int (BIGGEST_ALIGNMENT);
3412 dr = newdr;
3413 simd_lane_access = true;
3414 }
3415 }
3416 }
3417 }
3418 }
3419 if (!simd_lane_access && maybe_gather)
3420 {
3421 dr = newdr;
3422 gather = true;
3423 }
3424 }
3425 if (!gather && !simd_lane_access)
3426 free_data_ref (newdr);
3427 }
3428
3429 if (!gather && !simd_lane_access)
3430 {
3431 if (dump_enabled_p ())
3432 {
3433 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3434 "not vectorized: data ref analysis "
3435 "failed ");
3436 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3437 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3438 }
3439
3440 if (bb_vinfo)
3441 break;
3442
3443 return false;
3444 }
3445 }
3446
3447 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3448 {
3449 if (dump_enabled_p ())
3450 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3451 "not vectorized: base addr of dr is a "
3452 "constant\n");
3453
3454 if (bb_vinfo)
3455 break;
3456
3457 if (gather || simd_lane_access)
3458 free_data_ref (dr);
3459 return false;
3460 }
3461
3462 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3463 {
3464 if (dump_enabled_p ())
3465 {
3466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3467 "not vectorized: volatile type ");
3468 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3469 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3470 }
3471
3472 if (bb_vinfo)
3473 break;
3474
3475 return false;
3476 }
3477
3478 if (stmt_can_throw_internal (stmt))
3479 {
3480 if (dump_enabled_p ())
3481 {
3482 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3483 "not vectorized: statement can throw an "
3484 "exception ");
3485 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3486 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3487 }
3488
3489 if (bb_vinfo)
3490 break;
3491
3492 if (gather || simd_lane_access)
3493 free_data_ref (dr);
3494 return false;
3495 }
3496
3497 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3498 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3499 {
3500 if (dump_enabled_p ())
3501 {
3502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3503 "not vectorized: statement is bitfield "
3504 "access ");
3505 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3506 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3507 }
3508
3509 if (bb_vinfo)
3510 break;
3511
3512 if (gather || simd_lane_access)
3513 free_data_ref (dr);
3514 return false;
3515 }
3516
3517 base = unshare_expr (DR_BASE_ADDRESS (dr));
3518 offset = unshare_expr (DR_OFFSET (dr));
3519 init = unshare_expr (DR_INIT (dr));
3520
3521 if (is_gimple_call (stmt)
3522 && (!gimple_call_internal_p (stmt)
3523 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3524 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3525 {
3526 if (dump_enabled_p ())
3527 {
3528 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3529 "not vectorized: dr in a call ");
3530 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3531 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3532 }
3533
3534 if (bb_vinfo)
3535 break;
3536
3537 if (gather || simd_lane_access)
3538 free_data_ref (dr);
3539 return false;
3540 }
3541
3542 /* Update DR field in stmt_vec_info struct. */
3543
3544       /* If the dataref is in an inner-loop of the loop that is considered
3545 	 for vectorization, we also want to analyze the access relative to
3546 the outer-loop (DR contains information only relative to the
3547 inner-most enclosing loop). We do that by building a reference to the
3548 first location accessed by the inner-loop, and analyze it relative to
3549 the outer-loop. */
3550 if (loop && nested_in_vect_loop_p (loop, stmt))
3551 {
3552 tree outer_step, outer_base, outer_init;
3553 HOST_WIDE_INT pbitsize, pbitpos;
3554 tree poffset;
3555 machine_mode pmode;
3556 int punsignedp, pvolatilep;
3557 affine_iv base_iv, offset_iv;
3558 tree dinit;
3559
3560 /* Build a reference to the first location accessed by the
3561 inner-loop: *(BASE+INIT). (The first location is actually
3562 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3563 tree inner_base = build_fold_indirect_ref
3564 (fold_build_pointer_plus (base, init));
3565
3566 if (dump_enabled_p ())
3567 {
3568 dump_printf_loc (MSG_NOTE, vect_location,
3569 "analyze in outer-loop: ");
3570 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3571 dump_printf (MSG_NOTE, "\n");
3572 }
3573
3574 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3575 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3576 gcc_assert (outer_base != NULL_TREE);
3577
3578 if (pbitpos % BITS_PER_UNIT != 0)
3579 {
3580 if (dump_enabled_p ())
3581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3582 "failed: bit offset alignment.\n");
3583 return false;
3584 }
3585
3586 outer_base = build_fold_addr_expr (outer_base);
3587 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3588 &base_iv, false))
3589 {
3590 if (dump_enabled_p ())
3591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3592 "failed: evolution of base is not affine.\n");
3593 return false;
3594 }
3595
3596 if (offset)
3597 {
3598 if (poffset)
3599 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3600 poffset);
3601 else
3602 poffset = offset;
3603 }
3604
3605 if (!poffset)
3606 {
3607 offset_iv.base = ssize_int (0);
3608 offset_iv.step = ssize_int (0);
3609 }
3610 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3611 &offset_iv, false))
3612 {
3613 if (dump_enabled_p ())
3614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3615 "evolution of offset is not affine.\n");
3616 return false;
3617 }
3618
3619 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3620 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3621 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3622 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3623 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3624
3625 outer_step = size_binop (PLUS_EXPR,
3626 fold_convert (ssizetype, base_iv.step),
3627 fold_convert (ssizetype, offset_iv.step));
3628
3629 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3630 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3631 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3632 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3633 STMT_VINFO_DR_OFFSET (stmt_info) =
3634 fold_convert (ssizetype, offset_iv.base);
3635 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3636 size_int (highest_pow2_factor (offset_iv.base));
3637
3638 if (dump_enabled_p ())
3639 {
3640 dump_printf_loc (MSG_NOTE, vect_location,
3641 "\touter base_address: ");
3642 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3643 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3644 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3645 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3646 STMT_VINFO_DR_OFFSET (stmt_info));
3647 dump_printf (MSG_NOTE,
3648 "\n\touter constant offset from base address: ");
3649 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3650 STMT_VINFO_DR_INIT (stmt_info));
3651 dump_printf (MSG_NOTE, "\n\touter step: ");
3652 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3653 STMT_VINFO_DR_STEP (stmt_info));
3654 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3655 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3656 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3657 dump_printf (MSG_NOTE, "\n");
3658 }
3659 }
3660
3661 if (STMT_VINFO_DATA_REF (stmt_info))
3662 {
3663 if (dump_enabled_p ())
3664 {
3665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3666 "not vectorized: more than one data ref "
3667 "in stmt: ");
3668 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3669 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3670 }
3671
3672 if (bb_vinfo)
3673 break;
3674
3675 if (gather || simd_lane_access)
3676 free_data_ref (dr);
3677 return false;
3678 }
3679
3680 STMT_VINFO_DATA_REF (stmt_info) = dr;
3681 if (simd_lane_access)
3682 {
3683 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3684 free_data_ref (datarefs[i]);
3685 datarefs[i] = dr;
3686 }
3687
3688 /* Set vectype for STMT. */
3689 scalar_type = TREE_TYPE (DR_REF (dr));
3690 STMT_VINFO_VECTYPE (stmt_info)
3691 = get_vectype_for_scalar_type (scalar_type);
3692 if (!STMT_VINFO_VECTYPE (stmt_info))
3693 {
3694 if (dump_enabled_p ())
3695 {
3696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3697 "not vectorized: no vectype for stmt: ");
3698 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3699 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3700 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3701 scalar_type);
3702 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3703 }
3704
3705 if (bb_vinfo)
3706 break;
3707
3708 if (gather || simd_lane_access)
3709 {
3710 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3711 if (gather)
3712 free_data_ref (dr);
3713 }
3714 return false;
3715 }
3716 else
3717 {
3718 if (dump_enabled_p ())
3719 {
3720 dump_printf_loc (MSG_NOTE, vect_location,
3721 "got vectype for stmt: ");
3722 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3723 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3724 STMT_VINFO_VECTYPE (stmt_info));
3725 dump_printf (MSG_NOTE, "\n");
3726 }
3727 }
3728
3729 /* Adjust the minimal vectorization factor according to the
3730 vector type. */
3731 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3732 if (vf > *min_vf)
3733 *min_vf = vf;
3734
3735 if (gather)
3736 {
3737 tree off;
3738
3739 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3740 if (gather
3741 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3742 gather = false;
3743 if (!gather)
3744 {
3745 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3746 free_data_ref (dr);
3747 if (dump_enabled_p ())
3748 {
3749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3750 "not vectorized: not suitable for gather "
3751 "load ");
3752 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3753 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3754 }
3755 return false;
3756 }
3757
3758 datarefs[i] = dr;
3759 STMT_VINFO_GATHER_P (stmt_info) = true;
3760 }
3761 else if (loop_vinfo
3762 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3763 {
3764 if (nested_in_vect_loop_p (loop, stmt))
3765 {
3766 if (dump_enabled_p ())
3767 {
3768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3769 "not vectorized: not suitable for strided "
3770 "load ");
3771 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3772 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3773 }
3774 return false;
3775 }
3776 STMT_VINFO_STRIDED_P (stmt_info) = true;
3777 }
3778 }
3779
3780 /* If we stopped analysis at the first dataref we could not analyze
3781      when trying to vectorize a basic-block, mark the rest of the datarefs
3782 as not vectorizable and truncate the vector of datarefs. That
3783 avoids spending useless time in analyzing their dependence. */
3784 if (i != datarefs.length ())
3785 {
3786 gcc_assert (bb_vinfo != NULL);
3787 for (unsigned j = i; j < datarefs.length (); ++j)
3788 {
3789 data_reference_p dr = datarefs[j];
3790 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3791 free_data_ref (dr);
3792 }
3793 datarefs.truncate (i);
3794 }
3795
3796 return true;
3797 }
3798
3799
3800 /* Function vect_get_new_vect_var.
3801
3802    Returns a name for a new variable.  The current naming scheme prepends the
3803    prefix "vect_", "stmp_" or "vectp_" (depending on the value of VAR_KIND)
3804    to NAME, if provided; otherwise the bare prefix is used to name the
3805    vectorizer-generated variable.  */
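
/* For example (purely illustrative), a call such as
     vect_get_new_vect_var (ptr_type, vect_pointer_var, "a")
   for a dataref whose base is named "a" creates a temporary whose name
   starts with "vectp_a".  */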
3806
3807 tree
3808 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3809 {
3810 const char *prefix;
3811 tree new_vect_var;
3812
3813 switch (var_kind)
3814 {
3815 case vect_simple_var:
3816 prefix = "vect";
3817 break;
3818 case vect_scalar_var:
3819 prefix = "stmp";
3820 break;
3821 case vect_pointer_var:
3822 prefix = "vectp";
3823 break;
3824 default:
3825 gcc_unreachable ();
3826 }
3827
3828 if (name)
3829 {
3830 char* tmp = concat (prefix, "_", name, NULL);
3831 new_vect_var = create_tmp_reg (type, tmp);
3832 free (tmp);
3833 }
3834 else
3835 new_vect_var = create_tmp_reg (type, prefix);
3836
3837 return new_vect_var;
3838 }
3839
3840 /* Duplicate ptr info and set alignment/misalignment on NAME from DR.  */
3841
3842 static void
3843 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3844 stmt_vec_info stmt_info)
3845 {
3846 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3847 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3848 int misalign = DR_MISALIGNMENT (dr);
3849 if (misalign == -1)
3850 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3851 else
3852 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3853 }
3854
3855 /* Function vect_create_addr_base_for_vector_ref.
3856
3857 Create an expression that computes the address of the first memory location
3858 that will be accessed for a data reference.
3859
3860 Input:
3861 STMT: The statement containing the data reference.
3862 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3863    OFFSET: Optional.  If supplied, it is added to the initial address.
3864    LOOP:    Specify relative to which loop-nest the address should be computed.
3865 For example, when the dataref is in an inner-loop nested in an
3866 outer-loop that is now being vectorized, LOOP can be either the
3867 outer-loop, or the inner-loop. The first memory location accessed
3868 by the following dataref ('in' points to short):
3869
3870 for (i=0; i<N; i++)
3871 for (j=0; j<M; j++)
3872 s += in[i+j]
3873
3874 is as follows:
3875 if LOOP=i_loop: &in (relative to i_loop)
3876 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3877 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3878 	      initial address.  Unlike OFFSET, which is a number of elements to
3879 be added, BYTE_OFFSET is measured in bytes.
3880
3881 Output:
3882 1. Return an SSA_NAME whose value is the address of the memory location of
3883 the first vector of the data reference.
3884 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3885 these statement(s) which define the returned SSA_NAME.
3886
3887 FORNOW: We are only handling array accesses with step 1. */
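
/* Illustrative sketch only (the names and numbers are made up): for a
   dataref A[i] with element type int, DR_INIT 0 and an OFFSET argument of 4,
   the address built below is conceptually

     addr_base = &A + (DR_OFFSET + 0 + 4 * sizeof (int));

   i.e. OFFSET is scaled by the element size, whereas BYTE_OFFSET, if
   supplied, is added unscaled.  */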
3888
3889 tree
3890 vect_create_addr_base_for_vector_ref (gimple stmt,
3891 gimple_seq *new_stmt_list,
3892 tree offset,
3893 struct loop *loop,
3894 tree byte_offset)
3895 {
3896 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3897 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3898 tree data_ref_base;
3899 const char *base_name;
3900 tree addr_base;
3901 tree dest;
3902 gimple_seq seq = NULL;
3903 tree base_offset;
3904 tree init;
3905 tree vect_ptr_type;
3906 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3907 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3908
3909 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3910 {
3911 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3912
3913 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3914
3915 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3916 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3917 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3918 }
3919 else
3920 {
3921 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3922 base_offset = unshare_expr (DR_OFFSET (dr));
3923 init = unshare_expr (DR_INIT (dr));
3924 }
3925
3926 if (loop_vinfo)
3927 base_name = get_name (data_ref_base);
3928 else
3929 {
3930 base_offset = ssize_int (0);
3931 init = ssize_int (0);
3932 base_name = get_name (DR_REF (dr));
3933 }
3934
3935 /* Create base_offset */
3936 base_offset = size_binop (PLUS_EXPR,
3937 fold_convert (sizetype, base_offset),
3938 fold_convert (sizetype, init));
3939
3940 if (offset)
3941 {
3942 offset = fold_build2 (MULT_EXPR, sizetype,
3943 fold_convert (sizetype, offset), step);
3944 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3945 base_offset, offset);
3946 }
3947 if (byte_offset)
3948 {
3949 byte_offset = fold_convert (sizetype, byte_offset);
3950 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3951 base_offset, byte_offset);
3952 }
3953
3954 /* base + base_offset */
3955 if (loop_vinfo)
3956 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3957 else
3958 {
3959 addr_base = build1 (ADDR_EXPR,
3960 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3961 unshare_expr (DR_REF (dr)));
3962 }
3963
3964 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3965 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3966 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
3967 gimple_seq_add_seq (new_stmt_list, seq);
3968
3969 if (DR_PTR_INFO (dr)
3970 && TREE_CODE (addr_base) == SSA_NAME
3971 && !SSA_NAME_PTR_INFO (addr_base))
3972 {
3973 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
3974 if (offset || byte_offset)
3975 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3976 }
3977
3978 if (dump_enabled_p ())
3979 {
3980 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3981 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3982 dump_printf (MSG_NOTE, "\n");
3983 }
3984
3985 return addr_base;
3986 }
3987
3988
3989 /* Function vect_create_data_ref_ptr.
3990
3991 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3992 location accessed in the loop by STMT, along with the def-use update
3993 chain to appropriately advance the pointer through the loop iterations.
3994 Also set aliasing information for the pointer. This pointer is used by
3995 the callers to this function to create a memory reference expression for
3996 vector load/store access.
3997
3998 Input:
3999 1. STMT: a stmt that references memory. Expected to be of the form
4000 GIMPLE_ASSIGN <name, data-ref> or
4001 GIMPLE_ASSIGN <data-ref, name>.
4002 2. AGGR_TYPE: the type of the reference, which should be either a vector
4003 or an array.
4004 3. AT_LOOP: the loop where the vector memref is to be created.
4005 4. OFFSET (optional): an offset to be added to the initial address accessed
4006 by the data-ref in STMT.
4007    5. BSI: location where the new stmts are to be placed if there is no loop.
4008 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
4009 pointing to the initial address.
4010 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4011 to the initial address accessed by the data-ref in STMT. This is
4012 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4013 in bytes.
4014
4015 Output:
4016 1. Declare a new ptr to vector_type, and have it point to the base of the
4017       data reference (the initial address accessed by the data reference).
4018 For example, for vector of type V8HI, the following code is generated:
4019
4020 v8hi *ap;
4021 ap = (v8hi *)initial_address;
4022
4023 if OFFSET is not supplied:
4024 initial_address = &a[init];
4025 if OFFSET is supplied:
4026 initial_address = &a[init + OFFSET];
4027 if BYTE_OFFSET is supplied:
4028 initial_address = &a[init] + BYTE_OFFSET;
4029
4030 Return the initial_address in INITIAL_ADDRESS.
4031
4032 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4033 update the pointer in each iteration of the loop.
4034
4035 Return the increment stmt that updates the pointer in PTR_INCR.
4036
4037 3. Set INV_P to true if the access pattern of the data reference in the
4038 vectorized loop is invariant. Set it to false otherwise.
4039
4040 4. Return the pointer. */
4041
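/* A minimal usage sketch (the caller shape is illustrative only, not a
   fixed contract):

     tree initial_addr;
     gimple ptr_incr;
     bool inv_p;
     tree dataref_ptr
       = vect_create_data_ref_ptr (stmt, vectype, at_loop, NULL_TREE,
                                   &initial_addr, gsi, &ptr_incr,
                                   false, &inv_p, NULL_TREE);

   The caller then builds a MEM_REF through DATAREF_PTR and advances it
   between vector accesses with bump_vector_ptr.  */
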
4042 tree
4043 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
4044 tree offset, tree *initial_address,
4045 gimple_stmt_iterator *gsi, gimple *ptr_incr,
4046 bool only_init, bool *inv_p, tree byte_offset)
4047 {
4048 const char *base_name;
4049 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4050 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4051 struct loop *loop = NULL;
4052 bool nested_in_vect_loop = false;
4053 struct loop *containing_loop = NULL;
4054 tree aggr_ptr_type;
4055 tree aggr_ptr;
4056 tree new_temp;
4057 gimple_seq new_stmt_list = NULL;
4058 edge pe = NULL;
4059 basic_block new_bb;
4060 tree aggr_ptr_init;
4061 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4062 tree aptr;
4063 gimple_stmt_iterator incr_gsi;
4064 bool insert_after;
4065 tree indx_before_incr, indx_after_incr;
4066 gimple incr;
4067 tree step;
4068 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4069
4070 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4071 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4072
4073 if (loop_vinfo)
4074 {
4075 loop = LOOP_VINFO_LOOP (loop_vinfo);
4076 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4077 containing_loop = (gimple_bb (stmt))->loop_father;
4078 pe = loop_preheader_edge (loop);
4079 }
4080 else
4081 {
4082 gcc_assert (bb_vinfo);
4083 only_init = true;
4084 *ptr_incr = NULL;
4085 }
4086
4087 /* Check the step (evolution) of the load in LOOP, and record
4088 whether it's invariant. */
4089 if (nested_in_vect_loop)
4090 step = STMT_VINFO_DR_STEP (stmt_info);
4091 else
4092 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4093
4094 if (integer_zerop (step))
4095 *inv_p = true;
4096 else
4097 *inv_p = false;
4098
4099 /* Create an expression for the first address accessed by this load
4100 in LOOP. */
4101 base_name = get_name (DR_BASE_ADDRESS (dr));
4102
4103 if (dump_enabled_p ())
4104 {
4105 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4106 dump_printf_loc (MSG_NOTE, vect_location,
4107 "create %s-pointer variable to type: ",
4108 get_tree_code_name (TREE_CODE (aggr_type)));
4109 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4110 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4111 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4112 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4113 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4114 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4115 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4116 else
4117 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4118 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4119 dump_printf (MSG_NOTE, "\n");
4120 }
4121
4122 /* (1) Create the new aggregate-pointer variable.
4123 Vector and array types inherit the alias set of their component
4124 type by default so we need to use a ref-all pointer if the data
4125 reference does not conflict with the created aggregated data
4126 reference because it is not addressable. */
4127 bool need_ref_all = false;
4128 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4129 get_alias_set (DR_REF (dr))))
4130 need_ref_all = true;
4131 /* Likewise for any of the data references in the stmt group. */
4132 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4133 {
4134 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4135 do
4136 {
4137 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4138 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4139 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4140 get_alias_set (DR_REF (sdr))))
4141 {
4142 need_ref_all = true;
4143 break;
4144 }
4145 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4146 }
4147 while (orig_stmt);
4148 }
4149 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4150 need_ref_all);
4151 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4152
4153
4154 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4155 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4156 def-use update cycles for the pointer: one relative to the outer-loop
4157      (LOOP), which is what steps (2) and (3) below do.  The other is relative
4158      to the inner-loop (which is the inner-most loop containing the dataref),
4159      and this is done by step (4) below.
4160
4161 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4162      inner-most loop, and so steps (2),(3) work the same, and step (4) is
4163      redundant.  Steps (2),(3) create the following:
4164
4165 vp0 = &base_addr;
4166 LOOP: vp1 = phi(vp0,vp2)
4167 ...
4168 ...
4169 vp2 = vp1 + step
4170 goto LOOP
4171
4172      If there is an inner-loop nested in loop, then step (4) will also be
4173 applied, and an additional update in the inner-loop will be created:
4174
4175 vp0 = &base_addr;
4176 LOOP: vp1 = phi(vp0,vp2)
4177 ...
4178 inner: vp3 = phi(vp1,vp4)
4179 vp4 = vp3 + inner_step
4180 if () goto inner
4181 ...
4182 vp2 = vp1 + step
4183 if () goto LOOP */
4184
4185 /* (2) Calculate the initial address of the aggregate-pointer, and set
4186 the aggregate-pointer to point to it before the loop. */
4187
4188   /* Create: &(base[init_val + offset]) + byte_offset in the loop preheader.  */
4189
4190 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4191 offset, loop, byte_offset);
4192 if (new_stmt_list)
4193 {
4194 if (pe)
4195 {
4196 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4197 gcc_assert (!new_bb);
4198 }
4199 else
4200 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4201 }
4202
4203 *initial_address = new_temp;
4204 aggr_ptr_init = new_temp;
4205
4206 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4207 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4208 inner-loop nested in LOOP (during outer-loop vectorization). */
4209
4210 /* No update in loop is required. */
4211 if (only_init && (!loop_vinfo || at_loop == loop))
4212 aptr = aggr_ptr_init;
4213 else
4214 {
4215 /* The step of the aggregate pointer is the type size. */
4216 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4217 /* One exception to the above is when the scalar step of the load in
4218 LOOP is zero. In this case the step here is also zero. */
4219 if (*inv_p)
4220 iv_step = size_zero_node;
4221 else if (tree_int_cst_sgn (step) == -1)
4222 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4223
4224 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4225
4226 create_iv (aggr_ptr_init,
4227 fold_convert (aggr_ptr_type, iv_step),
4228 aggr_ptr, loop, &incr_gsi, insert_after,
4229 &indx_before_incr, &indx_after_incr);
4230 incr = gsi_stmt (incr_gsi);
4231 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4232
4233 /* Copy the points-to information if it exists. */
4234 if (DR_PTR_INFO (dr))
4235 {
4236 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4237 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4238 }
4239 if (ptr_incr)
4240 *ptr_incr = incr;
4241
4242 aptr = indx_before_incr;
4243 }
4244
4245 if (!nested_in_vect_loop || only_init)
4246 return aptr;
4247
4248
4249 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4250      nested in LOOP, if one exists.  */
4251
4252 gcc_assert (nested_in_vect_loop);
4253 if (!only_init)
4254 {
4255 standard_iv_increment_position (containing_loop, &incr_gsi,
4256 &insert_after);
4257 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4258 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4259 &indx_after_incr);
4260 incr = gsi_stmt (incr_gsi);
4261 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
4262
4263 /* Copy the points-to information if it exists. */
4264 if (DR_PTR_INFO (dr))
4265 {
4266 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4267 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4268 }
4269 if (ptr_incr)
4270 *ptr_incr = incr;
4271
4272 return indx_before_incr;
4273 }
4274 else
4275 gcc_unreachable ();
4276 }
4277
4278
4279 /* Function bump_vector_ptr
4280
4281 Increment a pointer (to a vector type) by vector-size. If requested,
4282 i.e. if PTR-INCR is given, then also connect the new increment stmt
4283 to the existing def-use update-chain of the pointer, by modifying
4284 the PTR_INCR as illustrated below:
4285
4286 The pointer def-use update-chain before this function:
4287 DATAREF_PTR = phi (p_0, p_2)
4288 ....
4289 PTR_INCR: p_2 = DATAREF_PTR + step
4290
4291 The pointer def-use update-chain after this function:
4292 DATAREF_PTR = phi (p_0, p_2)
4293 ....
4294 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4295 ....
4296 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4297
4298 Input:
4299 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4300 in the loop.
4301 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4302 the loop. The increment amount across iterations is expected
4303 to be vector_size.
4304 BSI - location where the new update stmt is to be placed.
4305 STMT - the original scalar memory-access stmt that is being vectorized.
4306 BUMP - optional. The offset by which to bump the pointer. If not given,
4307 the offset is assumed to be vector_size.
4308
4309 Output: Return NEW_DATAREF_PTR as illustrated above.
4310
4311 */
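
/* For example (illustrative): for a V4SI vector type and no explicit BUMP,
   TYPE_SIZE_UNIT of the vector type is 16, so the stmt generated below is
   NEW_DATAREF_PTR = DATAREF_PTR + 16.  */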
4312
4313 tree
4314 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4315 gimple stmt, tree bump)
4316 {
4317 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4318 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4319 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4320 tree update = TYPE_SIZE_UNIT (vectype);
4321 gassign *incr_stmt;
4322 ssa_op_iter iter;
4323 use_operand_p use_p;
4324 tree new_dataref_ptr;
4325
4326 if (bump)
4327 update = bump;
4328
4329 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4330 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4331 else
4332 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4333 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4334 dataref_ptr, update);
4335 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4336
4337 /* Copy the points-to information if it exists. */
4338 if (DR_PTR_INFO (dr))
4339 {
4340 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4341 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4342 }
4343
4344 if (!ptr_incr)
4345 return new_dataref_ptr;
4346
4347 /* Update the vector-pointer's cross-iteration increment. */
4348 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4349 {
4350 tree use = USE_FROM_PTR (use_p);
4351
4352 if (use == dataref_ptr)
4353 SET_USE (use_p, new_dataref_ptr);
4354 else
4355 gcc_assert (tree_int_cst_compare (use, update) == 0);
4356 }
4357
4358 return new_dataref_ptr;
4359 }
4360
4361
4362 /* Function vect_create_destination_var.
4363
4364 Create a new temporary of type VECTYPE. */
4365
4366 tree
4367 vect_create_destination_var (tree scalar_dest, tree vectype)
4368 {
4369 tree vec_dest;
4370 const char *name;
4371 char *new_name;
4372 tree type;
4373 enum vect_var_kind kind;
4374
4375 kind = vectype ? vect_simple_var : vect_scalar_var;
4376 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4377
4378 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4379
4380 name = get_name (scalar_dest);
4381 if (name)
4382 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4383 else
4384 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4385 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4386 free (new_name);
4387
4388 return vec_dest;
4389 }
4390
4391 /* Function vect_grouped_store_supported.
4392
4393 Returns TRUE if interleave high and interleave low permutations
4394 are supported, and FALSE otherwise. */
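
/* For instance, for V4SI (nelt == 4) and a power-of-two group size, the two
   selectors checked below are {0, 4, 1, 5} (interleave high) and
   {2, 6, 3, 7} (interleave low).  */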
4395
4396 bool
4397 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4398 {
4399 machine_mode mode = TYPE_MODE (vectype);
4400
4401 /* vect_permute_store_chain requires the group size to be equal to 3 or
4402 be a power of two. */
4403 if (count != 3 && exact_log2 (count) == -1)
4404 {
4405 if (dump_enabled_p ())
4406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4407 "the size of the group of accesses"
4408 			 " is not a power of 2 or not equal to 3\n");
4409 return false;
4410 }
4411
4412 /* Check that the permutation is supported. */
4413 if (VECTOR_MODE_P (mode))
4414 {
4415 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4416 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4417
4418 if (count == 3)
4419 {
4420 unsigned int j0 = 0, j1 = 0, j2 = 0;
4421 unsigned int i, j;
4422
4423 for (j = 0; j < 3; j++)
4424 {
4425 int nelt0 = ((3 - j) * nelt) % 3;
4426 int nelt1 = ((3 - j) * nelt + 1) % 3;
4427 int nelt2 = ((3 - j) * nelt + 2) % 3;
4428 for (i = 0; i < nelt; i++)
4429 {
4430 if (3 * i + nelt0 < nelt)
4431 sel[3 * i + nelt0] = j0++;
4432 if (3 * i + nelt1 < nelt)
4433 sel[3 * i + nelt1] = nelt + j1++;
4434 if (3 * i + nelt2 < nelt)
4435 sel[3 * i + nelt2] = 0;
4436 }
4437 if (!can_vec_perm_p (mode, false, sel))
4438 {
4439 if (dump_enabled_p ())
4440 dump_printf (MSG_MISSED_OPTIMIZATION,
4441 			     "permutation op not supported by target.\n");
4442 return false;
4443 }
4444
4445 for (i = 0; i < nelt; i++)
4446 {
4447 if (3 * i + nelt0 < nelt)
4448 sel[3 * i + nelt0] = 3 * i + nelt0;
4449 if (3 * i + nelt1 < nelt)
4450 sel[3 * i + nelt1] = 3 * i + nelt1;
4451 if (3 * i + nelt2 < nelt)
4452 sel[3 * i + nelt2] = nelt + j2++;
4453 }
4454 if (!can_vec_perm_p (mode, false, sel))
4455 {
4456 if (dump_enabled_p ())
4457 dump_printf (MSG_MISSED_OPTIMIZATION,
4458 			     "permutation op not supported by target.\n");
4459 return false;
4460 }
4461 }
4462 return true;
4463 }
4464 else
4465 {
4466 /* If length is not equal to 3 then only power of 2 is supported. */
4467 gcc_assert (exact_log2 (count) != -1);
4468
4469 for (i = 0; i < nelt / 2; i++)
4470 {
4471 sel[i * 2] = i;
4472 sel[i * 2 + 1] = i + nelt;
4473 }
4474 if (can_vec_perm_p (mode, false, sel))
4475 {
4476 for (i = 0; i < nelt; i++)
4477 sel[i] += nelt / 2;
4478 if (can_vec_perm_p (mode, false, sel))
4479 return true;
4480 }
4481 }
4482 }
4483
4484 if (dump_enabled_p ())
4485 dump_printf (MSG_MISSED_OPTIMIZATION,
4486 		 "permutation op not supported by target.\n");
4487 return false;
4488 }
4489
4490
4491 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4492 type VECTYPE. */
4493
4494 bool
4495 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4496 {
4497 return vect_lanes_optab_supported_p ("vec_store_lanes",
4498 vec_store_lanes_optab,
4499 vectype, count);
4500 }
4501
4502
4503 /* Function vect_permute_store_chain.
4504
4505 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4506 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4507 the data correctly for the stores. Return the final references for stores
4508 in RESULT_CHAIN.
4509
4510 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4511 The input is 4 vectors each containing 8 elements. We assign a number to
4512 each element, the input sequence is:
4513
4514 1st vec: 0 1 2 3 4 5 6 7
4515 2nd vec: 8 9 10 11 12 13 14 15
4516 3rd vec: 16 17 18 19 20 21 22 23
4517 4th vec: 24 25 26 27 28 29 30 31
4518
4519 The output sequence should be:
4520
4521 1st vec: 0 8 16 24 1 9 17 25
4522 2nd vec: 2 10 18 26 3 11 19 27
4523    3rd vec:  4 12 20 28  5 13 21 29
4524 4th vec: 6 14 22 30 7 15 23 31
4525
4526 i.e., we interleave the contents of the four vectors in their order.
4527
4528 We use interleave_high/low instructions to create such output. The input of
4529 each interleave_high/low operation is two vectors:
4530 1st vec 2nd vec
4531 0 1 2 3 4 5 6 7
4532 the even elements of the result vector are obtained left-to-right from the
4533 high/low elements of the first vector. The odd elements of the result are
4534 obtained left-to-right from the high/low elements of the second vector.
4535 The output of interleave_high will be: 0 4 1 5
4536 and of interleave_low: 2 6 3 7
4537
4538
4539 The permutation is done in log LENGTH stages. In each stage interleave_high
4540 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4541 where the first argument is taken from the first half of DR_CHAIN and the
4542    second argument from its second half.
4543 In our example,
4544
4545 I1: interleave_high (1st vec, 3rd vec)
4546 I2: interleave_low (1st vec, 3rd vec)
4547 I3: interleave_high (2nd vec, 4th vec)
4548 I4: interleave_low (2nd vec, 4th vec)
4549
4550 The output for the first stage is:
4551
4552 I1: 0 16 1 17 2 18 3 19
4553 I2: 4 20 5 21 6 22 7 23
4554 I3: 8 24 9 25 10 26 11 27
4555 I4: 12 28 13 29 14 30 15 31
4556
4557 The output of the second stage, i.e. the final result is:
4558
4559 I1: 0 8 16 24 1 9 17 25
4560 I2: 2 10 18 26 3 11 19 27
4561    I3:  4 12 20 28  5 13 21 29
4562 I4: 6 14 22 30 7 15 23 31. */
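
/* For the power-of-two case with nelt == 8, the two masks computed below are

     perm_mask_high = {0, 8, 1, 9, 2, 10, 3, 11}
     perm_mask_low  = {4, 12, 5, 13, 6, 14, 7, 15}

   and stage 1 applies them to (1st vec, 3rd vec) and (2nd vec, 4th vec),
   which yields exactly I1..I4 above.  */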
4563
4564 void
4565 vect_permute_store_chain (vec<tree> dr_chain,
4566 unsigned int length,
4567 gimple stmt,
4568 gimple_stmt_iterator *gsi,
4569 vec<tree> *result_chain)
4570 {
4571 tree vect1, vect2, high, low;
4572 gimple perm_stmt;
4573 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4574 tree perm_mask_low, perm_mask_high;
4575 tree data_ref;
4576 tree perm3_mask_low, perm3_mask_high;
4577 unsigned int i, n, log_length = exact_log2 (length);
4578 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4579 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4580
4581 result_chain->quick_grow (length);
4582 memcpy (result_chain->address (), dr_chain.address (),
4583 length * sizeof (tree));
4584
4585 if (length == 3)
4586 {
4587 unsigned int j0 = 0, j1 = 0, j2 = 0;
4588
4589 for (j = 0; j < 3; j++)
4590 {
4591 int nelt0 = ((3 - j) * nelt) % 3;
4592 int nelt1 = ((3 - j) * nelt + 1) % 3;
4593 int nelt2 = ((3 - j) * nelt + 2) % 3;
4594
4595 for (i = 0; i < nelt; i++)
4596 {
4597 if (3 * i + nelt0 < nelt)
4598 sel[3 * i + nelt0] = j0++;
4599 if (3 * i + nelt1 < nelt)
4600 sel[3 * i + nelt1] = nelt + j1++;
4601 if (3 * i + nelt2 < nelt)
4602 sel[3 * i + nelt2] = 0;
4603 }
4604 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4605
4606 for (i = 0; i < nelt; i++)
4607 {
4608 if (3 * i + nelt0 < nelt)
4609 sel[3 * i + nelt0] = 3 * i + nelt0;
4610 if (3 * i + nelt1 < nelt)
4611 sel[3 * i + nelt1] = 3 * i + nelt1;
4612 if (3 * i + nelt2 < nelt)
4613 sel[3 * i + nelt2] = nelt + j2++;
4614 }
4615 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4616
4617 vect1 = dr_chain[0];
4618 vect2 = dr_chain[1];
4619
4620 /* Create interleaving stmt:
4621 low = VEC_PERM_EXPR <vect1, vect2,
4622 {j, nelt, *, j + 1, nelt + j + 1, *,
4623 j + 2, nelt + j + 2, *, ...}> */
4624 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4625 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4626 vect2, perm3_mask_low);
4627 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4628
4629 vect1 = data_ref;
4630 vect2 = dr_chain[2];
4631 /* Create interleaving stmt:
4632 low = VEC_PERM_EXPR <vect1, vect2,
4633 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4634 6, 7, nelt + j + 2, ...}> */
4635 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4636 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4637 vect2, perm3_mask_high);
4638 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4639 (*result_chain)[j] = data_ref;
4640 }
4641 }
4642 else
4643 {
4644 /* If length is not equal to 3 then only power of 2 is supported. */
4645 gcc_assert (exact_log2 (length) != -1);
4646
4647 for (i = 0, n = nelt / 2; i < n; i++)
4648 {
4649 sel[i * 2] = i;
4650 sel[i * 2 + 1] = i + nelt;
4651 }
4652 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4653
4654 for (i = 0; i < nelt; i++)
4655 sel[i] += nelt / 2;
4656 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4657
4658 for (i = 0, n = log_length; i < n; i++)
4659 {
4660 for (j = 0; j < length/2; j++)
4661 {
4662 vect1 = dr_chain[j];
4663 vect2 = dr_chain[j+length/2];
4664
4665 /* Create interleaving stmt:
4666 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4667 ...}> */
4668 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4669 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4670 vect2, perm_mask_high);
4671 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4672 (*result_chain)[2*j] = high;
4673
4674 /* Create interleaving stmt:
4675 low = VEC_PERM_EXPR <vect1, vect2,
4676 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4677 ...}> */
4678 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4679 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4680 vect2, perm_mask_low);
4681 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4682 (*result_chain)[2*j+1] = low;
4683 }
4684 memcpy (dr_chain.address (), result_chain->address (),
4685 length * sizeof (tree));
4686 }
4687 }
4688 }
4689
4690 /* Function vect_setup_realignment
4691
4692 This function is called when vectorizing an unaligned load using
4693 the dr_explicit_realign[_optimized] scheme.
4694 This function generates the following code at the loop prolog:
4695
4696 p = initial_addr;
4697 x msq_init = *(floor(p)); # prolog load
4698 realignment_token = call target_builtin;
4699 loop:
4700 x msq = phi (msq_init, ---)
4701
4702 The stmts marked with x are generated only for the case of
4703 dr_explicit_realign_optimized.
4704
4705 The code above sets up a new (vector) pointer, pointing to the first
4706 location accessed by STMT, and a "floor-aligned" load using that pointer.
4707 It also generates code to compute the "realignment-token" (if the relevant
4708 target hook was defined), and creates a phi-node at the loop-header bb
4709 whose arguments are the result of the prolog-load (created by this
4710 function) and the result of a load that takes place in the loop (to be
4711 created by the caller to this function).
4712
4713 For the case of dr_explicit_realign_optimized:
4714 The caller to this function uses the phi-result (msq) to create the
4715 realignment code inside the loop, and sets up the missing phi argument,
4716 as follows:
4717 loop:
4718 msq = phi (msq_init, lsq)
4719 lsq = *(floor(p')); # load in loop
4720 result = realign_load (msq, lsq, realignment_token);
4721
4722 For the case of dr_explicit_realign:
4723 loop:
4724 msq = *(floor(p)); # load in loop
4725 p' = p + (VS-1);
4726 lsq = *(floor(p')); # load in loop
4727 result = realign_load (msq, lsq, realignment_token);
4728
4729 Input:
4730 STMT - (scalar) load stmt to be vectorized. This load accesses
4731 a memory location that may be unaligned.
4732 BSI - place where new code is to be inserted.
4733 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4734 is used.
4735
4736 Output:
4737 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4738 target hook, if defined.
4739 Return value - the result of the loop-header phi node. */
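
/* As a concrete illustration (assuming 16-byte V4SI vectors and an access
   misaligned by 8 bytes): msq and lsq are the two aligned loads from
   floor(p) and floor(p + 15), and realign_load, guided by the realignment
   token, forms the desired vector from the last two elements of msq
   followed by the first two elements of lsq.  */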
4740
4741 tree
4742 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4743 tree *realignment_token,
4744 enum dr_alignment_support alignment_support_scheme,
4745 tree init_addr,
4746 struct loop **at_loop)
4747 {
4748 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4749 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4750 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4751 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4752 struct loop *loop = NULL;
4753 edge pe = NULL;
4754 tree scalar_dest = gimple_assign_lhs (stmt);
4755 tree vec_dest;
4756 gimple inc;
4757 tree ptr;
4758 tree data_ref;
4759 basic_block new_bb;
4760 tree msq_init = NULL_TREE;
4761 tree new_temp;
4762 gphi *phi_stmt;
4763 tree msq = NULL_TREE;
4764 gimple_seq stmts = NULL;
4765 bool inv_p;
4766 bool compute_in_loop = false;
4767 bool nested_in_vect_loop = false;
4768 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4769 struct loop *loop_for_initial_load = NULL;
4770
4771 if (loop_vinfo)
4772 {
4773 loop = LOOP_VINFO_LOOP (loop_vinfo);
4774 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4775 }
4776
4777 gcc_assert (alignment_support_scheme == dr_explicit_realign
4778 || alignment_support_scheme == dr_explicit_realign_optimized);
4779
4780 /* We need to generate three things:
4781 1. the misalignment computation
4782 2. the extra vector load (for the optimized realignment scheme).
4783 3. the phi node for the two vectors from which the realignment is
4784 done (for the optimized realignment scheme). */
4785
4786 /* 1. Determine where to generate the misalignment computation.
4787
4788 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4789 calculation will be generated by this function, outside the loop (in the
4790 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4791 caller, inside the loop.
4792
4793 Background: If the misalignment remains fixed throughout the iterations of
4794 the loop, then both realignment schemes are applicable, and also the
4795 misalignment computation can be done outside LOOP. This is because we are
4796 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4797 are a multiple of VS (the Vector Size), and therefore the misalignment in
4798 different vectorized LOOP iterations is always the same.
4799 The problem arises only if the memory access is in an inner-loop nested
4800 inside LOOP, which is now being vectorized using outer-loop vectorization.
4801 This is the only case when the misalignment of the memory access may not
4802 remain fixed throughout the iterations of the inner-loop (as explained in
4803 detail in vect_supportable_dr_alignment). In this case, not only is the
4804 optimized realignment scheme not applicable, but also the misalignment
4805 computation (and generation of the realignment token that is passed to
4806 REALIGN_LOAD) have to be done inside the loop.
4807
4808 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4809 or not, which in turn determines if the misalignment is computed inside
4810 the inner-loop, or outside LOOP. */
4811
4812 if (init_addr != NULL_TREE || !loop_vinfo)
4813 {
4814 compute_in_loop = true;
4815 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4816 }
4817
4818
4819 /* 2. Determine where to generate the extra vector load.
4820
4821 For the optimized realignment scheme, instead of generating two vector
4822 loads in each iteration, we generate a single extra vector load in the
4823 preheader of the loop, and in each iteration reuse the result of the
4824 vector load from the previous iteration. In case the memory access is in
4825 an inner-loop nested inside LOOP, which is now being vectorized using
4826 outer-loop vectorization, we need to determine whether this initial vector
4827 load should be generated at the preheader of the inner-loop, or can be
4828 generated at the preheader of LOOP. If the memory access has no evolution
4829 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4830 to be generated inside LOOP (in the preheader of the inner-loop). */
4831
4832 if (nested_in_vect_loop)
4833 {
4834 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4835 bool invariant_in_outerloop =
4836 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4837 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4838 }
4839 else
4840 loop_for_initial_load = loop;
4841 if (at_loop)
4842 *at_loop = loop_for_initial_load;
4843
4844 if (loop_for_initial_load)
4845 pe = loop_preheader_edge (loop_for_initial_load);
4846
4847 /* 3. For the case of the optimized realignment, create the first vector
4848 load at the loop preheader. */
4849
4850 if (alignment_support_scheme == dr_explicit_realign_optimized)
4851 {
4852 /* Create msq_init = *(floor(p1)) in the loop preheader */
4853 gassign *new_stmt;
4854
4855 gcc_assert (!compute_in_loop);
4856 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4857 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4858 NULL_TREE, &init_addr, NULL, &inc,
4859 true, &inv_p);
4860 new_temp = copy_ssa_name (ptr);
4861 new_stmt = gimple_build_assign
4862 (new_temp, BIT_AND_EXPR, ptr,
4863 build_int_cst (TREE_TYPE (ptr),
4864 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4865 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4866 gcc_assert (!new_bb);
4867 data_ref
4868 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4869 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4870 new_stmt = gimple_build_assign (vec_dest, data_ref);
4871 new_temp = make_ssa_name (vec_dest, new_stmt);
4872 gimple_assign_set_lhs (new_stmt, new_temp);
4873 if (pe)
4874 {
4875 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4876 gcc_assert (!new_bb);
4877 }
4878 else
4879 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4880
4881 msq_init = gimple_assign_lhs (new_stmt);
4882 }
4883
4884 /* 4. Create realignment token using a target builtin, if available.
4885 It is done either inside the containing loop, or before LOOP (as
4886 determined above). */
4887
4888 if (targetm.vectorize.builtin_mask_for_load)
4889 {
4890 gcall *new_stmt;
4891 tree builtin_decl;
4892
4893       /* Compute INIT_ADDR - the initial address accessed by this memref.  */
4894 if (!init_addr)
4895 {
4896 /* Generate the INIT_ADDR computation outside LOOP. */
4897 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4898 NULL_TREE, loop);
4899 if (loop)
4900 {
4901 pe = loop_preheader_edge (loop);
4902 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4903 gcc_assert (!new_bb);
4904 }
4905 else
4906 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4907 }
4908
4909 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4910 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4911 vec_dest =
4912 vect_create_destination_var (scalar_dest,
4913 gimple_call_return_type (new_stmt));
4914 new_temp = make_ssa_name (vec_dest, new_stmt);
4915 gimple_call_set_lhs (new_stmt, new_temp);
4916
4917 if (compute_in_loop)
4918 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4919 else
4920 {
4921 /* Generate the misalignment computation outside LOOP. */
4922 pe = loop_preheader_edge (loop);
4923 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4924 gcc_assert (!new_bb);
4925 }
4926
4927 *realignment_token = gimple_call_lhs (new_stmt);
4928
4929 /* The result of the CALL_EXPR to this builtin is determined from
4930 the value of the parameter and no global variables are touched
4931 which makes the builtin a "const" function. Requiring the
4932 builtin to have the "const" attribute makes it unnecessary
4933 to call mark_call_clobbered. */
4934 gcc_assert (TREE_READONLY (builtin_decl));
4935 }
4936
4937 if (alignment_support_scheme == dr_explicit_realign)
4938 return msq;
4939
4940 gcc_assert (!compute_in_loop);
4941 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4942
4943
4944 /* 5. Create msq = phi <msq_init, lsq> in loop */
4945
4946 pe = loop_preheader_edge (containing_loop);
4947 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4948 msq = make_ssa_name (vec_dest);
4949 phi_stmt = create_phi_node (msq, containing_loop->header);
4950 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4951
4952 return msq;
4953 }
4954
4955
4956 /* Function vect_grouped_load_supported.
4957
4958 Returns TRUE if even and odd permutations are supported,
4959 and FALSE otherwise. */
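
/* For instance, for V4SI (nelt == 4) and a power-of-two group size, the two
   selectors checked below are {0, 2, 4, 6} (extract even) and
   {1, 3, 5, 7} (extract odd).  */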
4960
4961 bool
4962 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4963 {
4964 machine_mode mode = TYPE_MODE (vectype);
4965
4966 /* vect_permute_load_chain requires the group size to be equal to 3 or
4967 be a power of two. */
4968 if (count != 3 && exact_log2 (count) == -1)
4969 {
4970 if (dump_enabled_p ())
4971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4972 "the size of the group of accesses"
4973 " is not a power of 2 or not equal to 3\n");
4974 return false;
4975 }
4976
4977 /* Check that the permutation is supported. */
4978 if (VECTOR_MODE_P (mode))
4979 {
4980 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
4981 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4982
4983 if (count == 3)
4984 {
4985 unsigned int k;
4986 for (k = 0; k < 3; k++)
4987 {
4988 for (i = 0; i < nelt; i++)
4989 if (3 * i + k < 2 * nelt)
4990 sel[i] = 3 * i + k;
4991 else
4992 sel[i] = 0;
4993 if (!can_vec_perm_p (mode, false, sel))
4994 {
4995 if (dump_enabled_p ())
4996 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4997 "shuffle of 3 loads is not supported by"
4998 " target\n");
4999 return false;
5000 }
5001 for (i = 0, j = 0; i < nelt; i++)
5002 if (3 * i + k < 2 * nelt)
5003 sel[i] = i;
5004 else
5005 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5006 if (!can_vec_perm_p (mode, false, sel))
5007 {
5008 if (dump_enabled_p ())
5009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5010 "shuffle of 3 loads is not supported by"
5011 " target\n");
5012 return false;
5013 }
5014 }
5015 return true;
5016 }
5017 else
5018 {
5019 /* If length is not equal to 3 then only power of 2 is supported. */
5020 gcc_assert (exact_log2 (count) != -1);
5021 for (i = 0; i < nelt; i++)
5022 sel[i] = i * 2;
5023 if (can_vec_perm_p (mode, false, sel))
5024 {
5025 for (i = 0; i < nelt; i++)
5026 sel[i] = i * 2 + 1;
5027 if (can_vec_perm_p (mode, false, sel))
5028 return true;
5029 }
5030 }
5031 }
5032
5033 if (dump_enabled_p ())
5034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5035 "extract even/odd not supported by target\n");
5036 return false;
5037 }
5038
5039 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5040 type VECTYPE. */
5041
5042 bool
5043 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5044 {
5045 return vect_lanes_optab_supported_p ("vec_load_lanes",
5046 vec_load_lanes_optab,
5047 vectype, count);
5048 }
5049
5050 /* Function vect_permute_load_chain.
5051
5052 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5053 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5054 the input data correctly. Return the final references for loads in
5055 RESULT_CHAIN.
5056
5057 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5058 The input is 4 vectors each containing 8 elements. We assign a number to each
5059 element, the input sequence is:
5060
5061 1st vec: 0 1 2 3 4 5 6 7
5062 2nd vec: 8 9 10 11 12 13 14 15
5063 3rd vec: 16 17 18 19 20 21 22 23
5064 4th vec: 24 25 26 27 28 29 30 31
5065
5066 The output sequence should be:
5067
5068 1st vec: 0 4 8 12 16 20 24 28
5069 2nd vec: 1 5 9 13 17 21 25 29
5070 3rd vec: 2 6 10 14 18 22 26 30
5071 4th vec: 3 7 11 15 19 23 27 31
5072
5073 i.e., the first output vector should contain the first elements of each
5074 interleaving group, etc.
5075
5076 We use extract_even/odd instructions to create such output. The input of
5077 each extract_even/odd operation is two vectors
5078 1st vec 2nd vec
5079 0 1 2 3 4 5 6 7
5080
5081 and the output is the vector of extracted even/odd elements. The output of
5082 extract_even will be: 0 2 4 6
5083 and of extract_odd: 1 3 5 7
5084
5085
5086 The permutation is done in log LENGTH stages. In each stage extract_even
5087 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5088 their order. In our example,
5089
5090 E1: extract_even (1st vec, 2nd vec)
5091 E2: extract_odd (1st vec, 2nd vec)
5092 E3: extract_even (3rd vec, 4th vec)
5093 E4: extract_odd (3rd vec, 4th vec)
5094
5095 The output for the first stage will be:
5096
5097 E1: 0 2 4 6 8 10 12 14
5098 E2: 1 3 5 7 9 11 13 15
5099 E3: 16 18 20 22 24 26 28 30
5100 E4: 17 19 21 23 25 27 29 31
5101
5102 In order to proceed and create the correct sequence for the next stage (or
5103 for the correct output, if the second stage is the last one, as in our
5104 example), we first put the output of extract_even operation and then the
5105 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5106 The input for the second stage is:
5107
5108 1st vec (E1): 0 2 4 6 8 10 12 14
5109 2nd vec (E3): 16 18 20 22 24 26 28 30
5110 3rd vec (E2): 1 3 5 7 9 11 13 15
5111 4th vec (E4): 17 19 21 23 25 27 29 31
5112
5113 The output of the second stage:
5114
5115 E1: 0 4 8 12 16 20 24 28
5116 E2: 2 6 10 14 18 22 26 30
5117 E3: 1 5 9 13 17 21 25 29
5118 E4: 3 7 11 15 19 23 27 31
5119
5120 And RESULT_CHAIN after reordering:
5121
5122 1st vec (E1): 0 4 8 12 16 20 24 28
5123 2nd vec (E3): 1 5 9 13 17 21 25 29
5124 3rd vec (E2): 2 6 10 14 18 22 26 30
5125 4th vec (E4): 3 7 11 15 19 23 27 31. */
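
/* A minimal standalone sketch (for illustration only; it is not built as
   part of GCC) that simulates the power-of-two case above for LENGTH == 4
   and nelt == 8 and reproduces the documented RESULT_CHAIN:

     #include <stdio.h>
     #include <string.h>
     #define NELT 8
     #define LEN 4

     int
     main (void)
     {
       int chain[LEN][NELT], res[LEN][NELT];
       // Number the elements 0..31 exactly as in the example above.
       for (int v = 0; v < LEN; v++)
         for (int i = 0; i < NELT; i++)
           chain[v][i] = v * NELT + i;
       for (int stage = 0; stage < 2; stage++)      // log2 (LEN) stages
         {
           for (int j = 0; j < LEN; j += 2)
             for (int i = 0; i < NELT; i++)
               {
                 // Even/odd extraction from the concatenation of the pair,
                 // mirroring perm_mask_even and perm_mask_odd.
                 int e = 2 * i, o = 2 * i + 1;
                 res[j / 2][i]
                   = e < NELT ? chain[j][e] : chain[j + 1][e - NELT];
                 res[j / 2 + LEN / 2][i]
                   = o < NELT ? chain[j][o] : chain[j + 1][o - NELT];
               }
           memcpy (chain, res, sizeof (chain));
         }
       // Prints 0 4 ... 28, then 1 5 ... 29, then 2 6 ... 30, then 3 7 ... 31.
       for (int v = 0; v < LEN; v++)
         {
           for (int i = 0; i < NELT; i++)
             printf ("%d ", chain[v][i]);
           printf ("\n");
         }
       return 0;
     }
*/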
5126
5127 static void
5128 vect_permute_load_chain (vec<tree> dr_chain,
5129 unsigned int length,
5130 gimple stmt,
5131 gimple_stmt_iterator *gsi,
5132 vec<tree> *result_chain)
5133 {
5134 tree data_ref, first_vect, second_vect;
5135 tree perm_mask_even, perm_mask_odd;
5136 tree perm3_mask_low, perm3_mask_high;
5137 gimple perm_stmt;
5138 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5139 unsigned int i, j, log_length = exact_log2 (length);
5140 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5141 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5142
5143 result_chain->quick_grow (length);
5144 memcpy (result_chain->address (), dr_chain.address (),
5145 length * sizeof (tree));
5146
5147 if (length == 3)
5148 {
5149 unsigned int k;
5150
5151 for (k = 0; k < 3; k++)
5152 {
5153 for (i = 0; i < nelt; i++)
5154 if (3 * i + k < 2 * nelt)
5155 sel[i] = 3 * i + k;
5156 else
5157 sel[i] = 0;
5158 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5159
5160 for (i = 0, j = 0; i < nelt; i++)
5161 if (3 * i + k < 2 * nelt)
5162 sel[i] = i;
5163 else
5164 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5165
5166 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5167
5168 first_vect = dr_chain[0];
5169 second_vect = dr_chain[1];
5170
5171 /* Create interleaving stmt (low part of):
5172 	     low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5173 ...}> */
5174 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5175 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5176 second_vect, perm3_mask_low);
5177 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5178
5179 /* Create interleaving stmt (high part of):
5180 	     high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5181 ...}> */
5182 first_vect = data_ref;
5183 second_vect = dr_chain[2];
5184 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5185 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5186 second_vect, perm3_mask_high);
5187 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5188 (*result_chain)[k] = data_ref;
5189 }
5190 }
5191 else
5192 {
5193 /* If length is not equal to 3 then only power of 2 is supported. */
5194 gcc_assert (exact_log2 (length) != -1);
5195
5196 for (i = 0; i < nelt; ++i)
5197 sel[i] = i * 2;
5198 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5199
5200 for (i = 0; i < nelt; ++i)
5201 sel[i] = i * 2 + 1;
5202 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5203
5204 for (i = 0; i < log_length; i++)
5205 {
5206 for (j = 0; j < length; j += 2)
5207 {
5208 first_vect = dr_chain[j];
5209 second_vect = dr_chain[j+1];
5210
5211 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5212 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5213 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5214 first_vect, second_vect,
5215 perm_mask_even);
5216 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5217 (*result_chain)[j/2] = data_ref;
5218
5219 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5220 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5221 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5222 first_vect, second_vect,
5223 perm_mask_odd);
5224 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5225 (*result_chain)[j/2+length/2] = data_ref;
5226 }
5227 memcpy (dr_chain.address (), result_chain->address (),
5228 length * sizeof (tree));
5229 }
5230 }
5231 }
5232
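/* The following is a standalone illustration, not part of the vectorizer:
   a minimal sketch that reproduces the even/odd extraction performed by
   vect_permute_load_chain above, assuming NELT == 8 and LENGTH == 4 as in
   the worked example in the function comment.  The helpers NELT, LENGTH
   and vec_perm are local to this sketch.  Guarded by "#if 0" because it is
   meant to be read (or compiled on its own), not built into GCC.  */
#if 0
#include <stdio.h>
#include <string.h>

#define NELT 8
#define LENGTH 4

/* Emulate VEC_PERM_EXPR <a, b, sel> on integer "vectors".  */
static void
vec_perm (const int *a, const int *b, const unsigned char *sel, int *out)
{
  for (int i = 0; i < NELT; i++)
    out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
}

int
main (void)
{
  int chain[LENGTH][NELT], result[LENGTH][NELT];
  unsigned char sel_even[NELT], sel_odd[NELT];

  /* Element values are their original positions, 0 .. 31.  */
  for (int j = 0; j < LENGTH; j++)
    for (int i = 0; i < NELT; i++)
      chain[j][i] = j * NELT + i;

  for (int i = 0; i < NELT; i++)
    {
      sel_even[i] = 2 * i;      /* extract_even selector.  */
      sel_odd[i] = 2 * i + 1;   /* extract_odd selector.  */
    }

  /* log2 (LENGTH) stages, mirroring the loop in the function above.  */
  for (int stage = 0; stage < 2; stage++)
    {
      for (int j = 0; j < LENGTH; j += 2)
        {
          vec_perm (chain[j], chain[j + 1], sel_even, result[j / 2]);
          vec_perm (chain[j], chain[j + 1], sel_odd,
                    result[j / 2 + LENGTH / 2]);
        }
      memcpy (chain, result, sizeof (chain));
    }

  /* Prints 0 4 8 .. 28 / 1 5 9 .. 29 / 2 6 .. 30 / 3 7 .. 31, matching the
     RESULT_CHAIN ordering documented in the function comment above.  */
  for (int j = 0; j < LENGTH; j++)
    {
      for (int i = 0; i < NELT; i++)
        printf (" %2d", chain[j][i]);
      printf ("\n");
    }
  return 0;
}
#endif
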
5233 /* Function vect_shift_permute_load_chain.
5234
5235 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5236 a sequence of stmts to reorder the input data accordingly.
5237 Return the final references for loads in RESULT_CHAIN.
5238 Return true if the reordering succeeded, false otherwise.
5239
5240 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5241 The input is 3 vectors each containing 8 elements. We assign a
5242 number to each element, the input sequence is:
5243
5244 1st vec: 0 1 2 3 4 5 6 7
5245 2nd vec: 8 9 10 11 12 13 14 15
5246 3rd vec: 16 17 18 19 20 21 22 23
5247
5248 The output sequence should be:
5249
5250 1st vec: 0 3 6 9 12 15 18 21
5251 2nd vec: 1 4 7 10 13 16 19 22
5252 3rd vec: 2 5 8 11 14 17 20 23
5253
5254 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create this output.
5255
5256 First we shuffle all 3 vectors to get correct elements order:
5257
5258 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5259 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5260 3rd vec: (16 19 22) (17 20 23) (18 21)
5261
5262 Next we unite and shift the vectors 3 times:
5263
5264 1st step:
5265 shift right by 6 the concatenation of:
5266 "1st vec" and "2nd vec"
5267 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5268 "2nd vec" and "3rd vec"
5269 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5270 "3rd vec" and "1st vec"
5271 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5272 | New vectors |
5273
5274 So that now new vectors are:
5275
5276 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5277 2nd vec: (10 13) (16 19 22) (17 20 23)
5278 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5279
5280 2nd step:
5281 shift right by 5 the concatenation of:
5282 "1st vec" and "3rd vec"
5283 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5284 "2nd vec" and "1st vec"
5285 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5286 "3rd vec" and "2nd vec"
5287 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5288 | New vectors |
5289
5290 So that now new vectors are:
5291
5292 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5293 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5294 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5295
5296 3rd step:
5297 shift right by 5 the concatenation of:
5298 "1st vec" and "1st vec"
5299 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5300 shift right by 3 the concatenation of:
5301 "2nd vec" and "2nd vec"
5302 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5303 | New vectors |
5304
5305 So that now all vectors are READY:
5306 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5307 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5308 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5309
5310 This algorithm is faster than the one in vect_permute_load_chain if:
5311 1. "shift of a concatenation" is faster than general permutation.
5312 This is usually so.
5313 2. The TARGET machine can't execute vector instructions in parallel.
5314 This matters because each step of this algorithm depends on the previous
5315 one, whereas the algorithm in vect_permute_load_chain is much more parallel.
5316
5317 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5318 */
5319
5320 static bool
5321 vect_shift_permute_load_chain (vec<tree> dr_chain,
5322 unsigned int length,
5323 gimple stmt,
5324 gimple_stmt_iterator *gsi,
5325 vec<tree> *result_chain)
5326 {
5327 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5328 tree perm2_mask1, perm2_mask2, perm3_mask;
5329 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5330 gimple perm_stmt;
5331
5332 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5333 unsigned int i;
5334 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5335 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5336 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5337 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5338
5339 result_chain->quick_grow (length);
5340 memcpy (result_chain->address (), dr_chain.address (),
5341 length * sizeof (tree));
5342
5343 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5344 {
5345 unsigned int j, log_length = exact_log2 (length);
5346 for (i = 0; i < nelt / 2; ++i)
5347 sel[i] = i * 2;
5348 for (i = 0; i < nelt / 2; ++i)
5349 sel[nelt / 2 + i] = i * 2 + 1;
5350 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5351 {
5352 if (dump_enabled_p ())
5353 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5354 "shuffle of 2 fields structure is not \
5355 supported by target\n");
5356 return false;
5357 }
5358 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5359
5360 for (i = 0; i < nelt / 2; ++i)
5361 sel[i] = i * 2 + 1;
5362 for (i = 0; i < nelt / 2; ++i)
5363 sel[nelt / 2 + i] = i * 2;
5364 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5365 {
5366 if (dump_enabled_p ())
5367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5368 "shuffle of 2 fields structure is not \
5369 supported by target\n");
5370 return false;
5371 }
5372 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5373
5374 /* Generating permutation constant to shift all elements.
5375 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5376 for (i = 0; i < nelt; i++)
5377 sel[i] = nelt / 2 + i;
5378 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5379 {
5380 if (dump_enabled_p ())
5381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5382 "shift permutation is not supported by target\n");
5383 return false;
5384 }
5385 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5386
5387 /* Generating permutation constant to select a vector from 2.
5388 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5389 for (i = 0; i < nelt / 2; i++)
5390 sel[i] = i;
5391 for (i = nelt / 2; i < nelt; i++)
5392 sel[i] = nelt + i;
5393 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5394 {
5395 if (dump_enabled_p ())
5396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5397 "select is not supported by target\n");
5398 return false;
5399 }
5400 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5401
5402 for (i = 0; i < log_length; i++)
5403 {
5404 for (j = 0; j < length; j += 2)
5405 {
5406 first_vect = dr_chain[j];
5407 second_vect = dr_chain[j + 1];
5408
5409 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5410 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5411 first_vect, first_vect,
5412 perm2_mask1);
5413 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5414 vect[0] = data_ref;
5415
5416 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5417 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5418 second_vect, second_vect,
5419 perm2_mask2);
5420 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5421 vect[1] = data_ref;
5422
5423 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5424 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5425 vect[0], vect[1], shift1_mask);
5426 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5427 (*result_chain)[j/2 + length/2] = data_ref;
5428
5429 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5430 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5431 vect[0], vect[1], select_mask);
5432 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5433 (*result_chain)[j/2] = data_ref;
5434 }
5435 memcpy (dr_chain.address (), result_chain->address (),
5436 length * sizeof (tree));
5437 }
5438 return true;
5439 }
5440 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5441 {
5442 unsigned int k = 0, l = 0;
5443
5444 /* Generating permutation constant to get all elements in the right order.
5445 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5446 for (i = 0; i < nelt; i++)
5447 {
5448 if (3 * k + (l % 3) >= nelt)
5449 {
5450 k = 0;
5451 l += (3 - (nelt % 3));
5452 }
5453 sel[i] = 3 * k + (l % 3);
5454 k++;
5455 }
5456 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5457 {
5458 if (dump_enabled_p ())
5459 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5460 "shuffle of 3 fields structure is not \
5461 supported by target\n");
5462 return false;
5463 }
5464 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5465
5466 /* Generating permutation constant to shift all elements.
5467 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5468 for (i = 0; i < nelt; i++)
5469 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5470 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5471 {
5472 if (dump_enabled_p ())
5473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5474 "shift permutation is not supported by target\n");
5475 return false;
5476 }
5477 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5478
5479 /* Generating permutation constant to shift all elements.
5480 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5481 for (i = 0; i < nelt; i++)
5482 sel[i] = 2 * (nelt / 3) + 1 + i;
5483 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5484 {
5485 if (dump_enabled_p ())
5486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5487 "shift permutation is not supported by target\n");
5488 return false;
5489 }
5490 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5491
5492 /* Generating permutation constant to shift all elements.
5493 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5494 for (i = 0; i < nelt; i++)
5495 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5496 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5497 {
5498 if (dump_enabled_p ())
5499 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5500 "shift permutation is not supported by target\n");
5501 return false;
5502 }
5503 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5504
5505 /* Generating permutation constant to shift all elements.
5506 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5507 for (i = 0; i < nelt; i++)
5508 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5509 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5510 {
5511 if (dump_enabled_p ())
5512 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5513 "shift permutation is not supported by target\n");
5514 return false;
5515 }
5516 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5517
5518 for (k = 0; k < 3; k++)
5519 {
5520 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5521 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5522 dr_chain[k], dr_chain[k],
5523 perm3_mask);
5524 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5525 vect[k] = data_ref;
5526 }
5527
5528 for (k = 0; k < 3; k++)
5529 {
5530 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5531 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5532 vect[k % 3], vect[(k + 1) % 3],
5533 shift1_mask);
5534 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5535 vect_shift[k] = data_ref;
5536 }
5537
5538 for (k = 0; k < 3; k++)
5539 {
5540 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5541 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5542 vect_shift[(4 - k) % 3],
5543 vect_shift[(3 - k) % 3],
5544 shift2_mask);
5545 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5546 vect[k] = data_ref;
5547 }
5548
5549 (*result_chain)[3 - (nelt % 3)] = vect[2];
5550
5551 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5552 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5553 vect[0], shift3_mask);
5554 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5555 (*result_chain)[nelt % 3] = data_ref;
5556
5557 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5558 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5559 vect[1], shift4_mask);
5560 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5561 (*result_chain)[0] = data_ref;
5562 return true;
5563 }
5564 return false;
5565 }
5566
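/* Standalone sketch, not built into GCC: it recomputes, for NELT == 8, the
   selector masks that vect_shift_permute_load_chain above builds, so the
   constants quoted in its comments ({0 3 6 1 4 7 2 5}, {6 7 8 9 10 11 12 13},
   {0 1 2 3 12 13 14 15}, ...) can be checked by hand.  NELT and print_sel
   are local to this sketch.  */
#if 0
#include <stdio.h>

#define NELT 8

static void
print_sel (const char *name, const unsigned char *sel)
{
  printf ("%s:", name);
  for (int i = 0; i < NELT; i++)
    printf (" %d", sel[i]);
  printf ("\n");
}

int
main (void)
{
  unsigned char sel[NELT];
  int i, k = 0, l = 0;

  /* 2-field case: gather even elements, then odd ones (perm2_mask1).  */
  for (i = 0; i < NELT / 2; i++)
    sel[i] = i * 2;
  for (i = 0; i < NELT / 2; i++)
    sel[NELT / 2 + i] = i * 2 + 1;
  print_sel ("perm2_mask1", sel);               /* 0 2 4 6 1 3 5 7  */

  /* 2-field case: shift the concatenation by NELT / 2 (shift1_mask).  */
  for (i = 0; i < NELT; i++)
    sel[i] = NELT / 2 + i;
  print_sel ("shift1_mask (2 fields)", sel);    /* 4 5 6 7 8 9 10 11  */

  /* 2-field case: first half of vec1, second half of vec2 (select_mask).  */
  for (i = 0; i < NELT / 2; i++)
    sel[i] = i;
  for (i = NELT / 2; i < NELT; i++)
    sel[i] = NELT + i;
  print_sel ("select_mask", sel);               /* 0 1 2 3 12 13 14 15  */

  /* 3-field case: bring each field's elements together (perm3_mask).  */
  for (i = 0; i < NELT; i++)
    {
      if (3 * k + (l % 3) >= NELT)
        {
          k = 0;
          l += 3 - (NELT % 3);
        }
      sel[i] = 3 * k + (l % 3);
      k++;
    }
  print_sel ("perm3_mask", sel);                /* 0 3 6 1 4 7 2 5  */

  /* 3-field case: the shift masks.  */
  for (i = 0; i < NELT; i++)
    sel[i] = 2 * (NELT / 3) + (NELT % 3) + i;
  print_sel ("shift1_mask (3 fields)", sel);    /* 6 7 8 9 10 11 12 13  */
  for (i = 0; i < NELT; i++)
    sel[i] = 2 * (NELT / 3) + 1 + i;
  print_sel ("shift2_mask", sel);               /* 5 6 7 8 9 10 11 12  */
  for (i = 0; i < NELT; i++)
    sel[i] = (NELT / 3) + (NELT % 3) / 2 + i;
  print_sel ("shift3_mask", sel);               /* 3 4 5 6 7 8 9 10  */
  return 0;
}
#endif
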
5567 /* Function vect_transform_grouped_load.
5568
5569 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5570 to perform their permutation and ascribe the resulting vectorized statements to
5571 the scalar statements.
5572 */
5573
5574 void
5575 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
5576 gimple_stmt_iterator *gsi)
5577 {
5578 machine_mode mode;
5579 vec<tree> result_chain = vNULL;
5580
5581 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5582 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
5583 vectors that are ready for vector computation. */
5584 result_chain.create (size);
5585
5586 /* If the reassociation width for the vector type is 2 or greater, the target
5587 machine can execute 2 or more vector instructions in parallel.  Otherwise try
5588 to get the chain for the load group using vect_shift_permute_load_chain. */
5589 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5590 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5591 || exact_log2 (size) != -1
5592 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5593 gsi, &result_chain))
5594 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5595 vect_record_grouped_load_vectors (stmt, result_chain);
5596 result_chain.release ();
5597 }
5598
5599 /* RESULT_CHAIN contains the output of the grouped loads that were
5600 generated as part of the vectorization of STMT. Assign the statement
5601 for each vector to the associated scalar statement. */
5602
5603 void
5604 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
5605 {
5606 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5607 gimple next_stmt, new_stmt;
5608 unsigned int i, gap_count;
5609 tree tmp_data_ref;
5610
5611 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5612 Since we scan the chain starting from its first node, their order
5613 corresponds to the order of data-refs in RESULT_CHAIN. */
5614 next_stmt = first_stmt;
5615 gap_count = 1;
5616 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5617 {
5618 if (!next_stmt)
5619 break;
5620
5621 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5622 code elimination pass later. No need to check for the first stmt in
5623 the group, since it always exists.
5624 GROUP_GAP is the number of steps in elements from the previous
5625 access (if there is no gap GROUP_GAP is 1). We skip loads that
5626 correspond to the gaps. */
5627 if (next_stmt != first_stmt
5628 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5629 {
5630 gap_count++;
5631 continue;
5632 }
5633
5634 while (next_stmt)
5635 {
5636 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5637 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5638 copies, and we put the new vector statement in the first available
5639 RELATED_STMT. */
5640 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5641 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5642 else
5643 {
5644 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5645 {
5646 gimple prev_stmt =
5647 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5648 gimple rel_stmt =
5649 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5650 while (rel_stmt)
5651 {
5652 prev_stmt = rel_stmt;
5653 rel_stmt =
5654 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5655 }
5656
5657 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5658 new_stmt;
5659 }
5660 }
5661
5662 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5663 gap_count = 1;
5664 /* If NEXT_STMT accesses the same DR as the previous statement,
5665 put the same TMP_DATA_REF as its vectorized statement; otherwise
5666 get the next data-ref from RESULT_CHAIN. */
5667 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5668 break;
5669 }
5670 }
5671 }
5672
5673 /* Function vect_can_force_dr_alignment_p.
5674
5675 Returns whether the alignment of a DECL can be forced to be aligned
5676 on an ALIGNMENT-bit boundary. */
5677
5678 bool
5679 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5680 {
5681 if (TREE_CODE (decl) != VAR_DECL)
5682 return false;
5683
5684 if (decl_in_symtab_p (decl)
5685 && !symtab_node::get (decl)->can_increase_alignment_p ())
5686 return false;
5687
5688 if (TREE_STATIC (decl))
5689 return (alignment <= MAX_OFILE_ALIGNMENT);
5690 else
5691 return (alignment <= MAX_STACK_ALIGNMENT);
5692 }
5693
5694
5695 /* Return whether the data reference DR is supported with respect to its
5696 alignment.
5697 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5698 if it is aligned, i.e., check if it is possible to vectorize it with different
5699 alignment. */
5700
5701 enum dr_alignment_support
5702 vect_supportable_dr_alignment (struct data_reference *dr,
5703 bool check_aligned_accesses)
5704 {
5705 gimple stmt = DR_STMT (dr);
5706 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5707 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5708 machine_mode mode = TYPE_MODE (vectype);
5709 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5710 struct loop *vect_loop = NULL;
5711 bool nested_in_vect_loop = false;
5712
5713 if (aligned_access_p (dr) && !check_aligned_accesses)
5714 return dr_aligned;
5715
5716 /* For now assume all conditional loads/stores support unaligned
5717 access without any special code. */
5718 if (is_gimple_call (stmt)
5719 && gimple_call_internal_p (stmt)
5720 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5721 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5722 return dr_unaligned_supported;
5723
5724 if (loop_vinfo)
5725 {
5726 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5727 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5728 }
5729
5730 /* Possibly unaligned access. */
5731
5732 /* We can choose between using the implicit realignment scheme (generating
5733 a misaligned_move stmt) and the explicit realignment scheme (generating
5734 aligned loads with a REALIGN_LOAD). There are two variants to the
5735 explicit realignment scheme: optimized, and unoptimized.
5736 We can optimize the realignment only if the step between consecutive
5737 vector loads is equal to the vector size. Since the vector memory
5738 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5739 is guaranteed that the misalignment amount remains the same throughout the
5740 execution of the vectorized loop. Therefore, we can create the
5741 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5742 at the loop preheader.
5743
5744 However, in the case of outer-loop vectorization, when vectorizing a
5745 memory access in the inner-loop nested within the LOOP that is now being
5746 vectorized, while it is guaranteed that the misalignment of the
5747 vectorized memory access will remain the same in different outer-loop
5748 iterations, it is *not* guaranteed that is will remain the same throughout
5749 the execution of the inner-loop. This is because the inner-loop advances
5750 with the original scalar step (and not in steps of VS). If the inner-loop
5751 step happens to be a multiple of VS, then the misalignment remains fixed
5752 and we can use the optimized realignment scheme. For example:
5753
5754 for (i=0; i<N; i++)
5755 for (j=0; j<M; j++)
5756 s += a[i+j];
5757
5758 When vectorizing the i-loop in the above example, the step between
5759 consecutive vector loads is 1, and so the misalignment does not remain
5760 fixed across the execution of the inner-loop, and the realignment cannot
5761 be optimized (as illustrated in the following pseudo vectorized loop):
5762
5763 for (i=0; i<N; i+=4)
5764 for (j=0; j<M; j++){
5765 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5766 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5767 // (assuming that we start from an aligned address).
5768 }
5769
5770 We therefore have to use the unoptimized realignment scheme:
5771
5772 for (i=0; i<N; i+=4)
5773 for (j=k; j<M; j+=4)
5774 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5775 // that the misalignment of the initial address is
5776 // 0).
5777
5778 The loop can then be vectorized as follows:
5779
5780 for (k=0; k<4; k++){
5781 rt = get_realignment_token (&vp[k]);
5782 for (i=0; i<N; i+=4){
5783 v1 = vp[i+k];
5784 for (j=k; j<M; j+=4){
5785 v2 = vp[i+j+VS-1];
5786 va = REALIGN_LOAD <v1,v2,rt>;
5787 vs += va;
5788 v1 = v2;
5789 }
5790 }
5791 } */
5792
5793 if (DR_IS_READ (dr))
5794 {
5795 bool is_packed = false;
5796 tree type = (TREE_TYPE (DR_REF (dr)));
5797
5798 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5799 && (!targetm.vectorize.builtin_mask_for_load
5800 || targetm.vectorize.builtin_mask_for_load ()))
5801 {
5802 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5803 if ((nested_in_vect_loop
5804 && (TREE_INT_CST_LOW (DR_STEP (dr))
5805 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5806 || !loop_vinfo)
5807 return dr_explicit_realign;
5808 else
5809 return dr_explicit_realign_optimized;
5810 }
5811 if (!known_alignment_for_access_p (dr))
5812 is_packed = not_size_aligned (DR_REF (dr));
5813
5814 if ((TYPE_USER_ALIGN (type) && !is_packed)
5815 || targetm.vectorize.
5816 support_vector_misalignment (mode, type,
5817 DR_MISALIGNMENT (dr), is_packed))
5818 /* Can't software pipeline the loads, but can at least do them. */
5819 return dr_unaligned_supported;
5820 }
5821 else
5822 {
5823 bool is_packed = false;
5824 tree type = (TREE_TYPE (DR_REF (dr)));
5825
5826 if (!known_alignment_for_access_p (dr))
5827 is_packed = not_size_aligned (DR_REF (dr));
5828
5829 if ((TYPE_USER_ALIGN (type) && !is_packed)
5830 || targetm.vectorize.
5831 support_vector_misalignment (mode, type,
5832 DR_MISALIGNMENT (dr), is_packed))
5833 return dr_unaligned_supported;
5834 }
5835
5836 /* Unsupported. */
5837 return dr_unaligned_unsupported;
5838 }
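
/* Standalone sketch, not part of GCC: a numeric check of the misalignment
   argument in the comment inside vect_supportable_dr_alignment above.  With a
   vector size of VS == 4 elements and an aligned base, the misalignment of
   &vp[i+j] cycles through {0,1,2,3,...} when the inner loop advances by the
   scalar step, but stays equal to k when it advances by VS starting at k.
   VS is an assumption local to this sketch.  */
#if 0
#include <stdio.h>

#define VS 4

int
main (void)
{
  int i = 0, j;

  printf ("scalar inner step:   ");
  for (j = 0; j < 2 * VS; j++)
    printf ("%d ", (i + j) % VS);       /* 0 1 2 3 0 1 2 3  */
  printf ("\n");

  for (int k = 0; k < VS; k++)
    {
      printf ("inner step VS, k=%d: ", k);
      for (j = k; j < k + 2 * VS * VS; j += VS)
        printf ("%d ", (i + j) % VS);   /* always k  */
      printf ("\n");
    }
  return 0;
}
#endif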