1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "predict.h"
31 #include "tm_p.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "cgraph.h"
35 #include "dumpfile.h"
36 #include "alias.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "tree-eh.h"
40 #include "gimplify.h"
41 #include "gimple-iterator.h"
42 #include "gimplify-me.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "tree-ssa-loop-manip.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "expr.h"
50 #include "builtins.h"
51 #include "params.h"
52
53 /* Return true if load- or store-lanes optab OPTAB is implemented for
54 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
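/* For example, with a 128-bit vector mode and COUNT == 2 the array mode
must be a 256-bit integer mode; if no suitable integer mode exists (and
the target does not advertise support via array_mode_supported_p),
mode_for_size returns BLKmode and the lanes optab is not used.  */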
55
56 static bool
57 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
58 tree vectype, unsigned HOST_WIDE_INT count)
59 {
60 machine_mode mode, array_mode;
61 bool limit_p;
62
63 mode = TYPE_MODE (vectype);
64 limit_p = !targetm.array_mode_supported_p (mode, count);
65 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
66 MODE_INT, limit_p);
67
68 if (array_mode == BLKmode)
69 {
70 if (dump_enabled_p ())
71 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
72 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]\n",
73 GET_MODE_NAME (mode), count);
74 return false;
75 }
76
77 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
78 {
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
81 "cannot use %s<%s><%s>\n", name,
82 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
83 return false;
84 }
85
86 if (dump_enabled_p ())
87 dump_printf_loc (MSG_NOTE, vect_location,
88 "can use %s<%s><%s>\n", name, GET_MODE_NAME (array_mode),
89 GET_MODE_NAME (mode));
90
91 return true;
92 }
93
94
95 /* Return the smallest scalar part of STMT.
96 This is used to determine the vectype of the stmt. We generally set the
97 vectype according to the type of the result (lhs). For stmts whose
98 result-type is different than the type of the arguments (e.g., demotion,
99 promotion), vectype will be reset appropriately (later). Note that we have
100 to visit the smallest datatype in this function, because that determines the
101 VF. If the smallest datatype in the loop is present only as the rhs of a
102 promotion operation - we'd miss it.
103 Such a case, where a variable of this datatype does not appear in the lhs
104 anywhere in the loop, can only occur if it's an invariant: e.g.:
105 'int_x = (int) short_inv', which we'd expect to have been optimized away by
106 invariant motion. However, we cannot rely on invariant motion to always
107 take invariants out of the loop, and so in the case of promotion we also
108 have to check the rhs.
109 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
110 types. */
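/* For example, for a widening multiply such as
'int_prod = (int) short_a * (int) short_b' (a WIDEN_MULT_EXPR), the lhs
type is int but the smallest scalar type is short, and it is the short
accesses that determine the vectorization factor.  */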
111
112 tree
113 vect_get_smallest_scalar_type (gimple *stmt, HOST_WIDE_INT *lhs_size_unit,
114 HOST_WIDE_INT *rhs_size_unit)
115 {
116 tree scalar_type = gimple_expr_type (stmt);
117 HOST_WIDE_INT lhs, rhs;
118
119 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
120
121 if (is_gimple_assign (stmt)
122 && (gimple_assign_cast_p (stmt)
123 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
124 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
125 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
126 {
127 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
128
129 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
130 if (rhs < lhs)
131 scalar_type = rhs_type;
132 }
133
134 *lhs_size_unit = lhs;
135 *rhs_size_unit = rhs;
136 return scalar_type;
137 }
138
139
140 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
141 tested at run-time. Return TRUE if DDR was successfully inserted.
142 Return false if versioning is not supported. */
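/* The run-time test emitted later for such a DDR is roughly a check that
the address ranges accessed through DDR_A and DDR_B during the vectorized
loop do not overlap, e.g.
(addr_a + segment_len_a <= addr_b) || (addr_b + segment_len_b <= addr_a),
and the loop is versioned on the conjunction of all such checks.  */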
143
144 static bool
145 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
146 {
147 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
148
149 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
150 return false;
151
152 if (dump_enabled_p ())
153 {
154 dump_printf_loc (MSG_NOTE, vect_location,
155 "mark for run-time aliasing test between ");
156 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
157 dump_printf (MSG_NOTE, " and ");
158 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
159 dump_printf (MSG_NOTE, "\n");
160 }
161
162 if (optimize_loop_nest_for_size_p (loop))
163 {
164 if (dump_enabled_p ())
165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
166 "versioning not supported when optimizing"
167 " for size.\n");
168 return false;
169 }
170
171 /* FORNOW: We don't support versioning with outer-loop vectorization. */
172 if (loop->inner)
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
176 "versioning not yet supported for outer-loops.\n");
177 return false;
178 }
179
180 /* FORNOW: We don't support creating runtime alias tests for non-constant
181 step. */
182 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
183 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
184 {
185 if (dump_enabled_p ())
186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
187 "versioning not yet supported for non-constant "
188 "step\n");
189 return false;
190 }
191
192 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
193 return true;
194 }
195
196
197 /* Function vect_analyze_data_ref_dependence.
198
199 Return TRUE if there (might) exist a dependence between a memory-reference
200 DRA and a memory-reference DRB.  If the dependence can instead be checked
201 at run-time by versioning for alias, return FALSE.  Adjust *MAX_VF according to
202 the data dependence. */
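/* For example, for a loop body like 'a[i+3] = a[i] + x' the dependence
distance is 3, so *MAX_VF is reduced to 3; with a larger vectorization
factor the vectorized load of a[i+3] would be executed before the store
from the iteration that produces it.  */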
203
204 static bool
205 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
206 loop_vec_info loop_vinfo, int *max_vf)
207 {
208 unsigned int i;
209 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
210 struct data_reference *dra = DDR_A (ddr);
211 struct data_reference *drb = DDR_B (ddr);
212 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
213 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
214 lambda_vector dist_v;
215 unsigned int loop_depth;
216
217 /* In loop analysis all data references should be vectorizable. */
218 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
219 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
220 gcc_unreachable ();
221
222 /* Independent data accesses. */
223 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
224 return false;
225
226 if (dra == drb
227 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
228 return false;
229
230 /* Even if we have an anti-dependence then, as the vectorized loop covers at
231 least two scalar iterations, there is always also a true dependence.
232 As the vectorizer does not re-order loads and stores, we can ignore
233 the anti-dependence if TBAA can disambiguate both DRs, similar to the
234 case with known negative distance anti-dependences (positive
235 distance anti-dependences would violate TBAA constraints). */
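/* For example, an anti-dependence between a read through 'int *p' and a
write through 'float *q' can be ignored here, since under TBAA their
alias sets do not conflict.  */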
236 if (((DR_IS_READ (dra) && DR_IS_WRITE (drb))
237 || (DR_IS_WRITE (dra) && DR_IS_READ (drb)))
238 && !alias_sets_conflict_p (get_alias_set (DR_REF (dra)),
239 get_alias_set (DR_REF (drb))))
240 return false;
241
242 /* Unknown data dependence. */
243 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
244 {
245 /* If user asserted safelen consecutive iterations can be
246 executed concurrently, assume independence. */
247 if (loop->safelen >= 2)
248 {
249 if (loop->safelen < *max_vf)
250 *max_vf = loop->safelen;
251 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
252 return false;
253 }
254
255 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
256 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
257 {
258 if (dump_enabled_p ())
259 {
260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
261 "versioning for alias not supported for: "
262 "can't determine dependence between ");
263 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
264 DR_REF (dra));
265 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
266 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
267 DR_REF (drb));
268 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
269 }
270 return true;
271 }
272
273 if (dump_enabled_p ())
274 {
275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
276 "versioning for alias required: "
277 "can't determine dependence between ");
278 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
279 DR_REF (dra));
280 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
281 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
282 DR_REF (drb));
283 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
284 }
285
286 /* Add to list of ddrs that need to be tested at run-time. */
287 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
288 }
289
290 /* Known data dependence. */
291 if (DDR_NUM_DIST_VECTS (ddr) == 0)
292 {
293 /* If user asserted safelen consecutive iterations can be
294 executed concurrently, assume independence. */
295 if (loop->safelen >= 2)
296 {
297 if (loop->safelen < *max_vf)
298 *max_vf = loop->safelen;
299 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = false;
300 return false;
301 }
302
303 if (STMT_VINFO_GATHER_SCATTER_P (stmtinfo_a)
304 || STMT_VINFO_GATHER_SCATTER_P (stmtinfo_b))
305 {
306 if (dump_enabled_p ())
307 {
308 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
309 "versioning for alias not supported for: "
310 "bad dist vector for ");
311 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
312 DR_REF (dra));
313 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
314 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
315 DR_REF (drb));
316 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
317 }
318 return true;
319 }
320
321 if (dump_enabled_p ())
322 {
323 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
324 "versioning for alias required: "
325 "bad dist vector for ");
326 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
327 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
328 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
329 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
330 }
331 /* Add to list of ddrs that need to be tested at run-time. */
332 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
333 }
334
335 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
336 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
337 {
338 int dist = dist_v[loop_depth];
339
340 if (dump_enabled_p ())
341 dump_printf_loc (MSG_NOTE, vect_location,
342 "dependence distance = %d.\n", dist);
343
344 if (dist == 0)
345 {
346 if (dump_enabled_p ())
347 {
348 dump_printf_loc (MSG_NOTE, vect_location,
349 "dependence distance == 0 between ");
350 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
351 dump_printf (MSG_NOTE, " and ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
353 dump_printf (MSG_NOTE, "\n");
354 }
355
356 /* When we perform grouped accesses and perform implicit CSE
357 by detecting equal accesses and doing disambiguation with
358 runtime alias tests like for
359 .. = a[i];
360 .. = a[i+1];
361 a[i] = ..;
362 a[i+1] = ..;
363 *p = ..;
364 .. = a[i];
365 .. = a[i+1];
366 where we will end up loading { a[i], a[i+1] } once, make
367 sure that inserting group loads before the first load and
368 stores after the last store will do the right thing.
369 Similar for groups like
370 a[i] = ...;
371 ... = a[i];
372 a[i+1] = ...;
373 where loads from the group interleave with the store. */
374 if (STMT_VINFO_GROUPED_ACCESS (stmtinfo_a)
375 || STMT_VINFO_GROUPED_ACCESS (stmtinfo_b))
376 {
377 gimple *earlier_stmt;
378 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
379 if (DR_IS_WRITE
380 (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
381 {
382 if (dump_enabled_p ())
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
384 "READ_WRITE dependence in interleaving."
385 "\n");
386 return true;
387 }
388 }
389
390 continue;
391 }
392
393 if (dist > 0 && DDR_REVERSED_P (ddr))
394 {
395 /* If DDR_REVERSED_P the order of the data-refs in DDR was
396 reversed (to make distance vector positive), and the actual
397 distance is negative. */
398 if (dump_enabled_p ())
399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
400 "dependence distance negative.\n");
401 /* Record a negative dependence distance to later limit the
402 amount of stmt copying / unrolling we can perform.
403 Only need to handle read-after-write dependence. */
404 if (DR_IS_READ (drb)
405 && (STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) == 0
406 || STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) > (unsigned)dist))
407 STMT_VINFO_MIN_NEG_DIST (stmtinfo_b) = dist;
408 continue;
409 }
410
411 if (abs (dist) >= 2
412 && abs (dist) < *max_vf)
413 {
414 /* The dependence distance requires reduction of the maximal
415 vectorization factor. */
416 *max_vf = abs (dist);
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_NOTE, vect_location,
419 "adjusting maximal vectorization factor to %i\n",
420 *max_vf);
421 }
422
423 if (abs (dist) >= *max_vf)
424 {
425 /* Dependence distance does not create dependence, as far as
426 vectorization is concerned, in this case. */
427 if (dump_enabled_p ())
428 dump_printf_loc (MSG_NOTE, vect_location,
429 "dependence distance >= VF.\n");
430 continue;
431 }
432
433 if (dump_enabled_p ())
434 {
435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
436 "not vectorized, possible dependence "
437 "between data-refs ");
438 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
439 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
440 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
441 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
442 }
443
444 return true;
445 }
446
447 return false;
448 }
449
450 /* Function vect_analyze_data_ref_dependences.
451
452 Examine all the data references in the loop, and make sure there do not
453 exist any data dependences between them. Set *MAX_VF according to
454 the maximum vectorization factor the data dependences allow. */
455
456 bool
457 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
458 {
459 unsigned int i;
460 struct data_dependence_relation *ddr;
461
462 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "=== vect_analyze_data_ref_dependences ===\n");
465
466 LOOP_VINFO_DDRS (loop_vinfo)
467 .create (LOOP_VINFO_DATAREFS (loop_vinfo).length ()
468 * LOOP_VINFO_DATAREFS (loop_vinfo).length ());
469 LOOP_VINFO_NO_DATA_DEPENDENCIES (loop_vinfo) = true;
470 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
471 &LOOP_VINFO_DDRS (loop_vinfo),
472 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
473 return false;
474
475 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
476 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
477 return false;
478
479 return true;
480 }
481
482
483 /* Function vect_slp_analyze_data_ref_dependence.
484
485 Return TRUE if there (might) exist a dependence between a memory-reference
486 DRA and a memory-reference DRB that prohibits vectorization of the
487 basic block containing them.  Return FALSE if the dependence is known
488 not to prevent basic-block vectorization. */
489
490 static bool
491 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
492 {
493 struct data_reference *dra = DDR_A (ddr);
494 struct data_reference *drb = DDR_B (ddr);
495
496 /* We need to check dependences of statements marked as unvectorizable
497 as well; they can still prohibit vectorization. */
498
499 /* Independent data accesses. */
500 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
501 return false;
502
503 if (dra == drb)
504 return false;
505
506 /* Read-read is OK. */
507 if (DR_IS_READ (dra) && DR_IS_READ (drb))
508 return false;
509
510 /* If dra and drb are part of the same interleaving chain consider
511 them independent. */
512 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
513 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
514 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
515 return false;
516
517 /* Unknown data dependence. */
518 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
519 {
520 if (dump_enabled_p ())
521 {
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "can't determine dependence between ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
525 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
526 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
527 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
528 }
529 }
530 else if (dump_enabled_p ())
531 {
532 dump_printf_loc (MSG_NOTE, vect_location,
533 "determined dependence between ");
534 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
535 dump_printf (MSG_NOTE, " and ");
536 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
537 dump_printf (MSG_NOTE, "\n");
538 }
539
540 /* We do not vectorize basic blocks with write-write dependencies. */
541 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
542 return true;
543
544 /* If we have a read-write dependence, check that the load is before the store.
545 When we vectorize basic blocks, a vector load can only be placed before the
546 corresponding scalar load, and a vector store can only be placed after its
547 corresponding scalar store.  So the order of the accesses is preserved in
548 case the load is before the store. */
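/* For example, in a basic block containing 'x = a[0]; a[0] = y;' the load
comes first, so the pair does not prevent SLP vectorization; if the store
came first, as in 'a[0] = y; x = a[0];', we would have to give up.  */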
549 gimple *earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
550 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
551 {
552 /* That only holds for load-store pairs taking part in vectorization. */
553 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dra)))
554 && STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (drb))))
555 return false;
556 }
557
558 return true;
559 }
560
561
562 /* Analyze dependences involved in the transform of SLP NODE. */
563
564 static bool
565 vect_slp_analyze_node_dependences (slp_instance instance, slp_tree node)
566 {
567 /* This walks over all stmts involved in the SLP load/store done
568 in NODE verifying we can sink them up to the last stmt in the
569 group. */
570 gimple *last_access = vect_find_last_scalar_stmt_in_slp (node);
571 for (unsigned k = 0; k < SLP_INSTANCE_GROUP_SIZE (instance); ++k)
572 {
573 gimple *access = SLP_TREE_SCALAR_STMTS (node)[k];
574 if (access == last_access)
575 continue;
576 stmt_vec_info access_stmt_info = vinfo_for_stmt (access);
577 gimple_stmt_iterator gsi = gsi_for_stmt (access);
578 gsi_next (&gsi);
579 for (; gsi_stmt (gsi) != last_access; gsi_next (&gsi))
580 {
581 gimple *stmt = gsi_stmt (gsi);
582 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
583 if (!STMT_VINFO_DATA_REF (stmt_info)
584 || (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
585 && DR_IS_READ (STMT_VINFO_DATA_REF (access_stmt_info))))
586 continue;
587
588 ddr_p ddr = initialize_data_dependence_relation
589 (STMT_VINFO_DATA_REF (access_stmt_info),
590 STMT_VINFO_DATA_REF (stmt_info), vNULL);
591 if (vect_slp_analyze_data_ref_dependence (ddr))
592 {
593 /* ??? If the dependence analysis failed we can resort to the
594 alias oracle which can handle more kinds of stmts. */
595 free_dependence_relation (ddr);
596 return false;
597 }
598 free_dependence_relation (ddr);
599 }
600 }
601 return true;
602 }
603
604
605 /* Function vect_slp_analyze_data_ref_dependences.
606
607 Examine all the data references in the basic-block, and make sure there
608 do not exist any data dependences between them that would prevent
609 vectorization of the SLP instances. */
610
611 bool
612 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
613 {
614 if (dump_enabled_p ())
615 dump_printf_loc (MSG_NOTE, vect_location,
616 "=== vect_slp_analyze_data_ref_dependences ===\n");
617
618 slp_instance instance;
619 slp_tree load;
620 unsigned int i, j;
621 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
622 {
623 bool remove = false;
624 /* Verify we can sink loads to the vectorized stmt insert location. */
625 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, load)
626 if (! vect_slp_analyze_node_dependences (instance, load))
627 {
628 remove = true;
629 break;
630 }
631 /* Verify we can sink stores to the vectorized stmt insert location. */
632 slp_tree store = SLP_INSTANCE_TREE (instance);
633 if (!remove
634 && STMT_VINFO_DATA_REF
635 (vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (store)[0]))
636 && ! vect_slp_analyze_node_dependences (instance, store))
637 remove = true;
638 if (remove)
639 {
640 dump_printf_loc (MSG_NOTE, vect_location,
641 "removing SLP instance operations starting from: ");
642 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
643 SLP_TREE_SCALAR_STMTS
644 (SLP_INSTANCE_TREE (instance))[0], 0);
645 vect_free_slp_instance (instance);
646 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
647 }
648 i++;
649 }
650
651 if (!BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
652 return false;
653
654 return true;
655 }
656
657
658 /* Function vect_compute_data_ref_alignment
659
660 Compute the misalignment of the data reference DR.
661
662 Output:
663 1. If during the misalignment computation it is found that the data reference
664 cannot be vectorized then false is returned.
665 2. DR_MISALIGNMENT (DR) is defined.
666
667 FOR NOW: No analysis is actually performed. Misalignment is calculated
668 only for trivial cases. TODO. */
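/* For example, for a V4SI access to &a[4], where 'a' is a 16-byte aligned
array of 32-bit ints, DR_MISALIGNMENT is set to 0; for an access to &a[1]
it is set to 4 (bytes); if the base alignment cannot be determined it is
left as -1 (unknown).  */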
669
670 static bool
671 vect_compute_data_ref_alignment (struct data_reference *dr)
672 {
673 gimple *stmt = DR_STMT (dr);
674 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
675 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
676 struct loop *loop = NULL;
677 tree ref = DR_REF (dr);
678 tree vectype;
679 tree base, base_addr;
680 tree misalign = NULL_TREE;
681 tree aligned_to;
682 unsigned HOST_WIDE_INT alignment;
683
684 if (dump_enabled_p ())
685 dump_printf_loc (MSG_NOTE, vect_location,
686 "vect_compute_data_ref_alignment:\n");
687
688 if (loop_vinfo)
689 loop = LOOP_VINFO_LOOP (loop_vinfo);
690
691 /* Initialize misalignment to unknown. */
692 SET_DR_MISALIGNMENT (dr, -1);
693
694 if (tree_fits_shwi_p (DR_STEP (dr)))
695 misalign = DR_INIT (dr);
696 aligned_to = DR_ALIGNED_TO (dr);
697 base_addr = DR_BASE_ADDRESS (dr);
698 vectype = STMT_VINFO_VECTYPE (stmt_info);
699
700 /* In case the dataref is in an inner-loop of the loop that is being
701 vectorized (LOOP), we use the base and misalignment information
702 relative to the outer-loop (LOOP). This is ok only if the misalignment
703 stays the same throughout the execution of the inner-loop, which is why
704 we have to check that the stride of the dataref in the inner-loop is a
705 multiple of the vector size. */
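/* E.g. with 16-byte vectors, an inner-loop step of 32 bytes keeps the
misalignment constant across inner-loop iterations, whereas a step of
8 bytes does not, and the misalignment must then be treated as unknown.  */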
706 if (loop && nested_in_vect_loop_p (loop, stmt))
707 {
708 tree step = DR_STEP (dr);
709
710 if (tree_fits_shwi_p (step)
711 && tree_to_shwi (step) % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
712 {
713 if (dump_enabled_p ())
714 dump_printf_loc (MSG_NOTE, vect_location,
715 "inner step divides the vector-size.\n");
716 misalign = STMT_VINFO_DR_INIT (stmt_info);
717 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
718 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
719 }
720 else
721 {
722 if (dump_enabled_p ())
723 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
724 "inner step doesn't divide the vector-size.\n");
725 misalign = NULL_TREE;
726 }
727 }
728
729 /* Similarly we can only use base and misalignment information relative to
730 an innermost loop if the misalignment stays the same throughout the
731 execution of the loop. As above, this is the case if the stride of
732 the dataref is a multiple of the vector size. */
733 else
734 {
735 tree step = DR_STEP (dr);
736 unsigned vf = loop ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) : 1;
737
738 if (tree_fits_shwi_p (step)
739 && ((tree_to_shwi (step) * vf)
740 % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0))
741 {
742 if (dump_enabled_p ())
743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
744 "step doesn't divide the vector-size.\n");
745 misalign = NULL_TREE;
746 }
747 }
748
749 /* To look at alignment of the base we have to preserve an inner MEM_REF
750 as that carries alignment information of the actual access. */
751 base = ref;
752 while (handled_component_p (base))
753 base = TREE_OPERAND (base, 0);
754 if (TREE_CODE (base) == MEM_REF)
755 base = build2 (MEM_REF, TREE_TYPE (base), base_addr,
756 build_int_cst (TREE_TYPE (TREE_OPERAND (base, 1)), 0));
757 unsigned int base_alignment = get_object_alignment (base);
758
759 if (base_alignment >= TYPE_ALIGN (TREE_TYPE (vectype)))
760 DR_VECT_AUX (dr)->base_element_aligned = true;
761
762 alignment = TYPE_ALIGN_UNIT (vectype);
763
764 if ((compare_tree_int (aligned_to, alignment) < 0)
765 || !misalign)
766 {
767 if (dump_enabled_p ())
768 {
769 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
770 "Unknown alignment for access: ");
771 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
772 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
773 }
774 return true;
775 }
776
777 if (base_alignment < TYPE_ALIGN (vectype))
778 {
779 /* Strip an inner MEM_REF to a bare decl if possible. */
780 if (TREE_CODE (base) == MEM_REF
781 && integer_zerop (TREE_OPERAND (base, 1))
782 && TREE_CODE (TREE_OPERAND (base, 0)) == ADDR_EXPR)
783 base = TREE_OPERAND (TREE_OPERAND (base, 0), 0);
784
785 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype)))
786 {
787 if (dump_enabled_p ())
788 {
789 dump_printf_loc (MSG_NOTE, vect_location,
790 "can't force alignment of ref: ");
791 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
792 dump_printf (MSG_NOTE, "\n");
793 }
794 return true;
795 }
796
797 /* Force the alignment of the decl.
798 NOTE: This is the only change to the code we make during
799 the analysis phase, before deciding to vectorize the loop. */
800 if (dump_enabled_p ())
801 {
802 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
803 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
804 dump_printf (MSG_NOTE, "\n");
805 }
806
807 DR_VECT_AUX (dr)->base_decl = base;
808 DR_VECT_AUX (dr)->base_misaligned = true;
809 DR_VECT_AUX (dr)->base_element_aligned = true;
810 }
811
812 /* If this is a backward running DR then first access in the larger
813 vectype actually is N-1 elements before the address in the DR.
814 Adjust misalign accordingly. */
815 if (tree_int_cst_sgn (DR_STEP (dr)) < 0)
816 {
817 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
818 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
819 otherwise we wouldn't be here. */
820 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
821 /* PLUS because DR_STEP was negative. */
822 misalign = size_binop (PLUS_EXPR, misalign, offset);
823 }
824
825 SET_DR_MISALIGNMENT (dr,
826 wi::mod_floor (misalign, alignment, SIGNED).to_uhwi ());
827
828 if (dump_enabled_p ())
829 {
830 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
831 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
832 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
833 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
834 }
835
836 return true;
837 }
838
839
840 /* Function vect_compute_data_refs_alignment
841
842 Compute the misalignment of data references in the loop.
843 Return FALSE if a data reference is found that cannot be vectorized. */
844
845 static bool
846 vect_compute_data_refs_alignment (vec_info *vinfo)
847 {
848 vec<data_reference_p> datarefs = vinfo->datarefs;
849 struct data_reference *dr;
850 unsigned int i;
851
852 FOR_EACH_VEC_ELT (datarefs, i, dr)
853 {
854 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
855 if (STMT_VINFO_VECTORIZABLE (stmt_info)
856 && !vect_compute_data_ref_alignment (dr))
857 {
858 /* Strided accesses perform only component accesses, misalignment
859 information is irrelevant for them. */
860 if (STMT_VINFO_STRIDED_P (stmt_info)
861 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
862 continue;
863
864 if (is_a <bb_vec_info> (vinfo))
865 {
866 /* Mark unsupported statement as unvectorizable. */
867 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
868 continue;
869 }
870 else
871 return false;
872 }
873 }
874
875 return true;
876 }
877
878
879 /* Function vect_update_misalignment_for_peel
880
881 DR - the data reference whose misalignment is to be adjusted.
882 DR_PEEL - the data reference whose misalignment is being made
883 zero in the vector loop by the peel.
884 NPEEL - the number of iterations in the peel loop if the misalignment
885 of DR_PEEL is known at compile time. */
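/* For example, with 4-byte elements, NPEEL == 3, a 16-byte vector
alignment and a known misalignment of 4 bytes, the updated misalignment
is (4 + 3 * 4) & 15 == 0; if either misalignment is unknown, DR's
misalignment is reset to -1 (unknown).  */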
886
887 static void
888 vect_update_misalignment_for_peel (struct data_reference *dr,
889 struct data_reference *dr_peel, int npeel)
890 {
891 unsigned int i;
892 vec<dr_p> same_align_drs;
893 struct data_reference *current_dr;
894 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
895 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
896 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
897 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
898
899 /* For interleaved data accesses the step in the loop must be multiplied by
900 the size of the interleaving group. */
901 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
902 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
903 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
904 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
905
906 /* It can be assumed that the data refs with the same alignment as dr_peel
907 are aligned in the vector loop. */
908 same_align_drs
909 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
910 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
911 {
912 if (current_dr != dr)
913 continue;
914 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
915 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
916 SET_DR_MISALIGNMENT (dr, 0);
917 return;
918 }
919
920 if (known_alignment_for_access_p (dr)
921 && known_alignment_for_access_p (dr_peel))
922 {
923 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
924 int misal = DR_MISALIGNMENT (dr);
925 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
926 misal += negative ? -npeel * dr_size : npeel * dr_size;
927 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
928 SET_DR_MISALIGNMENT (dr, misal);
929 return;
930 }
931
932 if (dump_enabled_p ())
933 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.\n");
934 SET_DR_MISALIGNMENT (dr, -1);
935 }
936
937
938 /* Function vect_verify_datarefs_alignment
939
940 Return TRUE if all data references in the loop can be
941 handled with respect to alignment. */
942
943 bool
944 vect_verify_datarefs_alignment (vec_info *vinfo)
945 {
946 vec<data_reference_p> datarefs = vinfo->datarefs;
947 struct data_reference *dr;
948 enum dr_alignment_support supportable_dr_alignment;
949 unsigned int i;
950
951 FOR_EACH_VEC_ELT (datarefs, i, dr)
952 {
953 gimple *stmt = DR_STMT (dr);
954 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
955
956 if (!STMT_VINFO_RELEVANT_P (stmt_info))
957 continue;
958
959 /* For interleaving, only the alignment of the first access matters.
960 Skip statements marked as not vectorizable. */
961 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
962 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
963 || !STMT_VINFO_VECTORIZABLE (stmt_info))
964 continue;
965
966 /* Strided accesses perform only component accesses, alignment is
967 irrelevant for them. */
968 if (STMT_VINFO_STRIDED_P (stmt_info)
969 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
970 continue;
971
972 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
973 if (!supportable_dr_alignment)
974 {
975 if (dump_enabled_p ())
976 {
977 if (DR_IS_READ (dr))
978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
979 "not vectorized: unsupported unaligned load.");
980 else
981 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
982 "not vectorized: unsupported unaligned "
983 "store.");
984
985 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
986 DR_REF (dr));
987 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
988 }
989 return false;
990 }
991 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
992 dump_printf_loc (MSG_NOTE, vect_location,
993 "Vectorizing an unaligned access.\n");
994 }
995 return true;
996 }
997
998 /* Given a memory reference EXP, return whether its alignment is less
999 than its size. */
1000
1001 static bool
1002 not_size_aligned (tree exp)
1003 {
1004 if (!tree_fits_uhwi_p (TYPE_SIZE (TREE_TYPE (exp))))
1005 return true;
1006
1007 return (tree_to_uhwi (TYPE_SIZE (TREE_TYPE (exp)))
1008 > get_object_alignment (exp));
1009 }
1010
1011 /* Function vector_alignment_reachable_p
1012
1013 Return true if vector alignment for DR is reachable by peeling
1014 a few loop iterations. Return false otherwise. */
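/* For example, with a 16-byte V4SI vector, a known misalignment of 8 bytes
is reachable by peeling two scalar iterations, whereas a misalignment of
6 bytes is not, since 6 is not a multiple of the 4-byte element size.  */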
1015
1016 static bool
1017 vector_alignment_reachable_p (struct data_reference *dr)
1018 {
1019 gimple *stmt = DR_STMT (dr);
1020 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1021 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1022
1023 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1024 {
1025 /* For interleaved accesses we peel only if the number of iterations in
1026 the prolog loop (VF - misalignment) is a multiple of the
1027 number of interleaved accesses. */
1028 int elem_size, mis_in_elements;
1029 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1030
1031 /* FORNOW: handle only known alignment. */
1032 if (!known_alignment_for_access_p (dr))
1033 return false;
1034
1035 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1036 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1037
1038 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1039 return false;
1040 }
1041
1042 /* If the misalignment is known at compile time then allow peeling
1043 only if natural alignment is reachable through peeling. */
1044 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1045 {
1046 HOST_WIDE_INT elmsize =
1047 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1048 if (dump_enabled_p ())
1049 {
1050 dump_printf_loc (MSG_NOTE, vect_location,
1051 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1052 dump_printf (MSG_NOTE,
1053 ". misalignment = %d.\n", DR_MISALIGNMENT (dr));
1054 }
1055 if (DR_MISALIGNMENT (dr) % elmsize)
1056 {
1057 if (dump_enabled_p ())
1058 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1059 "data size does not divide the misalignment.\n");
1060 return false;
1061 }
1062 }
1063
1064 if (!known_alignment_for_access_p (dr))
1065 {
1066 tree type = TREE_TYPE (DR_REF (dr));
1067 bool is_packed = not_size_aligned (DR_REF (dr));
1068 if (dump_enabled_p ())
1069 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1070 "Unknown misalignment, is_packed = %d\n",is_packed);
1071 if ((TYPE_USER_ALIGN (type) && !is_packed)
1072 || targetm.vectorize.vector_alignment_reachable (type, is_packed))
1073 return true;
1074 else
1075 return false;
1076 }
1077
1078 return true;
1079 }
1080
1081
1082 /* Calculate the cost of the memory access represented by DR. */
1083
1084 static void
1085 vect_get_data_access_cost (struct data_reference *dr,
1086 unsigned int *inside_cost,
1087 unsigned int *outside_cost,
1088 stmt_vector_for_cost *body_cost_vec)
1089 {
1090 gimple *stmt = DR_STMT (dr);
1091 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1092 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1093 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1094 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1095 int ncopies = vf / nunits;
1096
1097 if (DR_IS_READ (dr))
1098 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1099 NULL, body_cost_vec, false);
1100 else
1101 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1102
1103 if (dump_enabled_p ())
1104 dump_printf_loc (MSG_NOTE, vect_location,
1105 "vect_get_data_access_cost: inside_cost = %d, "
1106 "outside_cost = %d.\n", *inside_cost, *outside_cost);
1107 }
1108
1109
1110 typedef struct _vect_peel_info
1111 {
1112 int npeel;
1113 struct data_reference *dr;
1114 unsigned int count;
1115 } *vect_peel_info;
1116
1117 typedef struct _vect_peel_extended_info
1118 {
1119 struct _vect_peel_info peel_info;
1120 unsigned int inside_cost;
1121 unsigned int outside_cost;
1122 stmt_vector_for_cost body_cost_vec;
1123 } *vect_peel_extended_info;
1124
1125
1126 /* Peeling hashtable helpers. */
1127
1128 struct peel_info_hasher : free_ptr_hash <_vect_peel_info>
1129 {
1130 static inline hashval_t hash (const _vect_peel_info *);
1131 static inline bool equal (const _vect_peel_info *, const _vect_peel_info *);
1132 };
1133
1134 inline hashval_t
1135 peel_info_hasher::hash (const _vect_peel_info *peel_info)
1136 {
1137 return (hashval_t) peel_info->npeel;
1138 }
1139
1140 inline bool
1141 peel_info_hasher::equal (const _vect_peel_info *a, const _vect_peel_info *b)
1142 {
1143 return (a->npeel == b->npeel);
1144 }
1145
1146
1147 /* Insert DR into peeling hash table with NPEEL as key. */
1148
1149 static void
1150 vect_peeling_hash_insert (hash_table<peel_info_hasher> *peeling_htab,
1151 loop_vec_info loop_vinfo, struct data_reference *dr,
1152 int npeel)
1153 {
1154 struct _vect_peel_info elem, *slot;
1155 _vect_peel_info **new_slot;
1156 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1157
1158 elem.npeel = npeel;
1159 slot = peeling_htab->find (&elem);
1160 if (slot)
1161 slot->count++;
1162 else
1163 {
1164 slot = XNEW (struct _vect_peel_info);
1165 slot->npeel = npeel;
1166 slot->dr = dr;
1167 slot->count = 1;
1168 new_slot = peeling_htab->find_slot (slot, INSERT);
1169 *new_slot = slot;
1170 }
1171
1172 if (!supportable_dr_alignment
1173 && unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1174 slot->count += VECT_MAX_COST;
1175 }
1176
1177
1178 /* Traverse peeling hash table to find peeling option that aligns maximum
1179 number of data accesses. */
1180
1181 int
1182 vect_peeling_hash_get_most_frequent (_vect_peel_info **slot,
1183 _vect_peel_extended_info *max)
1184 {
1185 vect_peel_info elem = *slot;
1186
1187 if (elem->count > max->peel_info.count
1188 || (elem->count == max->peel_info.count
1189 && max->peel_info.npeel > elem->npeel))
1190 {
1191 max->peel_info.npeel = elem->npeel;
1192 max->peel_info.count = elem->count;
1193 max->peel_info.dr = elem->dr;
1194 }
1195
1196 return 1;
1197 }
1198
1199
1200 /* Traverse peeling hash table and calculate cost for each peeling option.
1201 Find the one with the lowest cost. */
1202
1203 int
1204 vect_peeling_hash_get_lowest_cost (_vect_peel_info **slot,
1205 _vect_peel_extended_info *min)
1206 {
1207 vect_peel_info elem = *slot;
1208 int save_misalignment, dummy;
1209 unsigned int inside_cost = 0, outside_cost = 0, i;
1210 gimple *stmt = DR_STMT (elem->dr);
1211 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1212 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1213 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1214 struct data_reference *dr;
1215 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1216
1217 prologue_cost_vec.create (2);
1218 body_cost_vec.create (2);
1219 epilogue_cost_vec.create (2);
1220
1221 FOR_EACH_VEC_ELT (datarefs, i, dr)
1222 {
1223 stmt = DR_STMT (dr);
1224 stmt_info = vinfo_for_stmt (stmt);
1225 /* For interleaving, only the alignment of the first access
1226 matters. */
1227 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1228 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1229 continue;
1230
1231 save_misalignment = DR_MISALIGNMENT (dr);
1232 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1233 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1234 &body_cost_vec);
1235 SET_DR_MISALIGNMENT (dr, save_misalignment);
1236 }
1237
1238 outside_cost += vect_get_known_peeling_cost
1239 (loop_vinfo, elem->npeel, &dummy,
1240 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1241 &prologue_cost_vec, &epilogue_cost_vec);
1242
1243 /* Prologue and epilogue costs are added to the target model later.
1244 These costs depend only on the scalar iteration cost, the
1245 number of peeling iterations finally chosen, and the number of
1246 misaligned statements. So discard the information found here. */
1247 prologue_cost_vec.release ();
1248 epilogue_cost_vec.release ();
1249
1250 if (inside_cost < min->inside_cost
1251 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1252 {
1253 min->inside_cost = inside_cost;
1254 min->outside_cost = outside_cost;
1255 min->body_cost_vec.release ();
1256 min->body_cost_vec = body_cost_vec;
1257 min->peel_info.dr = elem->dr;
1258 min->peel_info.npeel = elem->npeel;
1259 }
1260 else
1261 body_cost_vec.release ();
1262
1263 return 1;
1264 }
1265
1266
1267 /* Choose best peeling option by traversing peeling hash table and either
1268 choosing an option with the lowest cost (if cost model is enabled) or the
1269 option that aligns as many accesses as possible. */
1270
1271 static struct data_reference *
1272 vect_peeling_hash_choose_best_peeling (hash_table<peel_info_hasher> *peeling_htab,
1273 loop_vec_info loop_vinfo,
1274 unsigned int *npeel,
1275 stmt_vector_for_cost *body_cost_vec)
1276 {
1277 struct _vect_peel_extended_info res;
1278
1279 res.peel_info.dr = NULL;
1280 res.body_cost_vec = stmt_vector_for_cost ();
1281
1282 if (!unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1283 {
1284 res.inside_cost = INT_MAX;
1285 res.outside_cost = INT_MAX;
1286 peeling_htab->traverse <_vect_peel_extended_info *,
1287 vect_peeling_hash_get_lowest_cost> (&res);
1288 }
1289 else
1290 {
1291 res.peel_info.count = 0;
1292 peeling_htab->traverse <_vect_peel_extended_info *,
1293 vect_peeling_hash_get_most_frequent> (&res);
1294 }
1295
1296 *npeel = res.peel_info.npeel;
1297 *body_cost_vec = res.body_cost_vec;
1298 return res.peel_info.dr;
1299 }
1300
1301
1302 /* Function vect_enhance_data_refs_alignment
1303
1304 This pass will use loop versioning and loop peeling in order to enhance
1305 the alignment of data references in the loop.
1306
1307 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1308 original loop is to be vectorized. Any other loops that are created by
1309 the transformations performed in this pass are not supposed to be
1310 vectorized. This restriction will be relaxed.
1311
1312 This pass will require a cost model to guide it whether to apply peeling
1313 or versioning or a combination of the two. For example, the scheme that
1314 Intel uses when given a loop with several memory accesses, is as follows:
1315 choose one memory access ('p') whose alignment you want to force by doing
1316 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1317 other accesses are not necessarily aligned, or (2) use loop versioning to
1318 generate one loop in which all accesses are aligned, and another loop in
1319 which only 'p' is necessarily aligned.
1320
1321 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1322 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1323 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1324
1325 Devising a cost model is the most critical aspect of this work. It will
1326 guide us on which access to peel for, whether to use loop versioning, how
1327 many versions to create, etc. The cost model will probably consist of
1328 generic considerations as well as target specific considerations (on
1329 powerpc for example, misaligned stores are more painful than misaligned
1330 loads).
1331
1332 Here are the general steps involved in alignment enhancements:
1333
1334 -- original loop, before alignment analysis:
1335 for (i=0; i<N; i++){
1336 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1337 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1338 }
1339
1340 -- After vect_compute_data_refs_alignment:
1341 for (i=0; i<N; i++){
1342 x = q[i]; # DR_MISALIGNMENT(q) = 3
1343 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1344 }
1345
1346 -- Possibility 1: we do loop versioning:
1347 if (p is aligned) {
1348 for (i=0; i<N; i++){ # loop 1A
1349 x = q[i]; # DR_MISALIGNMENT(q) = 3
1350 p[i] = y; # DR_MISALIGNMENT(p) = 0
1351 }
1352 }
1353 else {
1354 for (i=0; i<N; i++){ # loop 1B
1355 x = q[i]; # DR_MISALIGNMENT(q) = 3
1356 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1357 }
1358 }
1359
1360 -- Possibility 2: we do loop peeling:
1361 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1362 x = q[i];
1363 p[i] = y;
1364 }
1365 for (i = 3; i < N; i++){ # loop 2A
1366 x = q[i]; # DR_MISALIGNMENT(q) = 0
1367 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1368 }
1369
1370 -- Possibility 3: combination of loop peeling and versioning:
1371 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1372 x = q[i];
1373 p[i] = y;
1374 }
1375 if (p is aligned) {
1376 for (i = 3; i<N; i++){ # loop 3A
1377 x = q[i]; # DR_MISALIGNMENT(q) = 0
1378 p[i] = y; # DR_MISALIGNMENT(p) = 0
1379 }
1380 }
1381 else {
1382 for (i = 3; i<N; i++){ # loop 3B
1383 x = q[i]; # DR_MISALIGNMENT(q) = 0
1384 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1385 }
1386 }
1387
1388 These loops are later passed to loop_transform to be vectorized. The
1389 vectorizer will use the alignment information to guide the transformation
1390 (whether to generate regular loads/stores, or with special handling for
1391 misalignment). */
1392
1393 bool
1394 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1395 {
1396 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1397 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1398 enum dr_alignment_support supportable_dr_alignment;
1399 struct data_reference *dr0 = NULL, *first_store = NULL;
1400 struct data_reference *dr;
1401 unsigned int i, j;
1402 bool do_peeling = false;
1403 bool do_versioning = false;
1404 bool stat;
1405 gimple *stmt;
1406 stmt_vec_info stmt_info;
1407 unsigned int npeel = 0;
1408 bool all_misalignments_unknown = true;
1409 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1410 unsigned possible_npeel_number = 1;
1411 tree vectype;
1412 unsigned int nelements, mis, same_align_drs_max = 0;
1413 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost ();
1414 hash_table<peel_info_hasher> peeling_htab (1);
1415
1416 if (dump_enabled_p ())
1417 dump_printf_loc (MSG_NOTE, vect_location,
1418 "=== vect_enhance_data_refs_alignment ===\n");
1419
1420 /* Reset data so we can safely be called multiple times. */
1421 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1422 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = 0;
1423
1424 /* While cost model enhancements are expected in the future, the high level
1425 view of the code at this time is as follows:
1426
1427 A) If there is a misaligned access then see if peeling to align
1428 this access can make all data references satisfy
1429 vect_supportable_dr_alignment. If so, update data structures
1430 as needed and return true.
1431
1432 B) If peeling wasn't possible and there is a data reference with an
1433 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1434 then see if loop versioning checks can be used to make all data
1435 references satisfy vect_supportable_dr_alignment. If so, update
1436 data structures as needed and return true.
1437
1438 C) If neither peeling nor versioning were successful then return false if
1439 any data reference does not satisfy vect_supportable_dr_alignment.
1440
1441 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1442
1443 Note, Possibility 3 above (which is peeling and versioning together) is not
1444 being done at this time. */
1445
1446 /* (1) Peeling to force alignment. */
1447
1448 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1449 Considerations:
1450 + How many accesses will become aligned due to the peeling
1451 - How many accesses will become unaligned due to the peeling,
1452 and the cost of misaligned accesses.
1453 - The cost of peeling (the extra runtime checks, the increase
1454 in code size). */
1455
1456 FOR_EACH_VEC_ELT (datarefs, i, dr)
1457 {
1458 stmt = DR_STMT (dr);
1459 stmt_info = vinfo_for_stmt (stmt);
1460
1461 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1462 continue;
1463
1464 /* For interleaving, only the alignment of the first access
1465 matters. */
1466 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1467 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1468 continue;
1469
1470 /* For invariant accesses there is nothing to enhance. */
1471 if (integer_zerop (DR_STEP (dr)))
1472 continue;
1473
1474 /* Strided accesses perform only component accesses, alignment is
1475 irrelevant for them. */
1476 if (STMT_VINFO_STRIDED_P (stmt_info)
1477 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1478 continue;
1479
1480 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1481 do_peeling = vector_alignment_reachable_p (dr);
1482 if (do_peeling)
1483 {
1484 if (known_alignment_for_access_p (dr))
1485 {
1486 unsigned int npeel_tmp;
1487 bool negative = tree_int_cst_compare (DR_STEP (dr),
1488 size_zero_node) < 0;
1489
1490 /* Save info about DR in the hash table. */
1491 vectype = STMT_VINFO_VECTYPE (stmt_info);
1492 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1493 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1494 TREE_TYPE (DR_REF (dr))));
1495 npeel_tmp = (negative
1496 ? (mis - nelements) : (nelements - mis))
1497 & (nelements - 1);
1498
1499 /* For multiple types, it is possible that the bigger type access
1500 will have more than one peeling option.  E.g., a loop with two
1501 types: one of size (vector size / 4), and the other one of
1502 size (vector size / 8).  The vectorization factor will be 8.  If both
1503 accesses are misaligned by 3, the first one needs one scalar
1504 iteration to be aligned, and the second one needs 5.  But the
1505 first one will also be aligned by peeling 5 scalar
1506 iterations, and in that case both accesses will be aligned.
1507 Hence, except for the immediate peeling amount, we also want
1508 to try to add a full vector size, while we don't exceed the
1509 vectorization factor.
1510 We do this automatically for the cost model, since we calculate the cost
1511 for every peeling option. */
1512 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1513 {
1514 if (STMT_SLP_TYPE (stmt_info))
1515 possible_npeel_number
1516 = (vf * GROUP_SIZE (stmt_info)) / nelements;
1517 else
1518 possible_npeel_number = vf / nelements;
1519 }
1520
1521 /* Handle the aligned case. We may decide to align some other
1522 access, making DR unaligned. */
1523 if (DR_MISALIGNMENT (dr) == 0)
1524 {
1525 npeel_tmp = 0;
1526 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
1527 possible_npeel_number++;
1528 }
1529
1530 for (j = 0; j < possible_npeel_number; j++)
1531 {
1532 vect_peeling_hash_insert (&peeling_htab, loop_vinfo,
1533 dr, npeel_tmp);
1534 npeel_tmp += nelements;
1535 }
1536
1537 all_misalignments_unknown = false;
1538 /* Data-ref that was chosen for the case that all the
1539 misalignments are unknown is not relevant anymore, since we
1540 have a data-ref with known alignment. */
1541 dr0 = NULL;
1542 }
1543 else
1544 {
1545 /* If we don't know any misalignment values, we prefer
1546 peeling for the data-ref that has the maximum number of data-refs
1547 with the same alignment, unless the target prefers to align
1548 stores over loads. */
1549 if (all_misalignments_unknown)
1550 {
1551 unsigned same_align_drs
1552 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1553 if (!dr0
1554 || same_align_drs_max < same_align_drs)
1555 {
1556 same_align_drs_max = same_align_drs;
1557 dr0 = dr;
1558 }
1559 /* For data-refs with the same number of related
1560 accesses prefer the one where the misalign
1561 computation will be invariant in the outermost loop. */
1562 else if (same_align_drs_max == same_align_drs)
1563 {
1564 struct loop *ivloop0, *ivloop;
1565 ivloop0 = outermost_invariant_loop_for_expr
1566 (loop, DR_BASE_ADDRESS (dr0));
1567 ivloop = outermost_invariant_loop_for_expr
1568 (loop, DR_BASE_ADDRESS (dr));
1569 if ((ivloop && !ivloop0)
1570 || (ivloop && ivloop0
1571 && flow_loop_nested_p (ivloop, ivloop0)))
1572 dr0 = dr;
1573 }
1574
1575 if (!first_store && DR_IS_WRITE (dr))
1576 first_store = dr;
1577 }
1578
1579 /* If there are both known and unknown misaligned accesses in the
1580 loop, we choose peeling amount according to the known
1581 accesses. */
1582 if (!supportable_dr_alignment)
1583 {
1584 dr0 = dr;
1585 if (!first_store && DR_IS_WRITE (dr))
1586 first_store = dr;
1587 }
1588 }
1589 }
1590 else
1591 {
1592 if (!aligned_access_p (dr))
1593 {
1594 if (dump_enabled_p ())
1595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1596 "vector alignment may not be reachable\n");
1597 break;
1598 }
1599 }
1600 }
1601
1602 /* Check if we can possibly peel the loop. */
1603 if (!vect_can_advance_ivs_p (loop_vinfo)
1604 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop))
1605 || loop->inner)
1606 do_peeling = false;
1607
1608 if (do_peeling
1609 && all_misalignments_unknown
1610 && vect_supportable_dr_alignment (dr0, false))
1611 {
1612 /* Check if the target prefers to align stores over loads, i.e., if
1613 misaligned stores are more expensive than misaligned loads (taking
1614 drs with same alignment into account). */
1615 if (first_store && DR_IS_READ (dr0))
1616 {
1617 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1618 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1619 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1620 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1621 stmt_vector_for_cost dummy;
1622 dummy.create (2);
1623
1624 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1625 &dummy);
1626 vect_get_data_access_cost (first_store, &store_inside_cost,
1627 &store_outside_cost, &dummy);
1628
1629 dummy.release ();
1630
1631 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1632 aligning the load DR0). */
1633 load_inside_penalty = store_inside_cost;
1634 load_outside_penalty = store_outside_cost;
1635 for (i = 0;
1636 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1637 DR_STMT (first_store))).iterate (i, &dr);
1638 i++)
1639 if (DR_IS_READ (dr))
1640 {
1641 load_inside_penalty += load_inside_cost;
1642 load_outside_penalty += load_outside_cost;
1643 }
1644 else
1645 {
1646 load_inside_penalty += store_inside_cost;
1647 load_outside_penalty += store_outside_cost;
1648 }
1649
1650 /* Calculate the penalty for leaving DR0 unaligned (by
1651 aligning the FIRST_STORE). */
1652 store_inside_penalty = load_inside_cost;
1653 store_outside_penalty = load_outside_cost;
1654 for (i = 0;
1655 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1656 DR_STMT (dr0))).iterate (i, &dr);
1657 i++)
1658 if (DR_IS_READ (dr))
1659 {
1660 store_inside_penalty += load_inside_cost;
1661 store_outside_penalty += load_outside_cost;
1662 }
1663 else
1664 {
1665 store_inside_penalty += store_inside_cost;
1666 store_outside_penalty += store_outside_cost;
1667 }
1668
1669 if (load_inside_penalty > store_inside_penalty
1670 || (load_inside_penalty == store_inside_penalty
1671 && load_outside_penalty > store_outside_penalty))
1672 dr0 = first_store;
1673 }
1674
1675 /* In case there are only loads with different unknown misalignments, use
1676 peeling only if it may help to align other accesses in the loop or
1677 if it may help improve load bandwidth when we'd end up using
1678 unaligned loads. */
1679 tree dr0_vt = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr0)));
1680 if (!first_store
1681 && !STMT_VINFO_SAME_ALIGN_REFS (
1682 vinfo_for_stmt (DR_STMT (dr0))).length ()
1683 && (vect_supportable_dr_alignment (dr0, false)
1684 != dr_unaligned_supported
1685 || (builtin_vectorization_cost (vector_load, dr0_vt, 0)
1686 == builtin_vectorization_cost (unaligned_load, dr0_vt, -1))))
1687 do_peeling = false;
1688 }
1689
1690 if (do_peeling && !dr0)
1691 {
1692 /* Peeling is possible, but no data access requires alignment in order
1693 to be supported. So we try to choose the best possible peeling. */
1694
1695 /* We should get here only if there are drs with known misalignment. */
1696 gcc_assert (!all_misalignments_unknown);
1697
1698 /* Choose the best peeling from the hash table. */
1699 dr0 = vect_peeling_hash_choose_best_peeling (&peeling_htab,
1700 loop_vinfo, &npeel,
1701 &body_cost_vec);
1702 if (!dr0 || !npeel)
1703 do_peeling = false;
1704 }
1705
1706 if (do_peeling)
1707 {
1708 stmt = DR_STMT (dr0);
1709 stmt_info = vinfo_for_stmt (stmt);
1710 vectype = STMT_VINFO_VECTYPE (stmt_info);
1711 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1712
1713 if (known_alignment_for_access_p (dr0))
1714 {
1715 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1716 size_zero_node) < 0;
1717 if (!npeel)
1718 {
1719 /* Since it's known at compile time, compute the number of
1720 iterations in the peeled loop (the peeling factor) for use in
1721 updating DR_MISALIGNMENT values. The peeling factor is the
1722 vectorization factor minus the misalignment as an element
1723 count. */
1724 mis = DR_MISALIGNMENT (dr0);
1725 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1726 npeel = ((negative ? mis - nelements : nelements - mis)
1727 & (nelements - 1));
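 /* For example (illustrative values): for V4SI with 4-byte elements,
 nelements = 4; a DR_MISALIGNMENT of 8 bytes gives mis = 2 elements
 and, for a positive step, npeel = (4 - 2) & 3 = 2 peeled iterations.  */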
1728 }
1729
1730 /* For interleaved data access every iteration accesses all the
1731 members of the group, therefore we divide the number of iterations
1732 by the group size. */
1733 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1734 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1735 npeel /= GROUP_SIZE (stmt_info);
1736
1737 if (dump_enabled_p ())
1738 dump_printf_loc (MSG_NOTE, vect_location,
1739 "Try peeling by %d\n", npeel);
1740 }
1741
1742 /* Ensure that all data refs can be vectorized after the peel. */
1743 FOR_EACH_VEC_ELT (datarefs, i, dr)
1744 {
1745 int save_misalignment;
1746
1747 if (dr == dr0)
1748 continue;
1749
1750 stmt = DR_STMT (dr);
1751 stmt_info = vinfo_for_stmt (stmt);
1752 /* For interleaving, only the alignment of the first access
1753 matters. */
1754 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1755 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1756 continue;
1757
1758 /* Strided accesses perform only component accesses, alignment is
1759 irrelevant for them. */
1760 if (STMT_VINFO_STRIDED_P (stmt_info)
1761 && !STMT_VINFO_GROUPED_ACCESS (stmt_info))
1762 continue;
1763
1764 save_misalignment = DR_MISALIGNMENT (dr);
1765 vect_update_misalignment_for_peel (dr, dr0, npeel);
1766 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1767 SET_DR_MISALIGNMENT (dr, save_misalignment);
1768
1769 if (!supportable_dr_alignment)
1770 {
1771 do_peeling = false;
1772 break;
1773 }
1774 }
1775
1776 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1777 {
1778 stat = vect_verify_datarefs_alignment (loop_vinfo);
1779 if (!stat)
1780 do_peeling = false;
1781 else
1782 {
1783 body_cost_vec.release ();
1784 return stat;
1785 }
1786 }
1787
1788 /* Cost model #1 - honor --param vect-max-peeling-for-alignment. */
1789 if (do_peeling)
1790 {
1791 unsigned max_allowed_peel
1792 = PARAM_VALUE (PARAM_VECT_MAX_PEELING_FOR_ALIGNMENT);
1793 if (max_allowed_peel != (unsigned)-1)
1794 {
1795 unsigned max_peel = npeel;
1796 if (max_peel == 0)
1797 {
1798 gimple *dr_stmt = DR_STMT (dr0);
1799 stmt_vec_info vinfo = vinfo_for_stmt (dr_stmt);
1800 tree vtype = STMT_VINFO_VECTYPE (vinfo);
1801 max_peel = TYPE_VECTOR_SUBPARTS (vtype) - 1;
1802 }
1803 if (max_peel > max_allowed_peel)
1804 {
1805 do_peeling = false;
1806 if (dump_enabled_p ())
1807 dump_printf_loc (MSG_NOTE, vect_location,
1808 "Disable peeling, max peels reached: %d\n", max_peel);
1809 }
1810 }
1811 }
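 /* For example (illustrative): when NPEEL is zero (peel amount unknown at
 compile time), a V4SI access gives max_peel = 4 - 1 = 3, so peeling is
 disabled whenever --param vect-max-peeling-for-alignment is set below 3.  */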
1812
1813 /* Cost model #2 - if peeling may result in a remaining loop not
1814 iterating enough to be vectorized then do not peel. */
1815 if (do_peeling
1816 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1817 {
1818 unsigned max_peel
1819 = npeel == 0 ? LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1 : npeel;
1820 if (LOOP_VINFO_INT_NITERS (loop_vinfo)
1821 < LOOP_VINFO_VECT_FACTOR (loop_vinfo) + max_peel)
1822 do_peeling = false;
1823 }
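 /* For example (illustrative): with a vectorization factor of 4 and
 npeel = 2, at least 4 + 2 = 6 known iterations are required; with
 fewer, the loop remaining after the peel could not be vectorized.  */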
1824
1825 if (do_peeling)
1826 {
1827 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1828 If the misalignment of DR_i is identical to that of dr0 then set
1829 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1830 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1831 by the peeling factor times the element size of DR_i (MOD the
1832 vectorization factor times the size). Otherwise, the
1833 misalignment of DR_i must be set to unknown. */
1834 FOR_EACH_VEC_ELT (datarefs, i, dr)
1835 if (dr != dr0)
1836 vect_update_misalignment_for_peel (dr, dr0, npeel);
1837
1838 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1839 if (npeel)
1840 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1841 else
1842 LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1843 = DR_MISALIGNMENT (dr0);
1844 SET_DR_MISALIGNMENT (dr0, 0);
1845 if (dump_enabled_p ())
1846 {
1847 dump_printf_loc (MSG_NOTE, vect_location,
1848 "Alignment of access forced using peeling.\n");
1849 dump_printf_loc (MSG_NOTE, vect_location,
1850 "Peeling for alignment will be applied.\n");
1851 }
1852 /* The inside-loop cost will be accounted for in vectorizable_load
1853 and vectorizable_store correctly with adjusted alignments.
1854 Drop the body_cost_vec on the floor here. */
1855 body_cost_vec.release ();
1856
1857 stat = vect_verify_datarefs_alignment (loop_vinfo);
1858 gcc_assert (stat);
1859 return stat;
1860 }
1861 }
1862
1863 body_cost_vec.release ();
1864
1865 /* (2) Versioning to force alignment. */
1866
1867 /* Try versioning if:
1868 1) the loop is optimized for speed, and
1869 2) there is at least one unsupported misaligned data ref with an unknown
1870 misalignment, and
1871 3) all misaligned data refs with a known misalignment are supported, and
1872 4) the number of runtime alignment checks is within reason. */
1873
1874 do_versioning =
1875 optimize_loop_nest_for_speed_p (loop)
1876 && (!loop->inner); /* FORNOW */
1877
1878 if (do_versioning)
1879 {
1880 FOR_EACH_VEC_ELT (datarefs, i, dr)
1881 {
1882 stmt = DR_STMT (dr);
1883 stmt_info = vinfo_for_stmt (stmt);
1884
1885 /* For interleaving, only the alignment of the first access
1886 matters. */
1887 if (aligned_access_p (dr)
1888 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1889 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1890 continue;
1891
1892 if (STMT_VINFO_STRIDED_P (stmt_info))
1893 {
1894 /* Strided loads perform only component accesses, alignment is
1895 irrelevant for them. */
1896 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
1897 continue;
1898 do_versioning = false;
1899 break;
1900 }
1901
1902 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1903
1904 if (!supportable_dr_alignment)
1905 {
1906 gimple *stmt;
1907 int mask;
1908 tree vectype;
1909
1910 if (known_alignment_for_access_p (dr)
1911 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1912 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1913 {
1914 do_versioning = false;
1915 break;
1916 }
1917
1918 stmt = DR_STMT (dr);
1919 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1920 gcc_assert (vectype);
1921
1922 /* The rightmost bits of an aligned address must be zeros.
1923 Construct the mask needed for this test. For example,
1924 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1925 mask must be 15 = 0xf. */
1926 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1927
1928 /* FORNOW: use the same mask to test all potentially unaligned
1929 references in the loop. The vectorizer currently supports
1930 a single vector size, see the reference to
1931 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1932 vectorization factor is computed. */
1933 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1934 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1935 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1936 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1937 DR_STMT (dr));
1938 }
1939 }
1940
1941 /* Versioning requires at least one misaligned data reference. */
1942 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1943 do_versioning = false;
1944 else if (!do_versioning)
1945 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1946 }
1947
1948 if (do_versioning)
1949 {
1950 vec<gimple *> may_misalign_stmts
1951 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1952 gimple *stmt;
1953
1954 /* It can now be assumed that the data references in the statements
1955 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1956 of the loop being vectorized. */
1957 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1958 {
1959 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1960 dr = STMT_VINFO_DATA_REF (stmt_info);
1961 SET_DR_MISALIGNMENT (dr, 0);
1962 if (dump_enabled_p ())
1963 dump_printf_loc (MSG_NOTE, vect_location,
1964 "Alignment of access forced using versioning.\n");
1965 }
1966
1967 if (dump_enabled_p ())
1968 dump_printf_loc (MSG_NOTE, vect_location,
1969 "Versioning for alignment will be applied.\n");
1970
1971 /* Peeling and versioning can't be done together at this time. */
1972 gcc_assert (! (do_peeling && do_versioning));
1973
1974 stat = vect_verify_datarefs_alignment (loop_vinfo);
1975 gcc_assert (stat);
1976 return stat;
1977 }
1978
1979 /* This point is reached if neither peeling nor versioning is being done. */
1980 gcc_assert (! (do_peeling || do_versioning));
1981
1982 stat = vect_verify_datarefs_alignment (loop_vinfo);
1983 return stat;
1984 }
1985
1986
1987 /* Function vect_find_same_alignment_drs.
1988
1989 Update group and alignment relations according to the chosen
1990 vectorization factor. */
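/* For example (illustrative): with a vectorization factor of 4, two int
   references a[i] and a[i+4] have dependence distance 4; 4 % 4 == 0 and
   the access sizes match, so the two references are recorded as having
   the same alignment (pairs with distance 0 qualify trivially).  */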
1991
1992 static void
1993 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1994 loop_vec_info loop_vinfo)
1995 {
1996 unsigned int i;
1997 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1998 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1999 struct data_reference *dra = DDR_A (ddr);
2000 struct data_reference *drb = DDR_B (ddr);
2001 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2002 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2003 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2004 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2005 lambda_vector dist_v;
2006 unsigned int loop_depth;
2007
2008 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2009 return;
2010
2011 if (dra == drb)
2012 return;
2013
2014 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2015 return;
2016
2017 /* Loop-based vectorization and known data dependence. */
2018 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2019 return;
2020
2021 /* Data-dependence analysis reports a distance vector of zero
2022 for data-references that overlap only in the first iteration
2023 but have steps of different sign (see PR45764).
2024 So as a sanity check require equal DR_STEP. */
2025 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2026 return;
2027
2028 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2029 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
2030 {
2031 int dist = dist_v[loop_depth];
2032
2033 if (dump_enabled_p ())
2034 dump_printf_loc (MSG_NOTE, vect_location,
2035 "dependence distance = %d.\n", dist);
2036
2037 /* Same loop iteration. */
2038 if (dist == 0
2039 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2040 {
2041 /* Two references with distance zero have the same alignment. */
2042 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
2043 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
2044 if (dump_enabled_p ())
2045 {
2046 dump_printf_loc (MSG_NOTE, vect_location,
2047 "accesses have the same alignment.\n");
2048 dump_printf (MSG_NOTE,
2049 "dependence distance modulo vf == 0 between ");
2050 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2051 dump_printf (MSG_NOTE, " and ");
2052 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2053 dump_printf (MSG_NOTE, "\n");
2054 }
2055 }
2056 }
2057 }
2058
2059
2060 /* Function vect_analyze_data_refs_alignment
2061
2062 Analyze the alignment of the data-references in the loop.
2063 Return FALSE if a data reference is found that cannot be vectorized. */
2064
2065 bool
2066 vect_analyze_data_refs_alignment (vec_info *vinfo)
2067 {
2068 if (dump_enabled_p ())
2069 dump_printf_loc (MSG_NOTE, vect_location,
2070 "=== vect_analyze_data_refs_alignment ===\n");
2071
2072 /* Mark groups of data references with same alignment using
2073 data dependence information. */
2074 if (is_a <loop_vec_info> (vinfo))
2075 {
2076 vec<ddr_p> ddrs = vinfo->ddrs;
2077 struct data_dependence_relation *ddr;
2078 unsigned int i;
2079
2080 FOR_EACH_VEC_ELT (ddrs, i, ddr)
2081 vect_find_same_alignment_drs (ddr, as_a <loop_vec_info> (vinfo));
2082 }
2083
2084 if (!vect_compute_data_refs_alignment (vinfo))
2085 {
2086 if (dump_enabled_p ())
2087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2088 "not vectorized: can't calculate alignment "
2089 "for data ref.\n");
2090 return false;
2091 }
2092
2093 return true;
2094 }
2095
2096
2097 /* Analyze groups of accesses: check that DR belongs to a group of
2098 accesses of legal size, step, etc. Detect gaps, single element
2099 interleaving, and other special cases. Set grouped access info.
2100 Collect groups of strided stores for further use in SLP analysis.
2101 Worker for vect_analyze_group_access. */
2102
2103 static bool
2104 vect_analyze_group_access_1 (struct data_reference *dr)
2105 {
2106 tree step = DR_STEP (dr);
2107 tree scalar_type = TREE_TYPE (DR_REF (dr));
2108 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2109 gimple *stmt = DR_STMT (dr);
2110 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2111 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2112 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2113 HOST_WIDE_INT dr_step = -1;
2114 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2115 bool slp_impossible = false;
2116 struct loop *loop = NULL;
2117
2118 if (loop_vinfo)
2119 loop = LOOP_VINFO_LOOP (loop_vinfo);
2120
2121 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2122 size of the interleaving group (including gaps). */
2123 if (tree_fits_shwi_p (step))
2124 {
2125 dr_step = tree_to_shwi (step);
2126 groupsize = absu_hwi (dr_step) / type_size;
2127 }
2128 else
2129 groupsize = 0;
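 /* For example (illustrative): 4-byte elements with DR_STEP 32 give
 groupsize = 32 / 4 = 8 elements (including any gaps); a non-constant
 step leaves groupsize at 0.  */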
2130
2131 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2132 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2133 {
2134 /* Check if this DR is a part of interleaving, and is a single
2135 element of the group that is accessed in the loop. */
2136
2137 /* Gaps are supported only for loads. STEP must be a multiple of the type
2138 size. The size of the group must be a power of 2. */
2139 if (DR_IS_READ (dr)
2140 && (dr_step % type_size) == 0
2141 && groupsize > 0
2142 && exact_log2 (groupsize) != -1)
2143 {
2144 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2145 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2146 if (dump_enabled_p ())
2147 {
2148 dump_printf_loc (MSG_NOTE, vect_location,
2149 "Detected single element interleaving ");
2150 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2151 dump_printf (MSG_NOTE, " step ");
2152 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2153 dump_printf (MSG_NOTE, "\n");
2154 }
2155
2156 if (loop_vinfo)
2157 {
2158 if (dump_enabled_p ())
2159 dump_printf_loc (MSG_NOTE, vect_location,
2160 "Data access with gaps requires scalar "
2161 "epilogue loop\n");
2162 if (loop->inner)
2163 {
2164 if (dump_enabled_p ())
2165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2166 "Peeling for outer loop is not"
2167 " supported\n");
2168 return false;
2169 }
2170
2171 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2172 }
2173
2174 return true;
2175 }
2176
2177 if (dump_enabled_p ())
2178 {
2179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2180 "not consecutive access ");
2181 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2182 }
2183
2184 if (bb_vinfo)
2185 {
2186 /* Mark the statement as unvectorizable. */
2187 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2188 return true;
2189 }
2190
2191 dump_printf_loc (MSG_NOTE, vect_location, "using strided accesses\n");
2192 STMT_VINFO_STRIDED_P (stmt_info) = true;
2193 return true;
2194 }
2195
2196 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2197 {
2198 /* First stmt in the interleaving chain. Check the chain. */
2199 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2200 struct data_reference *data_ref = dr;
2201 unsigned int count = 1;
2202 tree prev_init = DR_INIT (data_ref);
2203 gimple *prev = stmt;
2204 HOST_WIDE_INT diff, gaps = 0;
2205
2206 while (next)
2207 {
2208 /* Skip same data-refs. In case two or more stmts share a
2209 data-ref (supported only for loads), we vectorize only the first
2210 stmt, and the rest get their vectorized loads from the first
2211 one. */
2212 if (!tree_int_cst_compare (DR_INIT (data_ref),
2213 DR_INIT (STMT_VINFO_DATA_REF (
2214 vinfo_for_stmt (next)))))
2215 {
2216 if (DR_IS_WRITE (data_ref))
2217 {
2218 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2220 "Two store stmts share the same dr.\n");
2221 return false;
2222 }
2223
2224 if (dump_enabled_p ())
2225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2226 "Two or more load stmts share the same dr.\n");
2227
2228 /* For load use the same data-ref load. */
2229 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2230
2231 prev = next;
2232 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2233 continue;
2234 }
2235
2236 prev = next;
2237 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2238
2239 /* All group members have the same STEP by construction. */
2240 gcc_checking_assert (operand_equal_p (DR_STEP (data_ref), step, 0));
2241
2242 /* Check that the distance between two accesses is equal to the type
2243 size. Otherwise, we have gaps. */
2244 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2245 - TREE_INT_CST_LOW (prev_init)) / type_size;
2246 if (diff != 1)
2247 {
2248 /* FORNOW: SLP of accesses with gaps is not supported. */
2249 slp_impossible = true;
2250 if (DR_IS_WRITE (data_ref))
2251 {
2252 if (dump_enabled_p ())
2253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2254 "interleaved store with gaps\n");
2255 return false;
2256 }
2257
2258 gaps += diff - 1;
2259 }
2260
2261 last_accessed_element += diff;
2262
2263 /* Store the gap from the previous member of the group. If there is no
2264 gap in the access, GROUP_GAP is always 1. */
2265 GROUP_GAP (vinfo_for_stmt (next)) = diff;
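 /* For example (illustrative): 4-byte loads at byte offsets 0 and 8
 give diff = 2, i.e. a one-element gap, so the second member gets
 GROUP_GAP 2 and SLP of the group is marked impossible.  */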
2266
2267 prev_init = DR_INIT (data_ref);
2268 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2269 /* Count the number of data-refs in the chain. */
2270 count++;
2271 }
2272
2273 if (groupsize == 0)
2274 groupsize = count + gaps;
2275
2276 if (groupsize > UINT_MAX)
2277 {
2278 if (dump_enabled_p ())
2279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2280 "group is too large\n");
2281 return false;
2282 }
2283
2284 /* Check that the size of the interleaving is equal to count for stores,
2285 i.e., that there are no gaps. */
2286 if (groupsize != count
2287 && !DR_IS_READ (dr))
2288 {
2289 if (dump_enabled_p ())
2290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2291 "interleaved store with gaps\n");
2292 return false;
2293 }
2294
2295 /* If there is a gap after the last load in the group it is the
2296 difference between the groupsize and the last accessed
2297 element.
2298 When there is no gap, this difference should be 0. */
2299 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - last_accessed_element;
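 /* For example (illustrative): a group of size 4 whose last accessed
 element is 3 gets GROUP_GAP 1 on the first statement, i.e. one unused
 element at the end of each group instance.  */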
2300
2301 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2302 if (dump_enabled_p ())
2303 {
2304 dump_printf_loc (MSG_NOTE, vect_location,
2305 "Detected interleaving ");
2306 if (DR_IS_READ (dr))
2307 dump_printf (MSG_NOTE, "load ");
2308 else
2309 dump_printf (MSG_NOTE, "store ");
2310 dump_printf (MSG_NOTE, "of size %u starting with ",
2311 (unsigned)groupsize);
2312 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
2313 if (GROUP_GAP (vinfo_for_stmt (stmt)) != 0)
2314 dump_printf_loc (MSG_NOTE, vect_location,
2315 "There is a gap of %u elements after the group\n",
2316 GROUP_GAP (vinfo_for_stmt (stmt)));
2317 }
2318
2319 /* SLP: create an SLP data structure for every interleaving group of
2320 stores for further analysis in vect_analyze_slp. */
2321 if (DR_IS_WRITE (dr) && !slp_impossible)
2322 {
2323 if (loop_vinfo)
2324 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2325 if (bb_vinfo)
2326 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2327 }
2328
2329 /* If there is a gap at the end of the group, or the group size cannot
2330 be made a multiple of the vector element count, then we access excess
2331 elements in the last iteration and thus need to peel that off. */
2332 if (loop_vinfo
2333 && (groupsize - last_accessed_element > 0
2334 || exact_log2 (groupsize) == -1))
2335
2336 {
2337 if (dump_enabled_p ())
2338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2339 "Data access with gaps requires scalar "
2340 "epilogue loop\n");
2341 if (loop->inner)
2342 {
2343 if (dump_enabled_p ())
2344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2345 "Peeling for outer loop is not supported\n");
2346 return false;
2347 }
2348
2349 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2350 }
2351 }
2352
2353 return true;
2354 }
2355
2356 /* Analyze groups of accesses: check that DR belongs to a group of
2357 accesses of legal size, step, etc. Detect gaps, single element
2358 interleaving, and other special cases. Set grouped access info.
2359 Collect groups of strided stores for further use in SLP analysis. */
2360
2361 static bool
2362 vect_analyze_group_access (struct data_reference *dr)
2363 {
2364 if (!vect_analyze_group_access_1 (dr))
2365 {
2366 /* Dissolve the group if present. */
2367 gimple *next;
2368 gimple *stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dr)));
2369 while (stmt)
2370 {
2371 stmt_vec_info vinfo = vinfo_for_stmt (stmt);
2372 next = GROUP_NEXT_ELEMENT (vinfo);
2373 GROUP_FIRST_ELEMENT (vinfo) = NULL;
2374 GROUP_NEXT_ELEMENT (vinfo) = NULL;
2375 stmt = next;
2376 }
2377 return false;
2378 }
2379 return true;
2380 }
2381
2382 /* Analyze the access pattern of the data-reference DR.
2383 In case of non-consecutive accesses call vect_analyze_group_access() to
2384 analyze groups of accesses. */
2385
2386 static bool
2387 vect_analyze_data_ref_access (struct data_reference *dr)
2388 {
2389 tree step = DR_STEP (dr);
2390 tree scalar_type = TREE_TYPE (DR_REF (dr));
2391 gimple *stmt = DR_STMT (dr);
2392 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2393 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2394 struct loop *loop = NULL;
2395
2396 if (loop_vinfo)
2397 loop = LOOP_VINFO_LOOP (loop_vinfo);
2398
2399 if (loop_vinfo && !step)
2400 {
2401 if (dump_enabled_p ())
2402 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2403 "bad data-ref access in loop\n");
2404 return false;
2405 }
2406
2407 /* Allow loads with zero step in inner-loop vectorization. */
2408 if (loop_vinfo && integer_zerop (step))
2409 {
2410 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2411 if (!nested_in_vect_loop_p (loop, stmt))
2412 return DR_IS_READ (dr);
2413 /* Allow references with zero step only for outer loops marked
2414 with pragma omp simd - it guarantees the absence of
2415 loop-carried dependencies between inner loop iterations. */
2416 if (!loop->force_vectorize)
2417 {
2418 if (dump_enabled_p ())
2419 dump_printf_loc (MSG_NOTE, vect_location,
2420 "zero step in inner loop of nest\n");
2421 return false;
2422 }
2423 }
2424
2425 if (loop && nested_in_vect_loop_p (loop, stmt))
2426 {
2427 /* Interleaved accesses are not yet supported within outer-loop
2428 vectorization for references in the inner-loop. */
2429 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2430
2431 /* For the rest of the analysis we use the outer-loop step. */
2432 step = STMT_VINFO_DR_STEP (stmt_info);
2433 if (integer_zerop (step))
2434 {
2435 if (dump_enabled_p ())
2436 dump_printf_loc (MSG_NOTE, vect_location,
2437 "zero step in outer loop.\n");
2438 return DR_IS_READ (dr);
2439 }
2440 }
2441
2442 /* Consecutive? */
2443 if (TREE_CODE (step) == INTEGER_CST)
2444 {
2445 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2446 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2447 || (dr_step < 0
2448 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2449 {
2450 /* Mark that it is not interleaving. */
2451 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2452 return true;
2453 }
2454 }
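 /* For example (illustrative): an int access with DR_STEP 4, equal to the
 4-byte type size (or -4 for a backward walk), is consecutive and
 therefore not part of an interleaving group.  */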
2455
2456 if (loop && nested_in_vect_loop_p (loop, stmt))
2457 {
2458 if (dump_enabled_p ())
2459 dump_printf_loc (MSG_NOTE, vect_location,
2460 "grouped access in outer loop.\n");
2461 return false;
2462 }
2463
2464
2465 /* Assume this is a DR handled by the non-constant strided load case. */
2466 if (TREE_CODE (step) != INTEGER_CST)
2467 return (STMT_VINFO_STRIDED_P (stmt_info)
2468 && (!STMT_VINFO_GROUPED_ACCESS (stmt_info)
2469 || vect_analyze_group_access (dr)));
2470
2471 /* Not consecutive access - check if it's a part of interleaving group. */
2472 return vect_analyze_group_access (dr);
2473 }
2474
2475
2476
2477 /* A helper function used in the comparator function to sort data
2478 references. T1 and T2 are two trees to be compared.
2479 The function returns -1, 0, or 1. */
2480
2481 static int
2482 compare_tree (tree t1, tree t2)
2483 {
2484 int i, cmp;
2485 enum tree_code code;
2486 char tclass;
2487
2488 if (t1 == t2)
2489 return 0;
2490 if (t1 == NULL)
2491 return -1;
2492 if (t2 == NULL)
2493 return 1;
2494
2495
2496 if (TREE_CODE (t1) != TREE_CODE (t2))
2497 return TREE_CODE (t1) < TREE_CODE (t2) ? -1 : 1;
2498
2499 code = TREE_CODE (t1);
2500 switch (code)
2501 {
2502 /* For const values, we can just use hash values for comparisons. */
2503 case INTEGER_CST:
2504 case REAL_CST:
2505 case FIXED_CST:
2506 case STRING_CST:
2507 case COMPLEX_CST:
2508 case VECTOR_CST:
2509 {
2510 hashval_t h1 = iterative_hash_expr (t1, 0);
2511 hashval_t h2 = iterative_hash_expr (t2, 0);
2512 if (h1 != h2)
2513 return h1 < h2 ? -1 : 1;
2514 break;
2515 }
2516
2517 case SSA_NAME:
2518 cmp = compare_tree (SSA_NAME_VAR (t1), SSA_NAME_VAR (t2));
2519 if (cmp != 0)
2520 return cmp;
2521
2522 if (SSA_NAME_VERSION (t1) != SSA_NAME_VERSION (t2))
2523 return SSA_NAME_VERSION (t1) < SSA_NAME_VERSION (t2) ? -1 : 1;
2524 break;
2525
2526 default:
2527 tclass = TREE_CODE_CLASS (code);
2528
2529 /* For var-decl, we could compare their UIDs. */
2530 if (tclass == tcc_declaration)
2531 {
2532 if (DECL_UID (t1) != DECL_UID (t2))
2533 return DECL_UID (t1) < DECL_UID (t2) ? -1 : 1;
2534 break;
2535 }
2536
2537 /* For expressions with operands, compare their operands recursively. */
2538 for (i = TREE_OPERAND_LENGTH (t1) - 1; i >= 0; --i)
2539 {
2540 cmp = compare_tree (TREE_OPERAND (t1, i), TREE_OPERAND (t2, i));
2541 if (cmp != 0)
2542 return cmp;
2543 }
2544 }
2545
2546 return 0;
2547 }
2548
2549
2550 /* Compare two data-references DRA and DRB in order to sort them into
2551 chunks suitable for grouping. */
2552
2553 static int
2554 dr_group_sort_cmp (const void *dra_, const void *drb_)
2555 {
2556 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2557 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2558 int cmp;
2559
2560 /* Stabilize sort. */
2561 if (dra == drb)
2562 return 0;
2563
2564 /* Ordering of DRs according to base. */
2565 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2566 {
2567 cmp = compare_tree (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb));
2568 if (cmp != 0)
2569 return cmp;
2570 }
2571
2572 /* And according to DR_OFFSET. */
2573 if (!dr_equal_offsets_p (dra, drb))
2574 {
2575 cmp = compare_tree (DR_OFFSET (dra), DR_OFFSET (drb));
2576 if (cmp != 0)
2577 return cmp;
2578 }
2579
2580 /* Put reads before writes. */
2581 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2582 return DR_IS_READ (dra) ? -1 : 1;
2583
2584 /* Then sort by access size. */
2585 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2586 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2587 {
2588 cmp = compare_tree (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2589 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
2590 if (cmp != 0)
2591 return cmp;
2592 }
2593
2594 /* And by step. */
2595 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2596 {
2597 cmp = compare_tree (DR_STEP (dra), DR_STEP (drb));
2598 if (cmp != 0)
2599 return cmp;
2600 }
2601
2602 /* Then sort by DR_INIT. In case of identical DRs, sort by stmt UID. */
2603 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2604 if (cmp == 0)
2605 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2606 return cmp;
2607 }
2608
2609 /* Function vect_analyze_data_ref_accesses.
2610
2611 Analyze the access pattern of all the data references in the loop.
2612
2613 FORNOW: the only access pattern that is considered vectorizable is a
2614 simple step 1 (consecutive) access.
2615
2616 FORNOW: handle only arrays and pointer accesses. */
2617
2618 bool
2619 vect_analyze_data_ref_accesses (vec_info *vinfo)
2620 {
2621 unsigned int i;
2622 vec<data_reference_p> datarefs = vinfo->datarefs;
2623 struct data_reference *dr;
2624
2625 if (dump_enabled_p ())
2626 dump_printf_loc (MSG_NOTE, vect_location,
2627 "=== vect_analyze_data_ref_accesses ===\n");
2628
2629 if (datarefs.is_empty ())
2630 return true;
2631
2632 /* Sort the array of datarefs to make building the interleaving chains
2633 linear. Don't modify the original vector's order, it is needed for
2634 determining what dependencies are reversed. */
2635 vec<data_reference_p> datarefs_copy = datarefs.copy ();
2636 datarefs_copy.qsort (dr_group_sort_cmp);
2637
2638 /* Build the interleaving chains. */
2639 for (i = 0; i < datarefs_copy.length () - 1;)
2640 {
2641 data_reference_p dra = datarefs_copy[i];
2642 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2643 stmt_vec_info lastinfo = NULL;
2644 for (i = i + 1; i < datarefs_copy.length (); ++i)
2645 {
2646 data_reference_p drb = datarefs_copy[i];
2647 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2648
2649 /* ??? Imperfect sorting (non-compatible types, non-modulo
2650 accesses, same accesses) can lead to a group being artificially
2651 split here as we don't just skip over those. If it really
2652 matters we can push those to a worklist and re-iterate
2653 over them. Then we can just skip ahead to the next DR here. */
2654
2655 /* Check that the data-refs have the same first location (except init)
2656 and that they are both either stores or loads (not a load and a store,
2657 and not masked loads or stores). */
2658 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2659 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2660 DR_BASE_ADDRESS (drb), 0)
2661 || !dr_equal_offsets_p (dra, drb)
2662 || !gimple_assign_single_p (DR_STMT (dra))
2663 || !gimple_assign_single_p (DR_STMT (drb)))
2664 break;
2665
2666 /* Check that the data-refs have the same constant size. */
2667 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2668 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2669 if (!tree_fits_uhwi_p (sza)
2670 || !tree_fits_uhwi_p (szb)
2671 || !tree_int_cst_equal (sza, szb))
2672 break;
2673
2674 /* Check that the data-refs have the same step. */
2675 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2676 break;
2677
2678 /* Do not place the same access in the interleaving chain twice. */
2679 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2680 break;
2681
2682 /* Check the types are compatible.
2683 ??? We don't distinguish this during sorting. */
2684 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2685 TREE_TYPE (DR_REF (drb))))
2686 break;
2687
2688 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2689 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2690 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2691 gcc_assert (init_a < init_b);
2692
2693 /* If init_b == init_a + the size of the type * k, we have an
2694 interleaving, and DRA is accessed before DRB. */
2695 HOST_WIDE_INT type_size_a = tree_to_uhwi (sza);
2696 if ((init_b - init_a) % type_size_a != 0)
2697 break;
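 /* For example (illustrative): two 4-byte loads with DR_INITs 0 and 8
 satisfy (8 - 0) % 4 == 0, so they may still belong to the same group,
 with a one-element gap between them.  */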
2698
2699 /* If we have a store, require the accesses to be adjacent. This splits
2700 groups into chunks we support (we don't support vectorization
2701 of stores with gaps). */
2702 if (!DR_IS_READ (dra)
2703 && (init_b - (HOST_WIDE_INT) TREE_INT_CST_LOW
2704 (DR_INIT (datarefs_copy[i-1]))
2705 != type_size_a))
2706 break;
2707
2708 /* If the step (when constant and nonzero) does not exceed the
2709 difference between the data-refs' inits, DRB would fall into a later
2710 instance of the group; splitting here keeps groups at suitable sizes. */
2711 if (tree_fits_shwi_p (DR_STEP (dra)))
2712 {
2713 HOST_WIDE_INT step = tree_to_shwi (DR_STEP (dra));
2714 if (step != 0 && step <= (init_b - init_a))
2715 break;
2716 }
2717
2718 if (dump_enabled_p ())
2719 {
2720 dump_printf_loc (MSG_NOTE, vect_location,
2721 "Detected interleaving ");
2722 if (DR_IS_READ (dra))
2723 dump_printf (MSG_NOTE, "load ");
2724 else
2725 dump_printf (MSG_NOTE, "store ");
2726 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2727 dump_printf (MSG_NOTE, " and ");
2728 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2729 dump_printf (MSG_NOTE, "\n");
2730 }
2731
2732 /* Link the found element into the group list. */
2733 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2734 {
2735 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2736 lastinfo = stmtinfo_a;
2737 }
2738 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2739 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2740 lastinfo = stmtinfo_b;
2741 }
2742 }
2743
2744 FOR_EACH_VEC_ELT (datarefs_copy, i, dr)
2745 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2746 && !vect_analyze_data_ref_access (dr))
2747 {
2748 if (dump_enabled_p ())
2749 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2750 "not vectorized: complicated access pattern.\n");
2751
2752 if (is_a <bb_vec_info> (vinfo))
2753 {
2754 /* Mark the statement as not vectorizable. */
2755 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2756 continue;
2757 }
2758 else
2759 {
2760 datarefs_copy.release ();
2761 return false;
2762 }
2763 }
2764
2765 datarefs_copy.release ();
2766 return true;
2767 }
2768
2769
2770 /* Operator == between two dr_with_seg_len objects.
2771
2772 This equality operator is used to make sure two data refs
2773 are the same so that we will consider combining the
2774 aliasing checks of the two pairs of data-dependent data
2775 refs. */
2776
2777 static bool
2778 operator == (const dr_with_seg_len& d1,
2779 const dr_with_seg_len& d2)
2780 {
2781 return operand_equal_p (DR_BASE_ADDRESS (d1.dr),
2782 DR_BASE_ADDRESS (d2.dr), 0)
2783 && compare_tree (d1.offset, d2.offset) == 0
2784 && compare_tree (d1.seg_len, d2.seg_len) == 0;
2785 }
2786
2787 /* Function comp_dr_with_seg_len_pair.
2788
2789 Comparison function for sorting objects of dr_with_seg_len_pair_t
2790 so that we can combine aliasing checks in one scan. */
2791
2792 static int
2793 comp_dr_with_seg_len_pair (const void *p1_, const void *p2_)
2794 {
2795 const dr_with_seg_len_pair_t* p1 = (const dr_with_seg_len_pair_t *) p1_;
2796 const dr_with_seg_len_pair_t* p2 = (const dr_with_seg_len_pair_t *) p2_;
2797
2798 const dr_with_seg_len &p11 = p1->first,
2799 &p12 = p1->second,
2800 &p21 = p2->first,
2801 &p22 = p2->second;
2802
2803 /* For DR pairs (a, b) and (c, d), we only consider merging the alias checks
2804 if a and c have the same base address and step, and b and d have the same
2805 base address and step. Therefore, if either a&c or b&d don't have the same
2806 address and step, we don't care about the order of the two pairs after sorting. */
2807 int comp_res;
2808
2809 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p11.dr),
2810 DR_BASE_ADDRESS (p21.dr))) != 0)
2811 return comp_res;
2812 if ((comp_res = compare_tree (DR_BASE_ADDRESS (p12.dr),
2813 DR_BASE_ADDRESS (p22.dr))) != 0)
2814 return comp_res;
2815 if ((comp_res = compare_tree (DR_STEP (p11.dr), DR_STEP (p21.dr))) != 0)
2816 return comp_res;
2817 if ((comp_res = compare_tree (DR_STEP (p12.dr), DR_STEP (p22.dr))) != 0)
2818 return comp_res;
2819 if ((comp_res = compare_tree (p11.offset, p21.offset)) != 0)
2820 return comp_res;
2821 if ((comp_res = compare_tree (p12.offset, p22.offset)) != 0)
2822 return comp_res;
2823
2824 return 0;
2825 }
2826
2827 /* Function vect_vfa_segment_size.
2828
2829 Create an expression that computes the size of segment
2830 that will be accessed for a data reference. The functions takes into
2831 account that realignment loads may access one more vector.
2832
2833 Input:
2834 DR: The data reference.
2835 LENGTH_FACTOR: segment length to consider.
2836
2837 Return an expression whose value is the size of segment which will be
2838 accessed by DR. */
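 /* For example (illustrative): with DR_STEP 4 and LENGTH_FACTOR equal to
 a vectorization factor of 4, the segment is 4 * 4 = 16 bytes; a
 dr_explicit_realign_optimized access adds one extra vector size.  */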
2839
2840 static tree
2841 vect_vfa_segment_size (struct data_reference *dr, tree length_factor)
2842 {
2843 tree segment_length;
2844
2845 if (integer_zerop (DR_STEP (dr)))
2846 segment_length = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
2847 else
2848 segment_length = size_binop (MULT_EXPR,
2849 fold_convert (sizetype, DR_STEP (dr)),
2850 fold_convert (sizetype, length_factor));
2851
2852 if (vect_supportable_dr_alignment (dr, false)
2853 == dr_explicit_realign_optimized)
2854 {
2855 tree vector_size = TYPE_SIZE_UNIT
2856 (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))));
2857
2858 segment_length = size_binop (PLUS_EXPR, segment_length, vector_size);
2859 }
2860 return segment_length;
2861 }
2862
2863 /* Function vect_prune_runtime_alias_test_list.
2864
2865 Prune a list of ddrs to be tested at run-time by versioning for alias.
2866 Merge several alias checks into one if possible.
2867 Return FALSE if the resulting list of ddrs is longer than allowed by
2868 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2869
2870 bool
2871 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2872 {
2873 vec<ddr_p> may_alias_ddrs =
2874 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2875 vec<dr_with_seg_len_pair_t>& comp_alias_ddrs =
2876 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2877 int vect_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2878 tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo);
2879
2880 ddr_p ddr;
2881 unsigned int i;
2882 tree length_factor;
2883
2884 if (dump_enabled_p ())
2885 dump_printf_loc (MSG_NOTE, vect_location,
2886 "=== vect_prune_runtime_alias_test_list ===\n");
2887
2888 if (may_alias_ddrs.is_empty ())
2889 return true;
2890
2891 /* Basically, for each pair of dependent data refs store_ptr_0
2892 and load_ptr_0, we create an expression:
2893
2894 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2895 || (load_ptr_0 + load_segment_length_0) <= store_ptr_0))
2896
2897 for aliasing checks. However, in some cases we can decrease
2898 the number of checks by combining two checks into one. For
2899 example, suppose we have another pair of data refs store_ptr_0
2900 and load_ptr_1, and if the following condition is satisfied:
2901
2902 load_ptr_0 < load_ptr_1 &&
2903 load_ptr_1 - load_ptr_0 - load_segment_length_0 < store_segment_length_0
2904
2905 (this condition means, in each iteration of vectorized loop,
2906 the accessed memory of store_ptr_0 cannot be between the memory
2907 of load_ptr_0 and load_ptr_1.)
2908
2909 we can then use only the following expression to finish the
2910 aliasing checks between store_ptr_0 & load_ptr_0 and
2911 store_ptr_0 & load_ptr_1:
2912
2913 ((store_ptr_0 + store_segment_length_0) <= load_ptr_0)
2914 || (load_ptr_1 + load_segment_length_1 <= store_ptr_0))
2915
2916 Note that we only consider the case where load_ptr_0 and load_ptr_1
2917 have the same base address. */
2918
2919 comp_alias_ddrs.create (may_alias_ddrs.length ());
2920
2921 /* First, we collect all data ref pairs for aliasing checks. */
2922 FOR_EACH_VEC_ELT (may_alias_ddrs, i, ddr)
2923 {
2924 struct data_reference *dr_a, *dr_b;
2925 gimple *dr_group_first_a, *dr_group_first_b;
2926 tree segment_length_a, segment_length_b;
2927 gimple *stmt_a, *stmt_b;
2928
2929 dr_a = DDR_A (ddr);
2930 stmt_a = DR_STMT (DDR_A (ddr));
2931 dr_group_first_a = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_a));
2932 if (dr_group_first_a)
2933 {
2934 stmt_a = dr_group_first_a;
2935 dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a));
2936 }
2937
2938 dr_b = DDR_B (ddr);
2939 stmt_b = DR_STMT (DDR_B (ddr));
2940 dr_group_first_b = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_b));
2941 if (dr_group_first_b)
2942 {
2943 stmt_b = dr_group_first_b;
2944 dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b));
2945 }
2946
2947 if (!operand_equal_p (DR_STEP (dr_a), DR_STEP (dr_b), 0))
2948 length_factor = scalar_loop_iters;
2949 else
2950 length_factor = size_int (vect_factor);
2951 segment_length_a = vect_vfa_segment_size (dr_a, length_factor);
2952 segment_length_b = vect_vfa_segment_size (dr_b, length_factor);
2953
2954 dr_with_seg_len_pair_t dr_with_seg_len_pair
2955 (dr_with_seg_len (dr_a, segment_length_a),
2956 dr_with_seg_len (dr_b, segment_length_b));
2957
2958 if (compare_tree (DR_BASE_ADDRESS (dr_a), DR_BASE_ADDRESS (dr_b)) > 0)
2959 std::swap (dr_with_seg_len_pair.first, dr_with_seg_len_pair.second);
2960
2961 comp_alias_ddrs.safe_push (dr_with_seg_len_pair);
2962 }
2963
2964 /* Second, we sort the collected data ref pairs so that we can scan
2965 them once to combine all possible aliasing checks. */
2966 comp_alias_ddrs.qsort (comp_dr_with_seg_len_pair);
2967
2968 /* Third, we scan the sorted dr pairs and check if we can combine
2969 alias checks of two neighbouring dr pairs. */
2970 for (size_t i = 1; i < comp_alias_ddrs.length (); ++i)
2971 {
2972 /* Deal with two ddrs (dr_a1, dr_b1) and (dr_a2, dr_b2). */
2973 dr_with_seg_len *dr_a1 = &comp_alias_ddrs[i-1].first,
2974 *dr_b1 = &comp_alias_ddrs[i-1].second,
2975 *dr_a2 = &comp_alias_ddrs[i].first,
2976 *dr_b2 = &comp_alias_ddrs[i].second;
2977
2978 /* Remove duplicate data ref pairs. */
2979 if (*dr_a1 == *dr_a2 && *dr_b1 == *dr_b2)
2980 {
2981 if (dump_enabled_p ())
2982 {
2983 dump_printf_loc (MSG_NOTE, vect_location,
2984 "found equal ranges ");
2985 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2986 DR_REF (dr_a1->dr));
2987 dump_printf (MSG_NOTE, ", ");
2988 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2989 DR_REF (dr_b1->dr));
2990 dump_printf (MSG_NOTE, " and ");
2991 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2992 DR_REF (dr_a2->dr));
2993 dump_printf (MSG_NOTE, ", ");
2994 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2995 DR_REF (dr_b2->dr));
2996 dump_printf (MSG_NOTE, "\n");
2997 }
2998
2999 comp_alias_ddrs.ordered_remove (i--);
3000 continue;
3001 }
3002
3003 if (*dr_a1 == *dr_a2 || *dr_b1 == *dr_b2)
3004 {
3005 /* We consider the case where DR_B1 and DR_B2 are the same memref,
3006 and DR_A1 and DR_A2 are two consecutive memrefs. */
3007 if (*dr_a1 == *dr_a2)
3008 {
3009 std::swap (dr_a1, dr_b1);
3010 std::swap (dr_a2, dr_b2);
3011 }
3012
3013 if (!operand_equal_p (DR_BASE_ADDRESS (dr_a1->dr),
3014 DR_BASE_ADDRESS (dr_a2->dr),
3015 0)
3016 || !tree_fits_shwi_p (dr_a1->offset)
3017 || !tree_fits_shwi_p (dr_a2->offset))
3018 continue;
3019
3020 HOST_WIDE_INT diff = (tree_to_shwi (dr_a2->offset)
3021 - tree_to_shwi (dr_a1->offset));
3022
3023
3024 /* Now we check if the following condition is satisfied:
3025
3026 DIFF - SEGMENT_LENGTH_A < SEGMENT_LENGTH_B
3027
3028 where DIFF = DR_A2->OFFSET - DR_A1->OFFSET. However,
3029 SEGMENT_LENGTH_A or SEGMENT_LENGTH_B may not be constant so we
3030 have to make a best estimate. We can get the minimum value
3031 of SEGMENT_LENGTH_B as a constant, represented by MIN_SEG_LEN_B,
3032 then either of the following two conditions can guarantee the
3033 one above:
3034
3035 1: DIFF <= MIN_SEG_LEN_B
3036 2: DIFF - SEGMENT_LENGTH_A < MIN_SEG_LEN_B
3037
3038 */
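 /* For example (illustrative): DIFF = 20, SEGMENT_LENGTH_A = 16 and
 MIN_SEG_LEN_B = 16: condition 1 fails (20 > 16) but condition 2 holds
 (20 - 16 = 4 < 16), so the two checks are merged and DR_A1's segment
 length is updated to DR_A2's segment length + 20.  */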
3039
3040 HOST_WIDE_INT min_seg_len_b = (tree_fits_shwi_p (dr_b1->seg_len)
3041 ? tree_to_shwi (dr_b1->seg_len)
3042 : vect_factor);
3043
3044 if (diff <= min_seg_len_b
3045 || (tree_fits_shwi_p (dr_a1->seg_len)
3046 && diff - tree_to_shwi (dr_a1->seg_len) < min_seg_len_b))
3047 {
3048 if (dump_enabled_p ())
3049 {
3050 dump_printf_loc (MSG_NOTE, vect_location,
3051 "merging ranges for ");
3052 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3053 DR_REF (dr_a1->dr));
3054 dump_printf (MSG_NOTE, ", ");
3055 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3056 DR_REF (dr_b1->dr));
3057 dump_printf (MSG_NOTE, " and ");
3058 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3059 DR_REF (dr_a2->dr));
3060 dump_printf (MSG_NOTE, ", ");
3061 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3062 DR_REF (dr_b2->dr));
3063 dump_printf (MSG_NOTE, "\n");
3064 }
3065
3066 dr_a1->seg_len = size_binop (PLUS_EXPR,
3067 dr_a2->seg_len, size_int (diff));
3068 comp_alias_ddrs.ordered_remove (i--);
3069 }
3070 }
3071 }
3072
3073 dump_printf_loc (MSG_NOTE, vect_location,
3074 "improved number of alias checks from %d to %d\n",
3075 may_alias_ddrs.length (), comp_alias_ddrs.length ());
3076 if ((int) comp_alias_ddrs.length () >
3077 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
3078 return false;
3079
3080 return true;
3081 }
3082
3083 /* Check whether a non-affine read or write in stmt is suitable for gather load
3084 or scatter store and if so, return a builtin decl for that operation. */
3085
3086 tree
3087 vect_check_gather_scatter (gimple *stmt, loop_vec_info loop_vinfo, tree *basep,
3088 tree *offp, int *scalep)
3089 {
3090 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
3091 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3092 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3093 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3094 tree offtype = NULL_TREE;
3095 tree decl, base, off;
3096 machine_mode pmode;
3097 int punsignedp, reversep, pvolatilep = 0;
3098
3099 base = DR_REF (dr);
3100 /* For masked loads/stores, DR_REF (dr) is an artificial MEM_REF,
3101 see if we can use the def stmt of the address. */
3102 if (is_gimple_call (stmt)
3103 && gimple_call_internal_p (stmt)
3104 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
3105 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
3106 && TREE_CODE (base) == MEM_REF
3107 && TREE_CODE (TREE_OPERAND (base, 0)) == SSA_NAME
3108 && integer_zerop (TREE_OPERAND (base, 1))
3109 && !expr_invariant_in_loop_p (loop, TREE_OPERAND (base, 0)))
3110 {
3111 gimple *def_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (base, 0));
3112 if (is_gimple_assign (def_stmt)
3113 && gimple_assign_rhs_code (def_stmt) == ADDR_EXPR)
3114 base = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0);
3115 }
3116
3117 /* The gather and scatter builtins need address of the form
3118 loop_invariant + vector * {1, 2, 4, 8}
3119 or
3120 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
3121 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
3122 of loop invariants/SSA_NAMEs defined in the loop, with casts,
3123 multiplications and additions in it. To get a vector, we need
3124 a single SSA_NAME that will be defined in the loop and will
3125 contain everything that is not loop invariant and that can be
3126 vectorized. The following code attempts to find such a preexisting
3127 SSA_NAME OFF and put the loop invariants into a tree BASE
3128 that can be gimplified before the loop. */
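 /* For example (hypothetical source): for a loop doing sum += a[b[i]]
 with 4-byte elements of A, the code below would end up with BASE
 pointing at A, OFF being the SSA_NAME holding (a possibly sign-extended)
 b[i], and SCALE == 4.  */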
3129 base = get_inner_reference (base, &pbitsize, &pbitpos, &off, &pmode,
3130 &punsignedp, &reversep, &pvolatilep, false);
3131 gcc_assert (base && (pbitpos % BITS_PER_UNIT) == 0 && !reversep);
3132
3133 if (TREE_CODE (base) == MEM_REF)
3134 {
3135 if (!integer_zerop (TREE_OPERAND (base, 1)))
3136 {
3137 if (off == NULL_TREE)
3138 {
3139 offset_int moff = mem_ref_offset (base);
3140 off = wide_int_to_tree (sizetype, moff);
3141 }
3142 else
3143 off = size_binop (PLUS_EXPR, off,
3144 fold_convert (sizetype, TREE_OPERAND (base, 1)));
3145 }
3146 base = TREE_OPERAND (base, 0);
3147 }
3148 else
3149 base = build_fold_addr_expr (base);
3150
3151 if (off == NULL_TREE)
3152 off = size_zero_node;
3153
3154 /* If base is not loop invariant, then either off is 0 and we start with
3155 just the constant offset in the loop invariant BASE and continue with
3156 base as OFF, or we give up.
3157 We could handle the latter case by gimplifying the addition of base + off
3158 into some SSA_NAME and using that as off, but for now punt. */
3159 if (!expr_invariant_in_loop_p (loop, base))
3160 {
3161 if (!integer_zerop (off))
3162 return NULL_TREE;
3163 off = base;
3164 base = size_int (pbitpos / BITS_PER_UNIT);
3165 }
3166 /* Otherwise put base + constant offset into the loop invariant BASE
3167 and continue with OFF. */
3168 else
3169 {
3170 base = fold_convert (sizetype, base);
3171 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
3172 }
3173
3174 /* OFF at this point may be either a SSA_NAME or some tree expression
3175 from get_inner_reference. Try to peel off loop invariants from it
3176 into BASE as long as possible. */
3177 STRIP_NOPS (off);
3178 while (offtype == NULL_TREE)
3179 {
3180 enum tree_code code;
3181 tree op0, op1, add = NULL_TREE;
3182
3183 if (TREE_CODE (off) == SSA_NAME)
3184 {
3185 gimple *def_stmt = SSA_NAME_DEF_STMT (off);
3186
3187 if (expr_invariant_in_loop_p (loop, off))
3188 return NULL_TREE;
3189
3190 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
3191 break;
3192
3193 op0 = gimple_assign_rhs1 (def_stmt);
3194 code = gimple_assign_rhs_code (def_stmt);
3195 op1 = gimple_assign_rhs2 (def_stmt);
3196 }
3197 else
3198 {
3199 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
3200 return NULL_TREE;
3201 code = TREE_CODE (off);
3202 extract_ops_from_tree (off, &code, &op0, &op1);
3203 }
3204 switch (code)
3205 {
3206 case POINTER_PLUS_EXPR:
3207 case PLUS_EXPR:
3208 if (expr_invariant_in_loop_p (loop, op0))
3209 {
3210 add = op0;
3211 off = op1;
3212 do_add:
3213 add = fold_convert (sizetype, add);
3214 if (scale != 1)
3215 add = size_binop (MULT_EXPR, add, size_int (scale));
3216 base = size_binop (PLUS_EXPR, base, add);
3217 continue;
3218 }
3219 if (expr_invariant_in_loop_p (loop, op1))
3220 {
3221 add = op1;
3222 off = op0;
3223 goto do_add;
3224 }
3225 break;
3226 case MINUS_EXPR:
3227 if (expr_invariant_in_loop_p (loop, op1))
3228 {
3229 add = fold_convert (sizetype, op1);
3230 add = size_binop (MINUS_EXPR, size_zero_node, add);
3231 off = op0;
3232 goto do_add;
3233 }
3234 break;
3235 case MULT_EXPR:
3236 if (scale == 1 && tree_fits_shwi_p (op1))
3237 {
3238 scale = tree_to_shwi (op1);
3239 off = op0;
3240 continue;
3241 }
3242 break;
3243 case SSA_NAME:
3244 off = op0;
3245 continue;
3246 CASE_CONVERT:
3247 if (!POINTER_TYPE_P (TREE_TYPE (op0))
3248 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
3249 break;
3250 if (TYPE_PRECISION (TREE_TYPE (op0))
3251 == TYPE_PRECISION (TREE_TYPE (off)))
3252 {
3253 off = op0;
3254 continue;
3255 }
3256 if (TYPE_PRECISION (TREE_TYPE (op0))
3257 < TYPE_PRECISION (TREE_TYPE (off)))
3258 {
3259 off = op0;
3260 offtype = TREE_TYPE (off);
3261 STRIP_NOPS (off);
3262 continue;
3263 }
3264 break;
3265 default:
3266 break;
3267 }
3268 break;
3269 }
3270
3271 /* If at the end OFF still isn't an SSA_NAME or isn't
3272 defined in the loop, punt. */
3273 if (TREE_CODE (off) != SSA_NAME
3274 || expr_invariant_in_loop_p (loop, off))
3275 return NULL_TREE;
3276
3277 if (offtype == NULL_TREE)
3278 offtype = TREE_TYPE (off);
3279
3280 if (DR_IS_READ (dr))
3281 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
3282 offtype, scale);
3283 else
3284 decl = targetm.vectorize.builtin_scatter (STMT_VINFO_VECTYPE (stmt_info),
3285 offtype, scale);
3286
3287 if (decl == NULL_TREE)
3288 return NULL_TREE;
3289
3290 if (basep)
3291 *basep = base;
3292 if (offp)
3293 *offp = off;
3294 if (scalep)
3295 *scalep = scale;
3296 return decl;
3297 }
3298
3299 /* Function vect_analyze_data_refs.
3300
3301 Find all the data references in the loop or basic block.
3302
3303 The general structure of the analysis of data refs in the vectorizer is as
3304 follows:
3305 1- vect_analyze_data_refs(loop/bb): call
3306 compute_data_dependences_for_loop/bb to find and analyze all data-refs
3307 in the loop/bb and their dependences.
3308 2- vect_analyze_dependences(): apply dependence testing using ddrs.
3309 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
3310 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
3311
3312 */
3313
3314 bool
3315 vect_analyze_data_refs (vec_info *vinfo, int *min_vf)
3316 {
3317 struct loop *loop = NULL;
3318 unsigned int i;
3319 struct data_reference *dr;
3320 tree scalar_type;
3321
3322 if (dump_enabled_p ())
3323 dump_printf_loc (MSG_NOTE, vect_location,
3324 "=== vect_analyze_data_refs ===\n");
3325
3326 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
3327 loop = LOOP_VINFO_LOOP (loop_vinfo);
3328
3329 /* Go through the data-refs, check that the analysis succeeded. Update
3330 pointer from stmt_vec_info struct to DR and vectype. */
3331
3332 vec<data_reference_p> datarefs = vinfo->datarefs;
3333 FOR_EACH_VEC_ELT (datarefs, i, dr)
3334 {
3335 gimple *stmt;
3336 stmt_vec_info stmt_info;
3337 tree base, offset, init;
3338 enum { SG_NONE, GATHER, SCATTER } gatherscatter = SG_NONE;
3339 bool simd_lane_access = false;
3340 int vf;
3341
3342 again:
3343 if (!dr || !DR_REF (dr))
3344 {
3345 if (dump_enabled_p ())
3346 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3347 "not vectorized: unhandled data-ref\n");
3348 return false;
3349 }
3350
3351 stmt = DR_STMT (dr);
3352 stmt_info = vinfo_for_stmt (stmt);
3353
3354 /* Discard clobbers from the dataref vector. We will remove
3355 clobber stmts during vectorization. */
3356 if (gimple_clobber_p (stmt))
3357 {
3358 free_data_ref (dr);
3359 if (i == datarefs.length () - 1)
3360 {
3361 datarefs.pop ();
3362 break;
3363 }
3364 datarefs.ordered_remove (i);
3365 dr = datarefs[i];
3366 goto again;
3367 }
3368
3369 /* Check that analysis of the data-ref succeeded. */
3370 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3371 || !DR_STEP (dr))
3372 {
3373 bool maybe_gather
3374 = DR_IS_READ (dr)
3375 && !TREE_THIS_VOLATILE (DR_REF (dr))
3376 && targetm.vectorize.builtin_gather != NULL;
3377 bool maybe_scatter
3378 = DR_IS_WRITE (dr)
3379 && !TREE_THIS_VOLATILE (DR_REF (dr))
3380 && targetm.vectorize.builtin_scatter != NULL;
3381 bool maybe_simd_lane_access
3382 = is_a <loop_vec_info> (vinfo) && loop->simduid;
3383
3384 /* If the target supports vector gather loads or scatter stores, or if
3385 this might be a SIMD lane access, see if they can be used. */
3386 if (is_a <loop_vec_info> (vinfo)
3387 && (maybe_gather || maybe_scatter || maybe_simd_lane_access)
3388 && !nested_in_vect_loop_p (loop, stmt))
3389 {
3390 struct data_reference *newdr
3391 = create_data_ref (NULL, loop_containing_stmt (stmt),
3392 DR_REF (dr), stmt, maybe_scatter ? false : true);
3393 gcc_assert (newdr != NULL && DR_REF (newdr));
3394 if (DR_BASE_ADDRESS (newdr)
3395 && DR_OFFSET (newdr)
3396 && DR_INIT (newdr)
3397 && DR_STEP (newdr)
3398 && integer_zerop (DR_STEP (newdr)))
3399 {
3400 if (maybe_simd_lane_access)
3401 {
3402 tree off = DR_OFFSET (newdr);
3403 STRIP_NOPS (off);
3404 if (TREE_CODE (DR_INIT (newdr)) == INTEGER_CST
3405 && TREE_CODE (off) == MULT_EXPR
3406 && tree_fits_uhwi_p (TREE_OPERAND (off, 1)))
3407 {
3408 tree step = TREE_OPERAND (off, 1);
3409 off = TREE_OPERAND (off, 0);
3410 STRIP_NOPS (off);
3411 if (CONVERT_EXPR_P (off)
3412 && TYPE_PRECISION (TREE_TYPE (TREE_OPERAND (off,
3413 0)))
3414 < TYPE_PRECISION (TREE_TYPE (off)))
3415 off = TREE_OPERAND (off, 0);
3416 if (TREE_CODE (off) == SSA_NAME)
3417 {
3418 gimple *def = SSA_NAME_DEF_STMT (off);
3419 tree reft = TREE_TYPE (DR_REF (newdr));
3420 if (is_gimple_call (def)
3421 && gimple_call_internal_p (def)
3422 && (gimple_call_internal_fn (def)
3423 == IFN_GOMP_SIMD_LANE))
3424 {
3425 tree arg = gimple_call_arg (def, 0);
3426 gcc_assert (TREE_CODE (arg) == SSA_NAME);
3427 arg = SSA_NAME_VAR (arg);
3428 if (arg == loop->simduid
3429 /* For now. */
3430 && tree_int_cst_equal
3431 (TYPE_SIZE_UNIT (reft),
3432 step))
3433 {
3434 DR_OFFSET (newdr) = ssize_int (0);
3435 DR_STEP (newdr) = step;
3436 DR_ALIGNED_TO (newdr)
3437 = size_int (BIGGEST_ALIGNMENT);
3438 dr = newdr;
3439 simd_lane_access = true;
3440 }
3441 }
3442 }
3443 }
3444 }
3445 if (!simd_lane_access && (maybe_gather || maybe_scatter))
3446 {
3447 dr = newdr;
3448 if (maybe_gather)
3449 gatherscatter = GATHER;
3450 else
3451 gatherscatter = SCATTER;
3452 }
3453 }
3454 if (gatherscatter == SG_NONE && !simd_lane_access)
3455 free_data_ref (newdr);
3456 }
3457
3458 if (gatherscatter == SG_NONE && !simd_lane_access)
3459 {
3460 if (dump_enabled_p ())
3461 {
3462 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3463 "not vectorized: data ref analysis "
3464 "failed ");
3465 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3466 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3467 }
3468
3469 if (is_a <bb_vec_info> (vinfo))
3470 break;
3471
3472 return false;
3473 }
3474 }
3475
3476 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3477 {
3478 if (dump_enabled_p ())
3479 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3480 "not vectorized: base addr of dr is a "
3481 "constant\n");
3482
3483 if (is_a <bb_vec_info> (vinfo))
3484 break;
3485
3486 if (gatherscatter != SG_NONE || simd_lane_access)
3487 free_data_ref (dr);
3488 return false;
3489 }
3490
3491 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3492 {
3493 if (dump_enabled_p ())
3494 {
3495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3496 "not vectorized: volatile type ");
3497 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3498 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3499 }
3500
3501 if (is_a <bb_vec_info> (vinfo))
3502 break;
3503
3504 return false;
3505 }
3506
3507 if (stmt_can_throw_internal (stmt))
3508 {
3509 if (dump_enabled_p ())
3510 {
3511 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3512 "not vectorized: statement can throw an "
3513 "exception ");
3514 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3515 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3516 }
3517
3518 if (is_a <bb_vec_info> (vinfo))
3519 break;
3520
3521 if (gatherscatter != SG_NONE || simd_lane_access)
3522 free_data_ref (dr);
3523 return false;
3524 }
3525
3526 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3527 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3528 {
3529 if (dump_enabled_p ())
3530 {
3531 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3532 "not vectorized: statement is bitfield "
3533 "access ");
3534 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3535 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3536 }
3537
3538 if (is_a <bb_vec_info> (vinfo))
3539 break;
3540
3541 if (gatherscatter != SG_NONE || simd_lane_access)
3542 free_data_ref (dr);
3543 return false;
3544 }
3545
3546 base = unshare_expr (DR_BASE_ADDRESS (dr));
3547 offset = unshare_expr (DR_OFFSET (dr));
3548 init = unshare_expr (DR_INIT (dr));
3549
3550 if (is_gimple_call (stmt)
3551 && (!gimple_call_internal_p (stmt)
3552 || (gimple_call_internal_fn (stmt) != IFN_MASK_LOAD
3553 && gimple_call_internal_fn (stmt) != IFN_MASK_STORE)))
3554 {
3555 if (dump_enabled_p ())
3556 {
3557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3558 "not vectorized: dr in a call ");
3559 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3560 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3561 }
3562
3563 if (is_a <bb_vec_info> (vinfo))
3564 break;
3565
3566 if (gatherscatter != SG_NONE || simd_lane_access)
3567 free_data_ref (dr);
3568 return false;
3569 }
3570
3571 /* Update DR field in stmt_vec_info struct. */
3572
3573 /* If the dataref is in an inner-loop of the loop that is considered for
3574 vectorization, we also want to analyze the access relative to
3575 the outer-loop (DR contains information only relative to the
3576 inner-most enclosing loop). We do that by building a reference to the
3577 first location accessed by the inner-loop, and analyze it relative to
3578 the outer-loop. */
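/* For example (illustrative, 2-byte "short" elements): for

     for (i = 0; i < N; i++)
       for (j = 0; j < M; j++)
	 s += in[i + j];

   the inner-loop DR for in[i+j] has base &in, offset i*2, init 0 and
   step 2; the analysis below produces an outer-loop view with base &in,
   offset 0, init 0 and a 2-byte outer step, i.e. the first location
   accessed by the inner-loop advances by 2 bytes per outer iteration.  */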
3579 if (loop && nested_in_vect_loop_p (loop, stmt))
3580 {
3581 tree outer_step, outer_base, outer_init;
3582 HOST_WIDE_INT pbitsize, pbitpos;
3583 tree poffset;
3584 machine_mode pmode;
3585 int punsignedp, preversep, pvolatilep;
3586 affine_iv base_iv, offset_iv;
3587 tree dinit;
3588
3589 /* Build a reference to the first location accessed by the
3590 inner-loop: *(BASE+INIT). (The first location is actually
3591 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3592 tree inner_base = build_fold_indirect_ref
3593 (fold_build_pointer_plus (base, init));
3594
3595 if (dump_enabled_p ())
3596 {
3597 dump_printf_loc (MSG_NOTE, vect_location,
3598 "analyze in outer-loop: ");
3599 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3600 dump_printf (MSG_NOTE, "\n");
3601 }
3602
3603 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3604 &poffset, &pmode, &punsignedp,
3605 &preversep, &pvolatilep, false);
3606 gcc_assert (outer_base != NULL_TREE);
3607
3608 if (pbitpos % BITS_PER_UNIT != 0)
3609 {
3610 if (dump_enabled_p ())
3611 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3612 "failed: bit offset alignment.\n");
3613 return false;
3614 }
3615
3616 if (preversep)
3617 {
3618 if (dump_enabled_p ())
3619 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3620 "failed: reverse storage order.\n");
3621 return false;
3622 }
3623
3624 outer_base = build_fold_addr_expr (outer_base);
3625 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3626 &base_iv, false))
3627 {
3628 if (dump_enabled_p ())
3629 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3630 "failed: evolution of base is not affine.\n");
3631 return false;
3632 }
3633
3634 if (offset)
3635 {
3636 if (poffset)
3637 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3638 poffset);
3639 else
3640 poffset = offset;
3641 }
3642
3643 if (!poffset)
3644 {
3645 offset_iv.base = ssize_int (0);
3646 offset_iv.step = ssize_int (0);
3647 }
3648 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3649 &offset_iv, false))
3650 {
3651 if (dump_enabled_p ())
3652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3653 "evolution of offset is not affine.\n");
3654 return false;
3655 }
3656
3657 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3658 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3659 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3660 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3661 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3662
3663 outer_step = size_binop (PLUS_EXPR,
3664 fold_convert (ssizetype, base_iv.step),
3665 fold_convert (ssizetype, offset_iv.step));
3666
3667 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3668 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3669 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3670 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3671 STMT_VINFO_DR_OFFSET (stmt_info) =
3672 fold_convert (ssizetype, offset_iv.base);
3673 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3674 size_int (highest_pow2_factor (offset_iv.base));
3675
3676 if (dump_enabled_p ())
3677 {
3678 dump_printf_loc (MSG_NOTE, vect_location,
3679 "\touter base_address: ");
3680 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3681 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3682 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3683 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3684 STMT_VINFO_DR_OFFSET (stmt_info));
3685 dump_printf (MSG_NOTE,
3686 "\n\touter constant offset from base address: ");
3687 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3688 STMT_VINFO_DR_INIT (stmt_info));
3689 dump_printf (MSG_NOTE, "\n\touter step: ");
3690 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3691 STMT_VINFO_DR_STEP (stmt_info));
3692 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3693 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3694 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3695 dump_printf (MSG_NOTE, "\n");
3696 }
3697 }
3698
3699 if (STMT_VINFO_DATA_REF (stmt_info))
3700 {
3701 if (dump_enabled_p ())
3702 {
3703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3704 "not vectorized: more than one data ref "
3705 "in stmt: ");
3706 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3707 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3708 }
3709
3710 if (is_a <bb_vec_info> (vinfo))
3711 break;
3712
3713 if (gatherscatter != SG_NONE || simd_lane_access)
3714 free_data_ref (dr);
3715 return false;
3716 }
3717
3718 STMT_VINFO_DATA_REF (stmt_info) = dr;
3719 if (simd_lane_access)
3720 {
3721 STMT_VINFO_SIMD_LANE_ACCESS_P (stmt_info) = true;
3722 free_data_ref (datarefs[i]);
3723 datarefs[i] = dr;
3724 }
3725
3726 /* Set vectype for STMT. */
3727 scalar_type = TREE_TYPE (DR_REF (dr));
3728 STMT_VINFO_VECTYPE (stmt_info)
3729 = get_vectype_for_scalar_type (scalar_type);
3730 if (!STMT_VINFO_VECTYPE (stmt_info))
3731 {
3732 if (dump_enabled_p ())
3733 {
3734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3735 "not vectorized: no vectype for stmt: ");
3736 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3737 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3738 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3739 scalar_type);
3740 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3741 }
3742
3743 if (is_a <bb_vec_info> (vinfo))
3744 {
3745 /* No vector type is fine, the ref can still participate
3746 in dependence analysis, we just can't vectorize it. */
3747 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3748 continue;
3749 }
3750
3751 if (gatherscatter != SG_NONE || simd_lane_access)
3752 {
3753 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3754 if (gatherscatter != SG_NONE)
3755 free_data_ref (dr);
3756 }
3757 return false;
3758 }
3759 else
3760 {
3761 if (dump_enabled_p ())
3762 {
3763 dump_printf_loc (MSG_NOTE, vect_location,
3764 "got vectype for stmt: ");
3765 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3766 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3767 STMT_VINFO_VECTYPE (stmt_info));
3768 dump_printf (MSG_NOTE, "\n");
3769 }
3770 }
3771
3772 /* Adjust the minimal vectorization factor according to the
3773 vector type. */
3774 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3775 if (vf > *min_vf)
3776 *min_vf = vf;
3777
3778 if (gatherscatter != SG_NONE)
3779 {
3780 tree off;
3781 if (!vect_check_gather_scatter (stmt, as_a <loop_vec_info> (vinfo),
3782 NULL, &off, NULL)
3783 || get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3784 {
3785 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3786 free_data_ref (dr);
3787 if (dump_enabled_p ())
3788 {
3789 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3790 (gatherscatter == GATHER) ?
3791 "not vectorized: not suitable for gather "
3792 "load " :
3793 "not vectorized: not suitable for scatter "
3794 "store ");
3795 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3796 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3797 }
3798 return false;
3799 }
3800
3801 datarefs[i] = dr;
3802 STMT_VINFO_GATHER_SCATTER_P (stmt_info) = gatherscatter;
3803 }
3804
3805 else if (is_a <loop_vec_info> (vinfo)
3806 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3807 {
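/* DR_STEP is not a compile-time constant, e.g. a[i * n] where n is
   loop-invariant but unknown at compile time.  Such an access cannot
   use a constant vector step; it is recorded as strided and handled
   later, except in an inner-loop of the loop being vectorized, where
   we simply give up.  */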
3808 if (nested_in_vect_loop_p (loop, stmt))
3809 {
3810 if (dump_enabled_p ())
3811 {
3812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3813 "not vectorized: not suitable for strided "
3814 "load ");
3815 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3816 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3817 }
3818 return false;
3819 }
3820 STMT_VINFO_STRIDED_P (stmt_info) = true;
3821 }
3822 }
3823
3824 /* If we stopped analysis at the first dataref we could not analyze
3825 when trying to vectorize a basic-block, mark the rest of the datarefs
3826 as not vectorizable and truncate the vector of datarefs. That
3827 avoids wasting time analyzing their dependences. */
3828 if (i != datarefs.length ())
3829 {
3830 gcc_assert (is_a <bb_vec_info> (vinfo));
3831 for (unsigned j = i; j < datarefs.length (); ++j)
3832 {
3833 data_reference_p dr = datarefs[j];
3834 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3835 free_data_ref (dr);
3836 }
3837 datarefs.truncate (i);
3838 }
3839
3840 return true;
3841 }
3842
3843
3844 /* Function vect_get_new_vect_var.
3845
3846 Returns a name for a new variable. The current naming scheme uses the
3847 prefix "vect", "stmp" or "vectp" (depending on the value of VAR_KIND)
3848 for vectorizer-generated variables, and appends "_" and NAME to it if
3849 provided. */
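/* For example (illustrative): with NAME == "a",
   vect_get_new_vect_var (vectype, vect_simple_var, "a") creates a
   variable named like "vect_a", while vect_pointer_var would yield
   "vectp_a" and vect_scalar_var "stmp_a".  */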
3850
3851 tree
3852 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3853 {
3854 const char *prefix;
3855 tree new_vect_var;
3856
3857 switch (var_kind)
3858 {
3859 case vect_simple_var:
3860 prefix = "vect";
3861 break;
3862 case vect_scalar_var:
3863 prefix = "stmp";
3864 break;
3865 case vect_pointer_var:
3866 prefix = "vectp";
3867 break;
3868 default:
3869 gcc_unreachable ();
3870 }
3871
3872 if (name)
3873 {
3874 char* tmp = concat (prefix, "_", name, NULL);
3875 new_vect_var = create_tmp_reg (type, tmp);
3876 free (tmp);
3877 }
3878 else
3879 new_vect_var = create_tmp_reg (type, prefix);
3880
3881 return new_vect_var;
3882 }
3883
3884 /* Like vect_get_new_vect_var but return an SSA name. */
3885
3886 tree
3887 vect_get_new_ssa_name (tree type, enum vect_var_kind var_kind, const char *name)
3888 {
3889 const char *prefix;
3890 tree new_vect_var;
3891
3892 switch (var_kind)
3893 {
3894 case vect_simple_var:
3895 prefix = "vect";
3896 break;
3897 case vect_scalar_var:
3898 prefix = "stmp";
3899 break;
3900 case vect_pointer_var:
3901 prefix = "vectp";
3902 break;
3903 default:
3904 gcc_unreachable ();
3905 }
3906
3907 if (name)
3908 {
3909 char* tmp = concat (prefix, "_", name, NULL);
3910 new_vect_var = make_temp_ssa_name (type, NULL, tmp);
3911 free (tmp);
3912 }
3913 else
3914 new_vect_var = make_temp_ssa_name (type, NULL, prefix);
3915
3916 return new_vect_var;
3917 }
3918
3919 /* Duplicate ptr info and set alignment/misalignment on NAME from DR. */
3920
3921 static void
3922 vect_duplicate_ssa_name_ptr_info (tree name, data_reference *dr,
3923 stmt_vec_info stmt_info)
3924 {
3925 duplicate_ssa_name_ptr_info (name, DR_PTR_INFO (dr));
3926 unsigned int align = TYPE_ALIGN_UNIT (STMT_VINFO_VECTYPE (stmt_info));
3927 int misalign = DR_MISALIGNMENT (dr);
3928 if (misalign == -1)
3929 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (name));
3930 else
3931 set_ptr_info_alignment (SSA_NAME_PTR_INFO (name), align, misalign);
3932 }
3933
3934 /* Function vect_create_addr_base_for_vector_ref.
3935
3936 Create an expression that computes the address of the first memory location
3937 that will be accessed for a data reference.
3938
3939 Input:
3940 STMT: The statement containing the data reference.
3941 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3942 OFFSET: Optional. If supplied, it is added to the initial address.
3943 LOOP: Specify relative to which loop-nest should the address be computed.
3944 For example, when the dataref is in an inner-loop nested in an
3945 outer-loop that is now being vectorized, LOOP can be either the
3946 outer-loop, or the inner-loop. The first memory location accessed
3947 by the following dataref ('in' points to short):
3948
3949 for (i=0; i<N; i++)
3950 for (j=0; j<M; j++)
3951 s += in[i+j]
3952
3953 is as follows:
3954 if LOOP=i_loop: &in (relative to i_loop)
3955 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3956 BYTE_OFFSET: Optional, defaulted to NULL. If supplied, it is added to the
3957 initial address. Unlike OFFSET, which is the number of elements to
3958 be added, BYTE_OFFSET is measured in bytes.
3959
3960 Output:
3961 1. Return an SSA_NAME whose value is the address of the memory location of
3962 the first vector of the data reference.
3963 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3964 these statement(s) which define the returned SSA_NAME.
3965
3966 FORNOW: We are only handling array accesses with step 1. */
3967
3968 tree
3969 vect_create_addr_base_for_vector_ref (gimple *stmt,
3970 gimple_seq *new_stmt_list,
3971 tree offset,
3972 struct loop *loop,
3973 tree byte_offset)
3974 {
3975 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3976 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3977 tree data_ref_base;
3978 const char *base_name;
3979 tree addr_base;
3980 tree dest;
3981 gimple_seq seq = NULL;
3982 tree base_offset;
3983 tree init;
3984 tree vect_ptr_type;
3985 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3986 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3987
3988 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3989 {
3990 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3991
3992 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3993
3994 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3995 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3996 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3997 }
3998 else
3999 {
4000 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
4001 base_offset = unshare_expr (DR_OFFSET (dr));
4002 init = unshare_expr (DR_INIT (dr));
4003 }
4004
4005 if (loop_vinfo)
4006 base_name = get_name (data_ref_base);
4007 else
4008 {
4009 base_offset = ssize_int (0);
4010 init = ssize_int (0);
4011 base_name = get_name (DR_REF (dr));
4012 }
4013
4014 /* Create base_offset */
4015 base_offset = size_binop (PLUS_EXPR,
4016 fold_convert (sizetype, base_offset),
4017 fold_convert (sizetype, init));
4018
4019 if (offset)
4020 {
4021 offset = fold_build2 (MULT_EXPR, sizetype,
4022 fold_convert (sizetype, offset), step);
4023 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4024 base_offset, offset);
4025 }
4026 if (byte_offset)
4027 {
4028 byte_offset = fold_convert (sizetype, byte_offset);
4029 base_offset = fold_build2 (PLUS_EXPR, sizetype,
4030 base_offset, byte_offset);
4031 }
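/* At this point, in the loop case (illustrative summary):
     base_offset = DR_OFFSET + DR_INIT
		   + OFFSET * sizeof (element)   (if OFFSET was given)
		   + BYTE_OFFSET                 (if BYTE_OFFSET was given)
   and the address built below is DR_BASE_ADDRESS + base_offset.  */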
4032
4033 /* base + base_offset */
4034 if (loop_vinfo)
4035 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
4036 else
4037 {
4038 addr_base = build1 (ADDR_EXPR,
4039 build_pointer_type (TREE_TYPE (DR_REF (dr))),
4040 unshare_expr (DR_REF (dr)));
4041 }
4042
4043 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
4044 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
4045 addr_base = force_gimple_operand (addr_base, &seq, true, dest);
4046 gimple_seq_add_seq (new_stmt_list, seq);
4047
4048 if (DR_PTR_INFO (dr)
4049 && TREE_CODE (addr_base) == SSA_NAME
4050 && !SSA_NAME_PTR_INFO (addr_base))
4051 {
4052 vect_duplicate_ssa_name_ptr_info (addr_base, dr, stmt_info);
4053 if (offset || byte_offset)
4054 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
4055 }
4056
4057 if (dump_enabled_p ())
4058 {
4059 dump_printf_loc (MSG_NOTE, vect_location, "created ");
4060 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
4061 dump_printf (MSG_NOTE, "\n");
4062 }
4063
4064 return addr_base;
4065 }
4066
4067
4068 /* Function vect_create_data_ref_ptr.
4069
4070 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
4071 location accessed in the loop by STMT, along with the def-use update
4072 chain to appropriately advance the pointer through the loop iterations.
4073 Also set aliasing information for the pointer. This pointer is used by
4074 the callers to this function to create a memory reference expression for
4075 vector load/store access.
4076
4077 Input:
4078 1. STMT: a stmt that references memory. Expected to be of the form
4079 GIMPLE_ASSIGN <name, data-ref> or
4080 GIMPLE_ASSIGN <data-ref, name>.
4081 2. AGGR_TYPE: the type of the reference, which should be either a vector
4082 or an array.
4083 3. AT_LOOP: the loop where the vector memref is to be created.
4084 4. OFFSET (optional): an offset to be added to the initial address accessed
4085 by the data-ref in STMT.
4086 5. BSI: location where the new stmts are to be placed if there is no loop.
4087 6. ONLY_INIT: indicates whether ap is to be updated in the loop, or remain
4088 pointing to the initial address.
4089 7. BYTE_OFFSET (optional, defaults to NULL): a byte offset to be added
4090 to the initial address accessed by the data-ref in STMT. This is
4091 similar to OFFSET, but OFFSET is counted in elements, while BYTE_OFFSET
4092 in bytes.
4093
4094 Output:
4095 1. Declare a new ptr to vector_type, and have it point to the base of the
4096 data reference (initial address accessed by the data reference).
4097 For example, for vector of type V8HI, the following code is generated:
4098
4099 v8hi *ap;
4100 ap = (v8hi *)initial_address;
4101
4102 if OFFSET is not supplied:
4103 initial_address = &a[init];
4104 if OFFSET is supplied:
4105 initial_address = &a[init + OFFSET];
4106 if BYTE_OFFSET is supplied:
4107 initial_address = &a[init] + BYTE_OFFSET;
4108
4109 Return the initial_address in INITIAL_ADDRESS.
4110
4111 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
4112 update the pointer in each iteration of the loop.
4113
4114 Return the increment stmt that updates the pointer in PTR_INCR.
4115
4116 3. Set INV_P to true if the access pattern of the data reference in the
4117 vectorized loop is invariant. Set it to false otherwise.
4118
4119 4. Return the pointer. */
4120
4121 tree
4122 vect_create_data_ref_ptr (gimple *stmt, tree aggr_type, struct loop *at_loop,
4123 tree offset, tree *initial_address,
4124 gimple_stmt_iterator *gsi, gimple **ptr_incr,
4125 bool only_init, bool *inv_p, tree byte_offset)
4126 {
4127 const char *base_name;
4128 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4129 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4130 struct loop *loop = NULL;
4131 bool nested_in_vect_loop = false;
4132 struct loop *containing_loop = NULL;
4133 tree aggr_ptr_type;
4134 tree aggr_ptr;
4135 tree new_temp;
4136 gimple_seq new_stmt_list = NULL;
4137 edge pe = NULL;
4138 basic_block new_bb;
4139 tree aggr_ptr_init;
4140 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4141 tree aptr;
4142 gimple_stmt_iterator incr_gsi;
4143 bool insert_after;
4144 tree indx_before_incr, indx_after_incr;
4145 gimple *incr;
4146 tree step;
4147 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
4148
4149 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
4150 || TREE_CODE (aggr_type) == VECTOR_TYPE);
4151
4152 if (loop_vinfo)
4153 {
4154 loop = LOOP_VINFO_LOOP (loop_vinfo);
4155 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4156 containing_loop = (gimple_bb (stmt))->loop_father;
4157 pe = loop_preheader_edge (loop);
4158 }
4159 else
4160 {
4161 gcc_assert (bb_vinfo);
4162 only_init = true;
4163 *ptr_incr = NULL;
4164 }
4165
4166 /* Check the step (evolution) of the load in LOOP, and record
4167 whether it's invariant. */
4168 if (nested_in_vect_loop)
4169 step = STMT_VINFO_DR_STEP (stmt_info);
4170 else
4171 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
4172
4173 if (integer_zerop (step))
4174 *inv_p = true;
4175 else
4176 *inv_p = false;
4177
4178 /* Create an expression for the first address accessed by this load
4179 in LOOP. */
4180 base_name = get_name (DR_BASE_ADDRESS (dr));
4181
4182 if (dump_enabled_p ())
4183 {
4184 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
4185 dump_printf_loc (MSG_NOTE, vect_location,
4186 "create %s-pointer variable to type: ",
4187 get_tree_code_name (TREE_CODE (aggr_type)));
4188 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
4189 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
4190 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
4191 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
4192 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
4193 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
4194 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
4195 else
4196 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
4197 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
4198 dump_printf (MSG_NOTE, "\n");
4199 }
4200
4201 /* (1) Create the new aggregate-pointer variable.
4202 Vector and array types inherit the alias set of their component
4203 type by default so we need to use a ref-all pointer if the data
4204 reference does not conflict with the created aggregated data
4205 reference because it is not addressable. */
4206 bool need_ref_all = false;
4207 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4208 get_alias_set (DR_REF (dr))))
4209 need_ref_all = true;
4210 /* Likewise for any of the data references in the stmt group. */
4211 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
4212 {
4213 gimple *orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
4214 do
4215 {
4216 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
4217 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
4218 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
4219 get_alias_set (DR_REF (sdr))))
4220 {
4221 need_ref_all = true;
4222 break;
4223 }
4224 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
4225 }
4226 while (orig_stmt);
4227 }
4228 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
4229 need_ref_all);
4230 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
4231
4232
4233 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
4234 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
4235 def-use update cycles for the pointer: one relative to the outer-loop
4236 (LOOP), which is what steps (2) and (3) below do. The other is relative
4237 to the inner-loop (which is the inner-most loop containing the dataref),
4238 and this is done by step (4) below.
4239
4240 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
4241 inner-most loop, and so steps (2),(3) work the same, and step (4) is
4242 redundant. Steps (2),(3) create the following:
4243
4244 vp0 = &base_addr;
4245 LOOP: vp1 = phi(vp0,vp2)
4246 ...
4247 ...
4248 vp2 = vp1 + step
4249 goto LOOP
4250
4251 If there is an inner-loop nested in LOOP, then step (4) will also be
4252 applied, and an additional update in the inner-loop will be created:
4253
4254 vp0 = &base_addr;
4255 LOOP: vp1 = phi(vp0,vp2)
4256 ...
4257 inner: vp3 = phi(vp1,vp4)
4258 vp4 = vp3 + inner_step
4259 if () goto inner
4260 ...
4261 vp2 = vp1 + step
4262 if () goto LOOP */
4263
4264 /* (2) Calculate the initial address of the aggregate-pointer, and set
4265 the aggregate-pointer to point to it before the loop. */
4266
4267 /* Create &(base[init_val+offset]) + byte_offset in the loop preheader. */
4268
4269 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
4270 offset, loop, byte_offset);
4271 if (new_stmt_list)
4272 {
4273 if (pe)
4274 {
4275 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
4276 gcc_assert (!new_bb);
4277 }
4278 else
4279 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
4280 }
4281
4282 *initial_address = new_temp;
4283 aggr_ptr_init = new_temp;
4284
4285 /* (3) Handle the updating of the aggregate-pointer inside the loop.
4286 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
4287 inner-loop nested in LOOP (during outer-loop vectorization). */
4288
4289 /* No update in loop is required. */
4290 if (only_init && (!loop_vinfo || at_loop == loop))
4291 aptr = aggr_ptr_init;
4292 else
4293 {
4294 /* The step of the aggregate pointer is the type size. */
4295 tree iv_step = TYPE_SIZE_UNIT (aggr_type);
4296 /* One exception to the above is when the scalar step of the load in
4297 LOOP is zero. In this case the step here is also zero. */
4298 if (*inv_p)
4299 iv_step = size_zero_node;
4300 else if (tree_int_cst_sgn (step) == -1)
4301 iv_step = fold_build1 (NEGATE_EXPR, TREE_TYPE (iv_step), iv_step);
4302
4303 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4304
4305 create_iv (aggr_ptr_init,
4306 fold_convert (aggr_ptr_type, iv_step),
4307 aggr_ptr, loop, &incr_gsi, insert_after,
4308 &indx_before_incr, &indx_after_incr);
4309 incr = gsi_stmt (incr_gsi);
4310 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4311
4312 /* Copy the points-to information if it exists. */
4313 if (DR_PTR_INFO (dr))
4314 {
4315 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4316 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4317 }
4318 if (ptr_incr)
4319 *ptr_incr = incr;
4320
4321 aptr = indx_before_incr;
4322 }
4323
4324 if (!nested_in_vect_loop || only_init)
4325 return aptr;
4326
4327
4328 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
4329 nested in LOOP, if it exists. */
4330
4331 gcc_assert (nested_in_vect_loop);
4332 if (!only_init)
4333 {
4334 standard_iv_increment_position (containing_loop, &incr_gsi,
4335 &insert_after);
4336 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
4337 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
4338 &indx_after_incr);
4339 incr = gsi_stmt (incr_gsi);
4340 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo));
4341
4342 /* Copy the points-to information if it exists. */
4343 if (DR_PTR_INFO (dr))
4344 {
4345 vect_duplicate_ssa_name_ptr_info (indx_before_incr, dr, stmt_info);
4346 vect_duplicate_ssa_name_ptr_info (indx_after_incr, dr, stmt_info);
4347 }
4348 if (ptr_incr)
4349 *ptr_incr = incr;
4350
4351 return indx_before_incr;
4352 }
4353 else
4354 gcc_unreachable ();
4355 }
4356
4357
4358 /* Function bump_vector_ptr
4359
4360 Increment a pointer (to a vector type) by vector-size. If requested,
4361 i.e. if PTR_INCR is given, then also connect the new increment stmt
4362 to the existing def-use update-chain of the pointer, by modifying
4363 the PTR_INCR as illustrated below:
4364
4365 The pointer def-use update-chain before this function:
4366 DATAREF_PTR = phi (p_0, p_2)
4367 ....
4368 PTR_INCR: p_2 = DATAREF_PTR + step
4369
4370 The pointer def-use update-chain after this function:
4371 DATAREF_PTR = phi (p_0, p_2)
4372 ....
4373 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4374 ....
4375 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4376
4377 Input:
4378 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4379 in the loop.
4380 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4381 the loop. The increment amount across iterations is expected
4382 to be vector_size.
4383 BSI - location where the new update stmt is to be placed.
4384 STMT - the original scalar memory-access stmt that is being vectorized.
4385 BUMP - optional. The offset by which to bump the pointer. If not given,
4386 the offset is assumed to be vector_size.
4387
4388 Output: Return NEW_DATAREF_PTR as illustrated above.
4389
4390 */
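/* For example (illustrative): for a V8HI dataref with no explicit BUMP,
   the update amount is TYPE_SIZE_UNIT (V8HI) == 16 bytes, so the stmt
   generated below is simply

     NEW_DATAREF_PTR = DATAREF_PTR + 16;   (a POINTER_PLUS_EXPR)  */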
4391
4392 tree
4393 bump_vector_ptr (tree dataref_ptr, gimple *ptr_incr, gimple_stmt_iterator *gsi,
4394 gimple *stmt, tree bump)
4395 {
4396 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4397 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4398 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4399 tree update = TYPE_SIZE_UNIT (vectype);
4400 gassign *incr_stmt;
4401 ssa_op_iter iter;
4402 use_operand_p use_p;
4403 tree new_dataref_ptr;
4404
4405 if (bump)
4406 update = bump;
4407
4408 if (TREE_CODE (dataref_ptr) == SSA_NAME)
4409 new_dataref_ptr = copy_ssa_name (dataref_ptr);
4410 else
4411 new_dataref_ptr = make_ssa_name (TREE_TYPE (dataref_ptr));
4412 incr_stmt = gimple_build_assign (new_dataref_ptr, POINTER_PLUS_EXPR,
4413 dataref_ptr, update);
4414 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4415
4416 /* Copy the points-to information if it exists. */
4417 if (DR_PTR_INFO (dr))
4418 {
4419 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4420 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4421 }
4422
4423 if (!ptr_incr)
4424 return new_dataref_ptr;
4425
4426 /* Update the vector-pointer's cross-iteration increment. */
4427 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4428 {
4429 tree use = USE_FROM_PTR (use_p);
4430
4431 if (use == dataref_ptr)
4432 SET_USE (use_p, new_dataref_ptr);
4433 else
4434 gcc_assert (tree_int_cst_compare (use, update) == 0);
4435 }
4436
4437 return new_dataref_ptr;
4438 }
4439
4440
4441 /* Function vect_create_destination_var.
4442
4443 Create a new temporary of type VECTYPE. */
4444
4445 tree
4446 vect_create_destination_var (tree scalar_dest, tree vectype)
4447 {
4448 tree vec_dest;
4449 const char *name;
4450 char *new_name;
4451 tree type;
4452 enum vect_var_kind kind;
4453
4454 kind = vectype ? vect_simple_var : vect_scalar_var;
4455 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4456
4457 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4458
4459 name = get_name (scalar_dest);
4460 if (name)
4461 new_name = xasprintf ("%s_%u", name, SSA_NAME_VERSION (scalar_dest));
4462 else
4463 new_name = xasprintf ("_%u", SSA_NAME_VERSION (scalar_dest));
4464 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4465 free (new_name);
4466
4467 return vec_dest;
4468 }
4469
4470 /* Function vect_grouped_store_supported.
4471
4472 Returns TRUE if interleave high and interleave low permutations
4473 are supported, and FALSE otherwise. */
4474
4475 bool
4476 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4477 {
4478 machine_mode mode = TYPE_MODE (vectype);
4479
4480 /* vect_permute_store_chain requires the group size to be equal to 3 or
4481 be a power of two. */
4482 if (count != 3 && exact_log2 (count) == -1)
4483 {
4484 if (dump_enabled_p ())
4485 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4486 "the size of the group of accesses"
4487 " is not a power of 2 or not eqaul to 3\n");
4488 return false;
4489 }
4490
4491 /* Check that the permutation is supported. */
4492 if (VECTOR_MODE_P (mode))
4493 {
4494 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4495 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4496
4497 if (count == 3)
4498 {
4499 unsigned int j0 = 0, j1 = 0, j2 = 0;
4500 unsigned int i, j;
4501
4502 for (j = 0; j < 3; j++)
4503 {
4504 int nelt0 = ((3 - j) * nelt) % 3;
4505 int nelt1 = ((3 - j) * nelt + 1) % 3;
4506 int nelt2 = ((3 - j) * nelt + 2) % 3;
4507 for (i = 0; i < nelt; i++)
4508 {
4509 if (3 * i + nelt0 < nelt)
4510 sel[3 * i + nelt0] = j0++;
4511 if (3 * i + nelt1 < nelt)
4512 sel[3 * i + nelt1] = nelt + j1++;
4513 if (3 * i + nelt2 < nelt)
4514 sel[3 * i + nelt2] = 0;
4515 }
4516 if (!can_vec_perm_p (mode, false, sel))
4517 {
4518 if (dump_enabled_p ())
4519 dump_printf (MSG_MISSED_OPTIMIZATION,
4520 "permutaion op not supported by target.\n");
4521 return false;
4522 }
4523
4524 for (i = 0; i < nelt; i++)
4525 {
4526 if (3 * i + nelt0 < nelt)
4527 sel[3 * i + nelt0] = 3 * i + nelt0;
4528 if (3 * i + nelt1 < nelt)
4529 sel[3 * i + nelt1] = 3 * i + nelt1;
4530 if (3 * i + nelt2 < nelt)
4531 sel[3 * i + nelt2] = nelt + j2++;
4532 }
4533 if (!can_vec_perm_p (mode, false, sel))
4534 {
4535 if (dump_enabled_p ())
4536 dump_printf (MSG_MISSED_OPTIMIZATION,
4537 "permutaion op not supported by target.\n");
4538 return false;
4539 }
4540 }
4541 return true;
4542 }
4543 else
4544 {
4545 /* If length is not equal to 3 then only a power of 2 is supported. */
4546 gcc_assert (exact_log2 (count) != -1);
4547
4548 for (i = 0; i < nelt / 2; i++)
4549 {
4550 sel[i * 2] = i;
4551 sel[i * 2 + 1] = i + nelt;
4552 }
4553 if (can_vec_perm_p (mode, false, sel))
4554 {
4555 for (i = 0; i < nelt; i++)
4556 sel[i] += nelt / 2;
4557 if (can_vec_perm_p (mode, false, sel))
4558 return true;
4559 }
4560 }
4561 }
4562
4563 if (dump_enabled_p ())
4564 dump_printf (MSG_MISSED_OPTIMIZATION,
4565 "permutaion op not supported by target.\n");
4566 return false;
4567 }
4568
4569
4570 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4571 type VECTYPE. */
4572
4573 bool
4574 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4575 {
4576 return vect_lanes_optab_supported_p ("vec_store_lanes",
4577 vec_store_lanes_optab,
4578 vectype, count);
4579 }
4580
4581
4582 /* Function vect_permute_store_chain.
4583
4584 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4585 a power of 2 or equal to 3, generate interleave_high/low stmts to reorder
4586 the data correctly for the stores. Return the final references for stores
4587 in RESULT_CHAIN.
4588
4589 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4590 The input is 4 vectors each containing 8 elements. We assign a number to
4591 each element, the input sequence is:
4592
4593 1st vec: 0 1 2 3 4 5 6 7
4594 2nd vec: 8 9 10 11 12 13 14 15
4595 3rd vec: 16 17 18 19 20 21 22 23
4596 4th vec: 24 25 26 27 28 29 30 31
4597
4598 The output sequence should be:
4599
4600 1st vec: 0 8 16 24 1 9 17 25
4601 2nd vec: 2 10 18 26 3 11 19 27
4602 3rd vec: 4 12 20 28 5 13 21 29
4603 4th vec: 6 14 22 30 7 15 23 31
4604
4605 i.e., we interleave the contents of the four vectors in their order.
4606
4607 We use interleave_high/low instructions to create such output. The input of
4608 each interleave_high/low operation is two vectors:
4609 1st vec 2nd vec
4610 0 1 2 3 4 5 6 7
4611 the even elements of the result vector are obtained left-to-right from the
4612 high/low elements of the first vector. The odd elements of the result are
4613 obtained left-to-right from the high/low elements of the second vector.
4614 The output of interleave_high will be: 0 4 1 5
4615 and of interleave_low: 2 6 3 7
4616
4617
4618 The permutation is done in log LENGTH stages. In each stage interleave_high
4619 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4620 where the first argument is taken from the first half of DR_CHAIN and the
4621 second argument from its second half.
4622 In our example,
4623
4624 I1: interleave_high (1st vec, 3rd vec)
4625 I2: interleave_low (1st vec, 3rd vec)
4626 I3: interleave_high (2nd vec, 4th vec)
4627 I4: interleave_low (2nd vec, 4th vec)
4628
4629 The output for the first stage is:
4630
4631 I1: 0 16 1 17 2 18 3 19
4632 I2: 4 20 5 21 6 22 7 23
4633 I3: 8 24 9 25 10 26 11 27
4634 I4: 12 28 13 29 14 30 15 31
4635
4636 The output of the second stage, i.e. the final result is:
4637
4638 I1: 0 8 16 24 1 9 17 25
4639 I2: 2 10 18 26 3 11 19 27
4640 I3: 4 12 20 28 5 13 21 29
4641 I4: 6 14 22 30 7 15 23 31. */
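/* For example (illustrative), for V8HI (nelt == 8) the two selectors
   built below for the power-of-2 case are

     perm_mask_high = { 0, 8, 1, 9, 2, 10, 3, 11 }
     perm_mask_low  = { 4, 12, 5, 13, 6, 14, 7, 15 }

   i.e. exactly the interleave-high/low operations described above.  */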
4642
4643 void
4644 vect_permute_store_chain (vec<tree> dr_chain,
4645 unsigned int length,
4646 gimple *stmt,
4647 gimple_stmt_iterator *gsi,
4648 vec<tree> *result_chain)
4649 {
4650 tree vect1, vect2, high, low;
4651 gimple *perm_stmt;
4652 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4653 tree perm_mask_low, perm_mask_high;
4654 tree data_ref;
4655 tree perm3_mask_low, perm3_mask_high;
4656 unsigned int i, n, log_length = exact_log2 (length);
4657 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4658 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4659
4660 result_chain->quick_grow (length);
4661 memcpy (result_chain->address (), dr_chain.address (),
4662 length * sizeof (tree));
4663
4664 if (length == 3)
4665 {
4666 unsigned int j0 = 0, j1 = 0, j2 = 0;
4667
4668 for (j = 0; j < 3; j++)
4669 {
4670 int nelt0 = ((3 - j) * nelt) % 3;
4671 int nelt1 = ((3 - j) * nelt + 1) % 3;
4672 int nelt2 = ((3 - j) * nelt + 2) % 3;
4673
4674 for (i = 0; i < nelt; i++)
4675 {
4676 if (3 * i + nelt0 < nelt)
4677 sel[3 * i + nelt0] = j0++;
4678 if (3 * i + nelt1 < nelt)
4679 sel[3 * i + nelt1] = nelt + j1++;
4680 if (3 * i + nelt2 < nelt)
4681 sel[3 * i + nelt2] = 0;
4682 }
4683 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4684
4685 for (i = 0; i < nelt; i++)
4686 {
4687 if (3 * i + nelt0 < nelt)
4688 sel[3 * i + nelt0] = 3 * i + nelt0;
4689 if (3 * i + nelt1 < nelt)
4690 sel[3 * i + nelt1] = 3 * i + nelt1;
4691 if (3 * i + nelt2 < nelt)
4692 sel[3 * i + nelt2] = nelt + j2++;
4693 }
4694 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4695
4696 vect1 = dr_chain[0];
4697 vect2 = dr_chain[1];
4698
4699 /* Create interleaving stmt:
4700 low = VEC_PERM_EXPR <vect1, vect2,
4701 {j, nelt, *, j + 1, nelt + j + 1, *,
4702 j + 2, nelt + j + 2, *, ...}> */
4703 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
4704 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4705 vect2, perm3_mask_low);
4706 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4707
4708 vect1 = data_ref;
4709 vect2 = dr_chain[2];
4710 /* Create interleaving stmt:
4711 low = VEC_PERM_EXPR <vect1, vect2,
4712 {0, 1, nelt + j, 3, 4, nelt + j + 1,
4713 6, 7, nelt + j + 2, ...}> */
4714 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
4715 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect1,
4716 vect2, perm3_mask_high);
4717 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4718 (*result_chain)[j] = data_ref;
4719 }
4720 }
4721 else
4722 {
4723 /* If length is not equal to 3 then only a power of 2 is supported. */
4724 gcc_assert (exact_log2 (length) != -1);
4725
4726 for (i = 0, n = nelt / 2; i < n; i++)
4727 {
4728 sel[i * 2] = i;
4729 sel[i * 2 + 1] = i + nelt;
4730 }
4731 perm_mask_high = vect_gen_perm_mask_checked (vectype, sel);
4732
4733 for (i = 0; i < nelt; i++)
4734 sel[i] += nelt / 2;
4735 perm_mask_low = vect_gen_perm_mask_checked (vectype, sel);
4736
4737 for (i = 0, n = log_length; i < n; i++)
4738 {
4739 for (j = 0; j < length/2; j++)
4740 {
4741 vect1 = dr_chain[j];
4742 vect2 = dr_chain[j+length/2];
4743
4744 /* Create interleaving stmt:
4745 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1,
4746 ...}> */
4747 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4748 perm_stmt = gimple_build_assign (high, VEC_PERM_EXPR, vect1,
4749 vect2, perm_mask_high);
4750 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4751 (*result_chain)[2*j] = high;
4752
4753 /* Create interleaving stmt:
4754 low = VEC_PERM_EXPR <vect1, vect2,
4755 {nelt/2, nelt*3/2, nelt/2+1, nelt*3/2+1,
4756 ...}> */
4757 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4758 perm_stmt = gimple_build_assign (low, VEC_PERM_EXPR, vect1,
4759 vect2, perm_mask_low);
4760 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4761 (*result_chain)[2*j+1] = low;
4762 }
4763 memcpy (dr_chain.address (), result_chain->address (),
4764 length * sizeof (tree));
4765 }
4766 }
4767 }
4768
4769 /* Function vect_setup_realignment
4770
4771 This function is called when vectorizing an unaligned load using
4772 the dr_explicit_realign[_optimized] scheme.
4773 This function generates the following code at the loop prolog:
4774
4775 p = initial_addr;
4776 x msq_init = *(floor(p)); # prolog load
4777 realignment_token = call target_builtin;
4778 loop:
4779 x msq = phi (msq_init, ---)
4780
4781 The stmts marked with x are generated only for the case of
4782 dr_explicit_realign_optimized.
4783
4784 The code above sets up a new (vector) pointer, pointing to the first
4785 location accessed by STMT, and a "floor-aligned" load using that pointer.
4786 It also generates code to compute the "realignment-token" (if the relevant
4787 target hook was defined), and creates a phi-node at the loop-header bb
4788 whose arguments are the result of the prolog-load (created by this
4789 function) and the result of a load that takes place in the loop (to be
4790 created by the caller to this function).
4791
4792 For the case of dr_explicit_realign_optimized:
4793 The caller to this function uses the phi-result (msq) to create the
4794 realignment code inside the loop, and sets up the missing phi argument,
4795 as follows:
4796 loop:
4797 msq = phi (msq_init, lsq)
4798 lsq = *(floor(p')); # load in loop
4799 result = realign_load (msq, lsq, realignment_token);
4800
4801 For the case of dr_explicit_realign:
4802 loop:
4803 msq = *(floor(p)); # load in loop
4804 p' = p + (VS-1);
4805 lsq = *(floor(p')); # load in loop
4806 result = realign_load (msq, lsq, realignment_token);
4807
4808 Input:
4809 STMT - (scalar) load stmt to be vectorized. This load accesses
4810 a memory location that may be unaligned.
4811 BSI - place where new code is to be inserted.
4812 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4813 is used.
4814
4815 Output:
4816 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4817 target hook, if defined.
4818 Return value - the result of the loop-header phi node. */
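/* For example (illustrative): with 4-element vectors, if the scalar
   address p is misaligned by 2 elements, then
     msq = *floor(p)          = { x, x, e0, e1 }
     lsq = *floor(p + VS - 1) = { e2, e3, y, y }
   and realign_load (msq, lsq, realignment_token) produces the wanted
   vector { e0, e1, e2, e3 }.  */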
4819
4820 tree
4821 vect_setup_realignment (gimple *stmt, gimple_stmt_iterator *gsi,
4822 tree *realignment_token,
4823 enum dr_alignment_support alignment_support_scheme,
4824 tree init_addr,
4825 struct loop **at_loop)
4826 {
4827 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4828 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4829 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4830 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4831 struct loop *loop = NULL;
4832 edge pe = NULL;
4833 tree scalar_dest = gimple_assign_lhs (stmt);
4834 tree vec_dest;
4835 gimple *inc;
4836 tree ptr;
4837 tree data_ref;
4838 basic_block new_bb;
4839 tree msq_init = NULL_TREE;
4840 tree new_temp;
4841 gphi *phi_stmt;
4842 tree msq = NULL_TREE;
4843 gimple_seq stmts = NULL;
4844 bool inv_p;
4845 bool compute_in_loop = false;
4846 bool nested_in_vect_loop = false;
4847 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4848 struct loop *loop_for_initial_load = NULL;
4849
4850 if (loop_vinfo)
4851 {
4852 loop = LOOP_VINFO_LOOP (loop_vinfo);
4853 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4854 }
4855
4856 gcc_assert (alignment_support_scheme == dr_explicit_realign
4857 || alignment_support_scheme == dr_explicit_realign_optimized);
4858
4859 /* We need to generate three things:
4860 1. the misalignment computation
4861 2. the extra vector load (for the optimized realignment scheme).
4862 3. the phi node for the two vectors from which the realignment is
4863 done (for the optimized realignment scheme). */
4864
4865 /* 1. Determine where to generate the misalignment computation.
4866
4867 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4868 calculation will be generated by this function, outside the loop (in the
4869 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4870 caller, inside the loop.
4871
4872 Background: If the misalignment remains fixed throughout the iterations of
4873 the loop, then both realignment schemes are applicable, and also the
4874 misalignment computation can be done outside LOOP. This is because we are
4875 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4876 are a multiple of VS (the Vector Size), and therefore the misalignment in
4877 different vectorized LOOP iterations is always the same.
4878 The problem arises only if the memory access is in an inner-loop nested
4879 inside LOOP, which is now being vectorized using outer-loop vectorization.
4880 This is the only case when the misalignment of the memory access may not
4881 remain fixed throughout the iterations of the inner-loop (as explained in
4882 detail in vect_supportable_dr_alignment). In this case, not only is the
4883 optimized realignment scheme not applicable, but also the misalignment
4884 computation (and generation of the realignment token that is passed to
4885 REALIGN_LOAD) have to be done inside the loop.
4886
4887 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4888 or not, which in turn determines if the misalignment is computed inside
4889 the inner-loop, or outside LOOP. */
4890
4891 if (init_addr != NULL_TREE || !loop_vinfo)
4892 {
4893 compute_in_loop = true;
4894 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4895 }
4896
4897
4898 /* 2. Determine where to generate the extra vector load.
4899
4900 For the optimized realignment scheme, instead of generating two vector
4901 loads in each iteration, we generate a single extra vector load in the
4902 preheader of the loop, and in each iteration reuse the result of the
4903 vector load from the previous iteration. In case the memory access is in
4904 an inner-loop nested inside LOOP, which is now being vectorized using
4905 outer-loop vectorization, we need to determine whether this initial vector
4906 load should be generated at the preheader of the inner-loop, or can be
4907 generated at the preheader of LOOP. If the memory access has no evolution
4908 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4909 to be generated inside LOOP (in the preheader of the inner-loop). */
4910
4911 if (nested_in_vect_loop)
4912 {
4913 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4914 bool invariant_in_outerloop =
4915 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4916 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4917 }
4918 else
4919 loop_for_initial_load = loop;
4920 if (at_loop)
4921 *at_loop = loop_for_initial_load;
4922
4923 if (loop_for_initial_load)
4924 pe = loop_preheader_edge (loop_for_initial_load);
4925
4926 /* 3. For the case of the optimized realignment, create the first vector
4927 load at the loop preheader. */
4928
4929 if (alignment_support_scheme == dr_explicit_realign_optimized)
4930 {
4931 /* Create msq_init = *(floor(p1)) in the loop preheader */
4932 gassign *new_stmt;
4933
4934 gcc_assert (!compute_in_loop);
4935 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4936 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4937 NULL_TREE, &init_addr, NULL, &inc,
4938 true, &inv_p);
4939 if (TREE_CODE (ptr) == SSA_NAME)
4940 new_temp = copy_ssa_name (ptr);
4941 else
4942 new_temp = make_ssa_name (TREE_TYPE (ptr));
4943 new_stmt = gimple_build_assign
4944 (new_temp, BIT_AND_EXPR, ptr,
4945 build_int_cst (TREE_TYPE (ptr),
4946 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
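/* new_temp is PTR with its low bits masked off, i.e. PTR rounded down
   to the vector alignment; e.g. for a 16-byte vector this computes
   PTR & -16.  */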
4947 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4948 gcc_assert (!new_bb);
4949 data_ref
4950 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4951 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4952 new_stmt = gimple_build_assign (vec_dest, data_ref);
4953 new_temp = make_ssa_name (vec_dest, new_stmt);
4954 gimple_assign_set_lhs (new_stmt, new_temp);
4955 if (pe)
4956 {
4957 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4958 gcc_assert (!new_bb);
4959 }
4960 else
4961 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4962
4963 msq_init = gimple_assign_lhs (new_stmt);
4964 }
4965
4966 /* 4. Create realignment token using a target builtin, if available.
4967 It is done either inside the containing loop, or before LOOP (as
4968 determined above). */
4969
4970 if (targetm.vectorize.builtin_mask_for_load)
4971 {
4972 gcall *new_stmt;
4973 tree builtin_decl;
4974
4975 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4976 if (!init_addr)
4977 {
4978 /* Generate the INIT_ADDR computation outside LOOP. */
4979 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4980 NULL_TREE, loop);
4981 if (loop)
4982 {
4983 pe = loop_preheader_edge (loop);
4984 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4985 gcc_assert (!new_bb);
4986 }
4987 else
4988 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4989 }
4990
4991 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4992 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4993 vec_dest =
4994 vect_create_destination_var (scalar_dest,
4995 gimple_call_return_type (new_stmt));
4996 new_temp = make_ssa_name (vec_dest, new_stmt);
4997 gimple_call_set_lhs (new_stmt, new_temp);
4998
4999 if (compute_in_loop)
5000 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5001 else
5002 {
5003 /* Generate the misalignment computation outside LOOP. */
5004 pe = loop_preheader_edge (loop);
5005 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
5006 gcc_assert (!new_bb);
5007 }
5008
5009 *realignment_token = gimple_call_lhs (new_stmt);
5010
5011 /* The result of the CALL_EXPR to this builtin is determined from
5012 the value of the parameter, and no global variables are touched,
5013 which makes the builtin a "const" function. Requiring the
5014 builtin to have the "const" attribute makes it unnecessary
5015 to call mark_call_clobbered. */
5016 gcc_assert (TREE_READONLY (builtin_decl));
5017 }
5018
5019 if (alignment_support_scheme == dr_explicit_realign)
5020 return msq;
5021
5022 gcc_assert (!compute_in_loop);
5023 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
5024
5025
5026 /* 5. Create msq = phi <msq_init, lsq> in loop */
5027
5028 pe = loop_preheader_edge (containing_loop);
5029 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5030 msq = make_ssa_name (vec_dest);
5031 phi_stmt = create_phi_node (msq, containing_loop->header);
5032 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
5033
5034 return msq;
5035 }
5036
5037
5038 /* Function vect_grouped_load_supported.
5039
5040 Returns TRUE if even and odd permutations are supported,
5041 and FALSE otherwise. */
5042
5043 bool
5044 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
5045 {
5046 machine_mode mode = TYPE_MODE (vectype);
5047
5048 /* vect_permute_load_chain requires the group size to be equal to 3 or
5049 be a power of two. */
5050 if (count != 3 && exact_log2 (count) == -1)
5051 {
5052 if (dump_enabled_p ())
5053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5054 "the size of the group of accesses"
5055 " is not a power of 2 or not equal to 3\n");
5056 return false;
5057 }
5058
5059 /* Check that the permutation is supported. */
5060 if (VECTOR_MODE_P (mode))
5061 {
5062 unsigned int i, j, nelt = GET_MODE_NUNITS (mode);
5063 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5064
5065 if (count == 3)
5066 {
5067 unsigned int k;
5068 for (k = 0; k < 3; k++)
5069 {
5070 for (i = 0; i < nelt; i++)
5071 if (3 * i + k < 2 * nelt)
5072 sel[i] = 3 * i + k;
5073 else
5074 sel[i] = 0;
5075 if (!can_vec_perm_p (mode, false, sel))
5076 {
5077 if (dump_enabled_p ())
5078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5079 "shuffle of 3 loads is not supported by"
5080 " target\n");
5081 return false;
5082 }
5083 for (i = 0, j = 0; i < nelt; i++)
5084 if (3 * i + k < 2 * nelt)
5085 sel[i] = i;
5086 else
5087 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5088 if (!can_vec_perm_p (mode, false, sel))
5089 {
5090 if (dump_enabled_p ())
5091 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5092 "shuffle of 3 loads is not supported by"
5093 " target\n");
5094 return false;
5095 }
5096 }
5097 return true;
5098 }
5099 else
5100 {
5101 /* If length is not equal to 3 then only a power of 2 is supported. */
5102 gcc_assert (exact_log2 (count) != -1);
5103 for (i = 0; i < nelt; i++)
5104 sel[i] = i * 2;
5105 if (can_vec_perm_p (mode, false, sel))
5106 {
5107 for (i = 0; i < nelt; i++)
5108 sel[i] = i * 2 + 1;
5109 if (can_vec_perm_p (mode, false, sel))
5110 return true;
5111 }
5112 }
5113 }
5114
5115 if (dump_enabled_p ())
5116 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5117 "extract even/odd not supported by target\n");
5118 return false;
5119 }
5120
5121 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
5122 type VECTYPE. */
5123
5124 bool
5125 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
5126 {
5127 return vect_lanes_optab_supported_p ("vec_load_lanes",
5128 vec_load_lanes_optab,
5129 vectype, count);
5130 }
5131
5132 /* Function vect_permute_load_chain.
5133
5134 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
5135 a power of 2 or equal to 3, generate extract_even/odd stmts to reorder
5136 the input data correctly. Return the final references for loads in
5137 RESULT_CHAIN.
5138
5139 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
5140 The input is 4 vectors each containing 8 elements. We assign a number to each
5141 element, the input sequence is:
5142
5143 1st vec: 0 1 2 3 4 5 6 7
5144 2nd vec: 8 9 10 11 12 13 14 15
5145 3rd vec: 16 17 18 19 20 21 22 23
5146 4th vec: 24 25 26 27 28 29 30 31
5147
5148 The output sequence should be:
5149
5150 1st vec: 0 4 8 12 16 20 24 28
5151 2nd vec: 1 5 9 13 17 21 25 29
5152 3rd vec: 2 6 10 14 18 22 26 30
5153 4th vec: 3 7 11 15 19 23 27 31
5154
5155 i.e., the first output vector should contain the first elements of each
5156 interleaving group, etc.
5157
5158 We use extract_even/odd instructions to create such output. The input of
5159 each extract_even/odd operation is two vectors
5160 1st vec 2nd vec
5161 0 1 2 3 4 5 6 7
5162
5163 and the output is the vector of extracted even/odd elements. The output of
5164 extract_even will be: 0 2 4 6
5165 and of extract_odd: 1 3 5 7
5166
5167
5168 The permutation is done in log LENGTH stages. In each stage extract_even
5169 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
5170 their order. In our example,
5171
5172 E1: extract_even (1st vec, 2nd vec)
5173 E2: extract_odd (1st vec, 2nd vec)
5174 E3: extract_even (3rd vec, 4th vec)
5175 E4: extract_odd (3rd vec, 4th vec)
5176
5177 The output for the first stage will be:
5178
5179 E1: 0 2 4 6 8 10 12 14
5180 E2: 1 3 5 7 9 11 13 15
5181 E3: 16 18 20 22 24 26 28 30
5182 E4: 17 19 21 23 25 27 29 31
5183
5184 In order to proceed and create the correct sequence for the next stage (or
5185 for the correct output, if the second stage is the last one, as in our
5186 example), we first put the output of extract_even operation and then the
5187 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
5188 The input for the second stage is:
5189
5190 1st vec (E1): 0 2 4 6 8 10 12 14
5191 2nd vec (E3): 16 18 20 22 24 26 28 30
5192 3rd vec (E2): 1 3 5 7 9 11 13 15
5193 4th vec (E4): 17 19 21 23 25 27 29 31
5194
5195 The output of the second stage:
5196
5197 E1: 0 4 8 12 16 20 24 28
5198 E2: 2 6 10 14 18 22 26 30
5199 E3: 1 5 9 13 17 21 25 29
5200 E4: 3 7 11 15 19 23 27 31
5201
5202 And RESULT_CHAIN after reordering:
5203
5204 1st vec (E1): 0 4 8 12 16 20 24 28
5205 2nd vec (E3): 1 5 9 13 17 21 25 29
5206 3rd vec (E2): 2 6 10 14 18 22 26 30
5207 4th vec (E4): 3 7 11 15 19 23 27 31. */
5208
5209 static void
5210 vect_permute_load_chain (vec<tree> dr_chain,
5211 unsigned int length,
5212 gimple *stmt,
5213 gimple_stmt_iterator *gsi,
5214 vec<tree> *result_chain)
5215 {
5216 tree data_ref, first_vect, second_vect;
5217 tree perm_mask_even, perm_mask_odd;
5218 tree perm3_mask_low, perm3_mask_high;
5219 gimple *perm_stmt;
5220 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5221 unsigned int i, j, log_length = exact_log2 (length);
5222 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5223 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5224
5225 result_chain->quick_grow (length);
5226 memcpy (result_chain->address (), dr_chain.address (),
5227 length * sizeof (tree));
5228
5229 if (length == 3)
5230 {
5231 unsigned int k;
5232
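/* Each iteration K builds the K-th output vector, i.e. the vector of
   elements K, 3 + K, 6 + K, ... of the interleaved input, using two
   VEC_PERM_EXPRs over the three input vectors.  */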
5233 for (k = 0; k < 3; k++)
5234 {
5235 for (i = 0; i < nelt; i++)
5236 if (3 * i + k < 2 * nelt)
5237 sel[i] = 3 * i + k;
5238 else
5239 sel[i] = 0;
5240 perm3_mask_low = vect_gen_perm_mask_checked (vectype, sel);
5241
5242 for (i = 0, j = 0; i < nelt; i++)
5243 if (3 * i + k < 2 * nelt)
5244 sel[i] = i;
5245 else
5246 sel[i] = nelt + ((nelt + k) % 3) + 3 * (j++);
5247
5248 perm3_mask_high = vect_gen_perm_mask_checked (vectype, sel);
5249
5250 first_vect = dr_chain[0];
5251 second_vect = dr_chain[1];
5252
5253 /* Create interleaving stmt (low part of):
5254 low = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5255 ...}> */
5256 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_low");
5257 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5258 second_vect, perm3_mask_low);
5259 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5260
5261 /* Create interleaving stmt (high part of):
5262 high = VEC_PERM_EXPR <first_vect, second_vect, {k, 3 + k, 6 + k,
5263 ...}> */
5264 first_vect = data_ref;
5265 second_vect = dr_chain[2];
5266 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3_high");
5267 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, first_vect,
5268 second_vect, perm3_mask_high);
5269 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5270 (*result_chain)[k] = data_ref;
5271 }
5272 }
5273 else
5274 {
5275 /* If the length is not equal to 3, then only a power of 2 is supported.  */
5276 gcc_assert (exact_log2 (length) != -1);
5277
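/* Build the extract-even mask {0, 2, 4, ...} and the extract-odd mask
   {1, 3, 5, ...} once; the same masks are used in every stage.  */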
5278 for (i = 0; i < nelt; ++i)
5279 sel[i] = i * 2;
5280 perm_mask_even = vect_gen_perm_mask_checked (vectype, sel);
5281
5282 for (i = 0; i < nelt; ++i)
5283 sel[i] = i * 2 + 1;
5284 perm_mask_odd = vect_gen_perm_mask_checked (vectype, sel);
5285
5286 for (i = 0; i < log_length; i++)
5287 {
5288 for (j = 0; j < length; j += 2)
5289 {
5290 first_vect = dr_chain[j];
5291 second_vect = dr_chain[j+1];
5292
5293 /* data_ref = permute_even (first_data_ref, second_data_ref); */
5294 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
5295 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5296 first_vect, second_vect,
5297 perm_mask_even);
5298 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5299 (*result_chain)[j/2] = data_ref;
5300
5301 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
5302 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
5303 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5304 first_vect, second_vect,
5305 perm_mask_odd);
5306 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5307 (*result_chain)[j/2+length/2] = data_ref;
5308 }
5309 memcpy (dr_chain.address (), result_chain->address (),
5310 length * sizeof (tree));
5311 }
5312 }
5313 }
5314
5315 /* Function vect_shift_permute_load_chain.
5316
5317 Given a chain of loads in DR_CHAIN of LENGTH 2 or 3, generate
5318 a sequence of stmts to reorder the input data accordingly.
5319 Return the final references for loads in RESULT_CHAIN.
5320 Return true if successful, false otherwise.
5321
5322 E.g., LENGTH is 3 and the scalar type is short, i.e., VF is 8.
5323 The input is 3 vectors each containing 8 elements. We assign a
5324 number to each element, the input sequence is:
5325
5326 1st vec: 0 1 2 3 4 5 6 7
5327 2nd vec: 8 9 10 11 12 13 14 15
5328 3rd vec: 16 17 18 19 20 21 22 23
5329
5330 The output sequence should be:
5331
5332 1st vec: 0 3 6 9 12 15 18 21
5333 2nd vec: 1 4 7 10 13 16 19 22
5334 3rd vec: 2 5 8 11 14 17 20 23
5335
5336 We use 3 shuffle instructions and 3 * 3 - 1 shifts to create such output.
5337
5338 First we shuffle all 3 vectors to get correct elements order:
5339
5340 1st vec: ( 0 3 6) ( 1 4 7) ( 2 5)
5341 2nd vec: ( 8 11 14) ( 9 12 15) (10 13)
5342 3rd vec: (16 19 22) (17 20 23) (18 21)
5343
5344 Next we concatenate and shift the vectors 3 times:
5345
5346 1st step:
5347 shift right by 6 the concatenation of:
5348 "1st vec" and "2nd vec"
5349 ( 0 3 6) ( 1 4 7) |( 2 5) _ ( 8 11 14) ( 9 12 15)| (10 13)
5350 "2nd vec" and "3rd vec"
5351 ( 8 11 14) ( 9 12 15) |(10 13) _ (16 19 22) (17 20 23)| (18 21)
5352 "3rd vec" and "1st vec"
5353 (16 19 22) (17 20 23) |(18 21) _ ( 0 3 6) ( 1 4 7)| ( 2 5)
5354 | New vectors |
5355
5356 So that now new vectors are:
5357
5358 1st vec: ( 2 5) ( 8 11 14) ( 9 12 15)
5359 2nd vec: (10 13) (16 19 22) (17 20 23)
5360 3rd vec: (18 21) ( 0 3 6) ( 1 4 7)
5361
5362 2nd step:
5363 shift right by 5 the concatenation of:
5364 "1st vec" and "3rd vec"
5365 ( 2 5) ( 8 11 14) |( 9 12 15) _ (18 21) ( 0 3 6)| ( 1 4 7)
5366 "2nd vec" and "1st vec"
5367 (10 13) (16 19 22) |(17 20 23) _ ( 2 5) ( 8 11 14)| ( 9 12 15)
5368 "3rd vec" and "2nd vec"
5369 (18 21) ( 0 3 6) |( 1 4 7) _ (10 13) (16 19 22)| (17 20 23)
5370 | New vectors |
5371
5372 So that now new vectors are:
5373
5374 1st vec: ( 9 12 15) (18 21) ( 0 3 6)
5375 2nd vec: (17 20 23) ( 2 5) ( 8 11 14)
5376 3rd vec: ( 1 4 7) (10 13) (16 19 22) READY
5377
5378 3rd step:
5379 shift right by 5 the concatenation of:
5380 "1st vec" and "1st vec"
5381 ( 9 12 15) (18 21) |( 0 3 6) _ ( 9 12 15) (18 21)| ( 0 3 6)
5382 shift right by 3 the concatenation of:
5383 "2nd vec" and "2nd vec"
5384 (17 20 23) |( 2 5) ( 8 11 14) _ (17 20 23)| ( 2 5) ( 8 11 14)
5385 | New vectors |
5386
5387 So that now all vectors are READY:
5388 1st vec: ( 0 3 6) ( 9 12 15) (18 21)
5389 2nd vec: ( 2 5) ( 8 11 14) (17 20 23)
5390 3rd vec: ( 1 4 7) (10 13) (16 19 22)
5391
5392 This algorithm is faster than the one in vect_permute_load_chain if:
5393 1. "shift of a concatenation" is faster than a general permutation.
5394 This is usually the case.
5395 2. The TARGET machine can't execute vector instructions in parallel.
5396 This is because each step of the algorithm depends on the previous one;
5397 the algorithm in vect_permute_load_chain is much more parallel.
5398
5399 The algorithm is applicable only for LOAD CHAIN LENGTH less than VF.
5400 */
5401
5402 static bool
5403 vect_shift_permute_load_chain (vec<tree> dr_chain,
5404 unsigned int length,
5405 gimple *stmt,
5406 gimple_stmt_iterator *gsi,
5407 vec<tree> *result_chain)
5408 {
5409 tree vect[3], vect_shift[3], data_ref, first_vect, second_vect;
5410 tree perm2_mask1, perm2_mask2, perm3_mask;
5411 tree select_mask, shift1_mask, shift2_mask, shift3_mask, shift4_mask;
5412 gimple *perm_stmt;
5413
5414 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
5415 unsigned int i;
5416 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
5417 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
5418 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5419 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5420
5421 result_chain->quick_grow (length);
5422 memcpy (result_chain->address (), dr_chain.address (),
5423 length * sizeof (tree));
5424
5425 if (exact_log2 (length) != -1 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 4)
5426 {
5427 unsigned int j, log_length = exact_log2 (length);
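/* Generating permutation constant to reorder a single vector so that
   its even elements come first and its odd elements second.
   For vector length 8 it is {0 2 4 6 1 3 5 7}.  */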
5428 for (i = 0; i < nelt / 2; ++i)
5429 sel[i] = i * 2;
5430 for (i = 0; i < nelt / 2; ++i)
5431 sel[nelt / 2 + i] = i * 2 + 1;
5432 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5433 {
5434 if (dump_enabled_p ())
5435 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5436 "shuffle of 2 fields structure is not \
5437 supported by target\n");
5438 return false;
5439 }
5440 perm2_mask1 = vect_gen_perm_mask_checked (vectype, sel);
5441
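/* The same reordering with the odd elements first.
   For vector length 8 it is {1 3 5 7 0 2 4 6}.  */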
5442 for (i = 0; i < nelt / 2; ++i)
5443 sel[i] = i * 2 + 1;
5444 for (i = 0; i < nelt / 2; ++i)
5445 sel[nelt / 2 + i] = i * 2;
5446 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5447 {
5448 if (dump_enabled_p ())
5449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5450 "shuffle of 2 fields structure is not \
5451 supported by target\n");
5452 return false;
5453 }
5454 perm2_mask2 = vect_gen_perm_mask_checked (vectype, sel);
5455
5456 /* Generating permutation constant to shift all elements.
5457 For vector length 8 it is {4 5 6 7 8 9 10 11}. */
5458 for (i = 0; i < nelt; i++)
5459 sel[i] = nelt / 2 + i;
5460 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5461 {
5462 if (dump_enabled_p ())
5463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5464 "shift permutation is not supported by target\n");
5465 return false;
5466 }
5467 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5468
5469 /* Generating permutation constant to select vector from 2.
5470 For vector length 8 it is {0 1 2 3 12 13 14 15}. */
5471 for (i = 0; i < nelt / 2; i++)
5472 sel[i] = i;
5473 for (i = nelt / 2; i < nelt; i++)
5474 sel[i] = nelt + i;
5475 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5476 {
5477 if (dump_enabled_p ())
5478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5479 "select is not supported by target\n");
5480 return false;
5481 }
5482 select_mask = vect_gen_perm_mask_checked (vectype, sel);
5483
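/* For every pair of input vectors, the shift below collects their odd
   elements and the select collects their even elements.  */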
5484 for (i = 0; i < log_length; i++)
5485 {
5486 for (j = 0; j < length; j += 2)
5487 {
5488 first_vect = dr_chain[j];
5489 second_vect = dr_chain[j + 1];
5490
5491 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5492 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5493 first_vect, first_vect,
5494 perm2_mask1);
5495 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5496 vect[0] = data_ref;
5497
5498 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle2");
5499 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5500 second_vect, second_vect,
5501 perm2_mask2);
5502 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5503 vect[1] = data_ref;
5504
5505 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift");
5506 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5507 vect[0], vect[1], shift1_mask);
5508 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5509 (*result_chain)[j/2 + length/2] = data_ref;
5510
5511 data_ref = make_temp_ssa_name (vectype, NULL, "vect_select");
5512 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5513 vect[0], vect[1], select_mask);
5514 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5515 (*result_chain)[j/2] = data_ref;
5516 }
5517 memcpy (dr_chain.address (), result_chain->address (),
5518 length * sizeof (tree));
5519 }
5520 return true;
5521 }
5522 if (length == 3 && LOOP_VINFO_VECT_FACTOR (loop_vinfo) > 2)
5523 {
5524 unsigned int k = 0, l = 0;
5525
5526 /* Generating permutation constant to get all elements in the right order.
5527 For vector length 8 it is {0 3 6 1 4 7 2 5}. */
5528 for (i = 0; i < nelt; i++)
5529 {
5530 if (3 * k + (l % 3) >= nelt)
5531 {
5532 k = 0;
5533 l += (3 - (nelt % 3));
5534 }
5535 sel[i] = 3 * k + (l % 3);
5536 k++;
5537 }
5538 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5539 {
5540 if (dump_enabled_p ())
5541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5542 "shuffle of 3 fields structure is not \
5543 supported by target\n");
5544 return false;
5545 }
5546 perm3_mask = vect_gen_perm_mask_checked (vectype, sel);
5547
5548 /* Generating permutation constant to shift all elements.
5549 For vector length 8 it is {6 7 8 9 10 11 12 13}. */
5550 for (i = 0; i < nelt; i++)
5551 sel[i] = 2 * (nelt / 3) + (nelt % 3) + i;
5552 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5553 {
5554 if (dump_enabled_p ())
5555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5556 "shift permutation is not supported by target\n");
5557 return false;
5558 }
5559 shift1_mask = vect_gen_perm_mask_checked (vectype, sel);
5560
5561 /* Generating permutation constant to shift all elements.
5562 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5563 for (i = 0; i < nelt; i++)
5564 sel[i] = 2 * (nelt / 3) + 1 + i;
5565 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5566 {
5567 if (dump_enabled_p ())
5568 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5569 "shift permutation is not supported by target\n");
5570 return false;
5571 }
5572 shift2_mask = vect_gen_perm_mask_checked (vectype, sel);
5573
5574 /* Generating permutation constant to shift all elements.
5575 For vector length 8 it is {3 4 5 6 7 8 9 10}. */
5576 for (i = 0; i < nelt; i++)
5577 sel[i] = (nelt / 3) + (nelt % 3) / 2 + i;
5578 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5579 {
5580 if (dump_enabled_p ())
5581 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5582 "shift permutation is not supported by target\n");
5583 return false;
5584 }
5585 shift3_mask = vect_gen_perm_mask_checked (vectype, sel);
5586
5587 /* Generating permutation constant to shift all elements.
5588 For vector length 8 it is {5 6 7 8 9 10 11 12}. */
5589 for (i = 0; i < nelt; i++)
5590 sel[i] = 2 * (nelt / 3) + (nelt % 3) / 2 + i;
5591 if (!can_vec_perm_p (TYPE_MODE (vectype), false, sel))
5592 {
5593 if (dump_enabled_p ())
5594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5595 "shift permutation is not supported by target\n");
5596 return false;
5597 }
5598 shift4_mask = vect_gen_perm_mask_checked (vectype, sel);
5599
5600 for (k = 0; k < 3; k++)
5601 {
5602 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shuffle3");
5603 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5604 dr_chain[k], dr_chain[k],
5605 perm3_mask);
5606 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5607 vect[k] = data_ref;
5608 }
5609
5610 for (k = 0; k < 3; k++)
5611 {
5612 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift1");
5613 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5614 vect[k % 3], vect[(k + 1) % 3],
5615 shift1_mask);
5616 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5617 vect_shift[k] = data_ref;
5618 }
5619
5620 for (k = 0; k < 3; k++)
5621 {
5622 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift2");
5623 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR,
5624 vect_shift[(4 - k) % 3],
5625 vect_shift[(3 - k) % 3],
5626 shift2_mask);
5627 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5628 vect[k] = data_ref;
5629 }
5630
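/* Assemble RESULT_CHAIN from the shifted vectors; the final placement
   depends on NELT % 3.  */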
5631 (*result_chain)[3 - (nelt % 3)] = vect[2];
5632
5633 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift3");
5634 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[0],
5635 vect[0], shift3_mask);
5636 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5637 (*result_chain)[nelt % 3] = data_ref;
5638
5639 data_ref = make_temp_ssa_name (vectype, NULL, "vect_shift4");
5640 perm_stmt = gimple_build_assign (data_ref, VEC_PERM_EXPR, vect[1],
5641 vect[1], shift4_mask);
5642 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
5643 (*result_chain)[0] = data_ref;
5644 return true;
5645 }
5646 return false;
5647 }
5648
5649 /* Function vect_transform_grouped_load.
5650
5651 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
5652 to perform their permutation and ascribe the resulting vectorized statements to
5653 the scalar statements.
5654 */
5655
5656 void
5657 vect_transform_grouped_load (gimple *stmt, vec<tree> dr_chain, int size,
5658 gimple_stmt_iterator *gsi)
5659 {
5660 machine_mode mode;
5661 vec<tree> result_chain = vNULL;
5662
5663 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
5664 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
5665 permuted vectors, which are ready for vector computation.  */
5666 result_chain.create (size);
5667
5668 /* If the reassociation width for the vector type is 2 or greater, the
5669 target machine can execute 2 or more vector instructions in parallel.
5670 Otherwise try to get the chain for the load group using vect_shift_permute_load_chain.  */
5671 mode = TYPE_MODE (STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)));
5672 if (targetm.sched.reassociation_width (VEC_PERM_EXPR, mode) > 1
5673 || exact_log2 (size) != -1
5674 || !vect_shift_permute_load_chain (dr_chain, size, stmt,
5675 gsi, &result_chain))
5676 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
5677 vect_record_grouped_load_vectors (stmt, result_chain);
5678 result_chain.release ();
5679 }
5680
5681 /* RESULT_CHAIN contains the output of a group of grouped loads that were
5682 generated as part of the vectorization of STMT. Assign the statement
5683 for each vector to the associated scalar statement. */
5684
5685 void
5686 vect_record_grouped_load_vectors (gimple *stmt, vec<tree> result_chain)
5687 {
5688 gimple *first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
5689 gimple *next_stmt, *new_stmt;
5690 unsigned int i, gap_count;
5691 tree tmp_data_ref;
5692
5693 /* Put a permuted data-ref in the VECTORIZED_STMT field.
5694 Since we scan the chain starting from its first node, their order
5695 corresponds to the order of data-refs in RESULT_CHAIN.  */
5696 next_stmt = first_stmt;
5697 gap_count = 1;
5698 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
5699 {
5700 if (!next_stmt)
5701 break;
5702
5703 /* Skip the gaps. Loads created for the gaps will be removed by the dead
5704 code elimination pass later. No need to check for the first stmt in
5705 the group, since it always exists.
5706 GROUP_GAP is the number of steps in elements from the previous
5707 access (if there is no gap GROUP_GAP is 1). We skip loads that
5708 correspond to the gaps. */
5709 if (next_stmt != first_stmt
5710 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
5711 {
5712 gap_count++;
5713 continue;
5714 }
5715
5716 while (next_stmt)
5717 {
5718 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
5719 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
5720 copies, and we put the new vector statement in the first available
5721 RELATED_STMT. */
5722 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
5723 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
5724 else
5725 {
5726 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5727 {
5728 gimple *prev_stmt =
5729 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
5730 gimple *rel_stmt =
5731 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
5732 while (rel_stmt)
5733 {
5734 prev_stmt = rel_stmt;
5735 rel_stmt =
5736 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
5737 }
5738
5739 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
5740 new_stmt;
5741 }
5742 }
5743
5744 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
5745 gap_count = 1;
5746 /* If NEXT_STMT accesses the same DR as the previous statement,
5747 put the same TMP_DATA_REF as its vectorized statement; otherwise
5748 get the next data-ref from RESULT_CHAIN. */
5749 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
5750 break;
5751 }
5752 }
5753 }
5754
5755 /* Function vect_can_force_dr_alignment_p.
5756
5757 Returns whether the alignment of DECL can be forced to be aligned
5758 on an ALIGNMENT-bit boundary.  */
5759
5760 bool
5761 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
5762 {
5763 if (TREE_CODE (decl) != VAR_DECL)
5764 return false;
5765
5766 if (decl_in_symtab_p (decl)
5767 && !symtab_node::get (decl)->can_increase_alignment_p ())
5768 return false;
5769
5770 if (TREE_STATIC (decl))
5771 return (alignment <= MAX_OFILE_ALIGNMENT);
5772 else
5773 return (alignment <= MAX_STACK_ALIGNMENT);
5774 }
5775
5776
5777 /* Return whether the data reference DR is supported with respect to its
5778 alignment.
5779 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
5780 if it is aligned, i.e., check if it is possible to vectorize it with a different
5781 alignment. */
5782
5783 enum dr_alignment_support
5784 vect_supportable_dr_alignment (struct data_reference *dr,
5785 bool check_aligned_accesses)
5786 {
5787 gimple *stmt = DR_STMT (dr);
5788 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5789 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5790 machine_mode mode = TYPE_MODE (vectype);
5791 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5792 struct loop *vect_loop = NULL;
5793 bool nested_in_vect_loop = false;
5794
5795 if (aligned_access_p (dr) && !check_aligned_accesses)
5796 return dr_aligned;
5797
5798 /* For now assume all conditional loads/stores support unaligned
5799 access without any special code. */
5800 if (is_gimple_call (stmt)
5801 && gimple_call_internal_p (stmt)
5802 && (gimple_call_internal_fn (stmt) == IFN_MASK_LOAD
5803 || gimple_call_internal_fn (stmt) == IFN_MASK_STORE))
5804 return dr_unaligned_supported;
5805
5806 if (loop_vinfo)
5807 {
5808 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
5809 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
5810 }
5811
5812 /* Possibly unaligned access. */
5813
5814 /* We can choose between using the implicit realignment scheme (generating
5815 a misaligned_move stmt) and the explicit realignment scheme (generating
5816 aligned loads with a REALIGN_LOAD). There are two variants to the
5817 explicit realignment scheme: optimized, and unoptimized.
5818 We can optimize the realignment only if the step between consecutive
5819 vector loads is equal to the vector size. Since the vector memory
5820 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
5821 is guaranteed that the misalignment amount remains the same throughout the
5822 execution of the vectorized loop. Therefore, we can create the
5823 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
5824 at the loop preheader.
5825
5826 However, in the case of outer-loop vectorization, when vectorizing a
5827 memory access in the inner-loop nested within the LOOP that is now being
5828 vectorized, while it is guaranteed that the misalignment of the
5829 vectorized memory access will remain the same in different outer-loop
5830 iterations, it is *not* guaranteed that is will remain the same throughout
5831 the execution of the inner-loop. This is because the inner-loop advances
5832 with the original scalar step (and not in steps of VS). If the inner-loop
5833 step happens to be a multiple of VS, then the misalignment remains fixed
5834 and we can use the optimized realignment scheme. For example:
5835
5836 for (i=0; i<N; i++)
5837 for (j=0; j<M; j++)
5838 s += a[i+j];
5839
5840 When vectorizing the i-loop in the above example, the step between
5841 consecutive vector loads is 1, and so the misalignment does not remain
5842 fixed across the execution of the inner-loop, and the realignment cannot
5843 be optimized (as illustrated in the following pseudo vectorized loop):
5844
5845 for (i=0; i<N; i+=4)
5846 for (j=0; j<M; j++){
5847 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
5848 // when j is {0,1,2,3,4,5,6,7,...} respectively.
5849 // (assuming that we start from an aligned address).
5850 }
5851
5852 We therefore have to use the unoptimized realignment scheme:
5853
5854 for (i=0; i<N; i+=4)
5855 for (j=k; j<M; j+=4)
5856 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
5857 // that the misalignment of the initial address is
5858 // 0).
5859
5860 The loop can then be vectorized as follows:
5861
5862 for (k=0; k<4; k++){
5863 rt = get_realignment_token (&vp[k]);
5864 for (i=0; i<N; i+=4){
5865 v1 = vp[i+k];
5866 for (j=k; j<M; j+=4){
5867 v2 = vp[i+j+VS-1];
5868 va = REALIGN_LOAD <v1,v2,rt>;
5869 vs += va;
5870 v1 = v2;
5871 }
5872 }
5873 } */
5874
5875 if (DR_IS_READ (dr))
5876 {
5877 bool is_packed = false;
5878 tree type = (TREE_TYPE (DR_REF (dr)));
5879
5880 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
5881 && (!targetm.vectorize.builtin_mask_for_load
5882 || targetm.vectorize.builtin_mask_for_load ()))
5883 {
5884 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5885 if ((nested_in_vect_loop
5886 && (TREE_INT_CST_LOW (DR_STEP (dr))
5887 != GET_MODE_SIZE (TYPE_MODE (vectype))))
5888 || !loop_vinfo)
5889 return dr_explicit_realign;
5890 else
5891 return dr_explicit_realign_optimized;
5892 }
5893 if (!known_alignment_for_access_p (dr))
5894 is_packed = not_size_aligned (DR_REF (dr));
5895
5896 if ((TYPE_USER_ALIGN (type) && !is_packed)
5897 || targetm.vectorize.
5898 support_vector_misalignment (mode, type,
5899 DR_MISALIGNMENT (dr), is_packed))
5900 /* Can't software pipeline the loads, but can at least do them. */
5901 return dr_unaligned_supported;
5902 }
5903 else
5904 {
5905 bool is_packed = false;
5906 tree type = (TREE_TYPE (DR_REF (dr)));
5907
5908 if (!known_alignment_for_access_p (dr))
5909 is_packed = not_size_aligned (DR_REF (dr));
5910
5911 if ((TYPE_USER_ALIGN (type) && !is_packed)
5912 || targetm.vectorize.
5913 support_vector_misalignment (mode, type,
5914 DR_MISALIGNMENT (dr), is_packed))
5915 return dr_unaligned_supported;
5916 }
5917
5918 /* Unsupported. */
5919 return dr_unaligned_unsupported;
5920 }