re PR tree-optimization/56812 (Simple loop is not SLP-vectorized after r196872)
[gcc.git] / gcc / tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-flow.h"
34 #include "dumpfile.h"
35 #include "cfgloop.h"
36 #include "tree-chrec.h"
37 #include "tree-scalar-evolution.h"
38 #include "tree-vectorizer.h"
39 #include "diagnostic-core.h"
40
41 /* Need to include rtl.h, expr.h, etc. for optabs. */
42 #include "expr.h"
43 #include "optabs.h"
44
45 /* Return true if load- or store-lanes optab OPTAB is implemented for
46 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
47
48 static bool
49 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
50 tree vectype, unsigned HOST_WIDE_INT count)
51 {
52 enum machine_mode mode, array_mode;
53 bool limit_p;
54
55 mode = TYPE_MODE (vectype);
56 limit_p = !targetm.array_mode_supported_p (mode, count);
57 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
58 MODE_INT, limit_p);
59
60 if (array_mode == BLKmode)
61 {
62 if (dump_enabled_p ())
63 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
64 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]",
65 GET_MODE_NAME (mode), count);
66 return false;
67 }
68
69 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
70 {
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
73 "cannot use %s<%s><%s>", name,
74 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
75 return false;
76 }
77
78 if (dump_enabled_p ())
79 dump_printf_loc (MSG_NOTE, vect_location,
80 "can use %s<%s><%s>", name, GET_MODE_NAME (array_mode),
81 GET_MODE_NAME (mode));
82
83 return true;
84 }
85
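/* Illustrative walk-through (all numbers assumed, no particular target
   implied): for

       vect_lanes_optab_supported_p ("vec_load_lanes", vec_load_lanes_optab,
                                     <V4SI vectype>, 3)

   the mode size is 128 bits, so the code above asks for an integer mode of
   3 * 128 = 384 bits.  If no such mode exists (or the size limit applies
   because the target declined array_mode_supported_p), mode_for_size
   returns BLKmode and the function reports that vec_load_lanes cannot be
   used; otherwise it still requires a handler for (array_mode, V4SImode).  */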
86
87 /* Return the smallest scalar part of STMT.
88 This is used to determine the vectype of the stmt. We generally set the
89 vectype according to the type of the result (lhs). For stmts whose
 90 result-type is different from the type of the arguments (e.g., demotion,
 91 promotion), vectype will be reset appropriately (later). Note that we have
 92 to visit the smallest datatype in this function, because that determines the
 93 VF. If the smallest datatype in the loop is present only as the rhs of a
 94 promotion operation, we'd miss it.
95 Such a case, where a variable of this datatype does not appear in the lhs
96 anywhere in the loop, can only occur if it's an invariant: e.g.:
97 'int_x = (int) short_inv', which we'd expect to have been optimized away by
98 invariant motion. However, we cannot rely on invariant motion to always
99 take invariants out of the loop, and so in the case of promotion we also
100 have to check the rhs.
101 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
102 types. */
103
104 tree
105 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
106 HOST_WIDE_INT *rhs_size_unit)
107 {
108 tree scalar_type = gimple_expr_type (stmt);
109 HOST_WIDE_INT lhs, rhs;
110
111 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
112
113 if (is_gimple_assign (stmt)
114 && (gimple_assign_cast_p (stmt)
115 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
116 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
117 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
118 {
119 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
120
121 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
122 if (rhs < lhs)
123 scalar_type = rhs_type;
124 }
125
126 *lhs_size_unit = lhs;
127 *rhs_size_unit = rhs;
128 return scalar_type;
129 }
130
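/* Worked example (types assumed for illustration): for the widening
   statement

       int_x = (int) short_y;     <-- lhs is 4 bytes, rhs is 2 bytes

   the code above sets *LHS_SIZE_UNIT = 4 and *RHS_SIZE_UNIT = 2 and
   returns the 2-byte short type, so the vectorization factor is later
   derived from the smaller element.  */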
131
132 /* Find the place of the data-ref in STMT in the interleaving chain that starts
133 from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */
134
135 int
136 vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
137 {
138 gimple next_stmt = first_stmt;
139 int result = 0;
140
141 if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
142 return -1;
143
144 while (next_stmt && next_stmt != stmt)
145 {
146 result++;
147 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
148 }
149
150 if (next_stmt)
151 return result;
152 else
153 return -1;
154 }
155
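/* For instance (illustrative chain): if FIRST_STMT heads the interleaving
   chain  a[0] -> a[1] -> a[2]  and STMT is the access to a[2], the walk
   above returns 2; a statement that is not linked into this chain yields
   -1.  */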
156
 157 /* Check whether the data references pointed to by DR_I and DR_J are the
 158 same or belong to the same interleaving group. Return FALSE if they are
 159 different, otherwise return TRUE. */
160
161 static bool
162 vect_same_range_drs (data_reference_p dr_i, data_reference_p dr_j)
163 {
164 gimple stmt_i = DR_STMT (dr_i);
165 gimple stmt_j = DR_STMT (dr_j);
166
167 if (operand_equal_p (DR_REF (dr_i), DR_REF (dr_j), 0)
168 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
169 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j))
170 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
171 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)))))
172 return true;
173 else
174 return false;
175 }
176
177 /* If address ranges represented by DDR_I and DDR_J are equal,
178 return TRUE, otherwise return FALSE. */
179
180 static bool
181 vect_vfa_range_equal (ddr_p ddr_i, ddr_p ddr_j)
182 {
183 if ((vect_same_range_drs (DDR_A (ddr_i), DDR_A (ddr_j))
184 && vect_same_range_drs (DDR_B (ddr_i), DDR_B (ddr_j)))
185 || (vect_same_range_drs (DDR_A (ddr_i), DDR_B (ddr_j))
186 && vect_same_range_drs (DDR_B (ddr_i), DDR_A (ddr_j))))
187 return true;
188 else
189 return false;
190 }
191
192 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
193 tested at run-time. Return TRUE if DDR was successfully inserted.
 194 Return FALSE if versioning is not supported. */
195
196 static bool
197 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
198 {
199 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
200
201 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
202 return false;
203
204 if (dump_enabled_p ())
205 {
206 dump_printf_loc (MSG_NOTE, vect_location,
207 "mark for run-time aliasing test between ");
208 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
209 dump_printf (MSG_NOTE, " and ");
210 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
211 }
212
213 if (optimize_loop_nest_for_size_p (loop))
214 {
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
217 "versioning not supported when optimizing for size.");
218 return false;
219 }
220
221 /* FORNOW: We don't support versioning with outer-loop vectorization. */
222 if (loop->inner)
223 {
224 if (dump_enabled_p ())
225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
226 "versioning not yet supported for outer-loops.");
227 return false;
228 }
229
230 /* FORNOW: We don't support creating runtime alias tests for non-constant
231 step. */
232 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
233 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
234 {
235 if (dump_enabled_p ())
236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
237 "versioning not yet supported for non-constant "
238 "step");
239 return false;
240 }
241
242 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
243 return true;
244 }
245
246
247 /* Function vect_analyze_data_ref_dependence.
248
249 Return TRUE if there (might) exist a dependence between a memory-reference
 250 DRA and a memory-reference DRB.  Return FALSE when the dependence can
 251 instead be checked at run time by versioning for alias.  Adjust *MAX_VF
 252 according to the data dependence. */
253
254 static bool
255 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
256 loop_vec_info loop_vinfo, int *max_vf)
257 {
258 unsigned int i;
259 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
260 struct data_reference *dra = DDR_A (ddr);
261 struct data_reference *drb = DDR_B (ddr);
262 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
263 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
264 lambda_vector dist_v;
265 unsigned int loop_depth;
266
267 /* In loop analysis all data references should be vectorizable. */
268 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
269 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
270 gcc_unreachable ();
271
272 /* Independent data accesses. */
273 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
274 return false;
275
276 if (dra == drb
277 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
278 return false;
279
280 /* Unknown data dependence. */
281 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
282 {
283 if (STMT_VINFO_GATHER_P (stmtinfo_a)
284 || STMT_VINFO_GATHER_P (stmtinfo_b))
285 {
286 if (dump_enabled_p ())
287 {
288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
289 "versioning for alias not supported for: "
290 "can't determine dependence between ");
291 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
292 DR_REF (dra));
293 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
294 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
295 DR_REF (drb));
296 }
297 return false;
298 }
299
300 if (dump_enabled_p ())
301 {
302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
303 "versioning for alias required: "
304 "can't determine dependence between ");
305 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
306 DR_REF (dra));
307 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
308 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
309 DR_REF (drb));
310 }
311
312 /* Add to list of ddrs that need to be tested at run-time. */
313 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
314 }
315
316 /* Known data dependence. */
317 if (DDR_NUM_DIST_VECTS (ddr) == 0)
318 {
319 if (STMT_VINFO_GATHER_P (stmtinfo_a)
320 || STMT_VINFO_GATHER_P (stmtinfo_b))
321 {
322 if (dump_enabled_p ())
323 {
324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
325 "versioning for alias not supported for: "
326 "bad dist vector for ");
327 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
328 DR_REF (dra));
329 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
330 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
331 DR_REF (drb));
332 }
333 return false;
334 }
335
336 if (dump_enabled_p ())
337 {
338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
339 "versioning for alias required: "
340 "bad dist vector for ");
341 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
342 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
343 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
344 }
345 /* Add to list of ddrs that need to be tested at run-time. */
346 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
347 }
348
349 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
350 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
351 {
352 int dist = dist_v[loop_depth];
353
354 if (dump_enabled_p ())
355 dump_printf_loc (MSG_NOTE, vect_location,
356 "dependence distance = %d.", dist);
357
358 if (dist == 0)
359 {
360 if (dump_enabled_p ())
361 {
362 dump_printf_loc (MSG_NOTE, vect_location,
363 "dependence distance == 0 between ");
364 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
365 dump_printf (MSG_NOTE, " and ");
366 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
367 }
368
 369 /* For interleaving, mark that there is a read-write dependency if
 370 necessary. We have already checked that one of the data-refs is a store. */
371 if (DR_IS_READ (dra))
372 GROUP_READ_WRITE_DEPENDENCE (stmtinfo_a) = true;
373 else
374 {
375 if (DR_IS_READ (drb))
376 GROUP_READ_WRITE_DEPENDENCE (stmtinfo_b) = true;
377 }
378
379 continue;
380 }
381
382 if (dist > 0 && DDR_REVERSED_P (ddr))
383 {
384 /* If DDR_REVERSED_P the order of the data-refs in DDR was
385 reversed (to make distance vector positive), and the actual
386 distance is negative. */
387 if (dump_enabled_p ())
388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
389 "dependence distance negative.");
390 continue;
391 }
392
393 if (abs (dist) >= 2
394 && abs (dist) < *max_vf)
395 {
396 /* The dependence distance requires reduction of the maximal
397 vectorization factor. */
398 *max_vf = abs (dist);
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location,
401 "adjusting maximal vectorization factor to %i",
402 *max_vf);
403 }
404
405 if (abs (dist) >= *max_vf)
406 {
407 /* Dependence distance does not create dependence, as far as
408 vectorization is concerned, in this case. */
409 if (dump_enabled_p ())
410 dump_printf_loc (MSG_NOTE, vect_location,
411 "dependence distance >= VF.");
412 continue;
413 }
414
415 if (dump_enabled_p ())
416 {
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "not vectorized, possible dependence "
419 "between data-refs ");
420 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
421 dump_printf (MSG_NOTE, " and ");
422 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
423 }
424
425 return true;
426 }
427
428 return false;
429 }
430
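/* Worked example for the distance handling above (loop and numbers are
   hypothetical):

       for (i = 0; i < n; i++)
         a[i + 3] = a[i] + 1;

   yields a dependence distance of 3 between the read a[i] and the write
   a[i + 3].  Starting from *MAX_VF = 8, the "abs (dist) >= 2" branch
   lowers *MAX_VF to 3, after which "abs (dist) >= *max_vf" holds and the
   dependence no longer prevents vectorization with a factor of at most 3.  */
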
431 /* Function vect_analyze_data_ref_dependences.
432
433 Examine all the data references in the loop, and make sure there do not
434 exist any data dependences between them. Set *MAX_VF according to
435 the maximum vectorization factor the data dependences allow. */
436
437 bool
438 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
439 {
440 unsigned int i;
441 struct data_dependence_relation *ddr;
442
443 if (dump_enabled_p ())
444 dump_printf_loc (MSG_NOTE, vect_location,
445 "=== vect_analyze_data_ref_dependences ===");
446
447 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
448 &LOOP_VINFO_DDRS (loop_vinfo),
449 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
450 return false;
451
452 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
453 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
454 return false;
455
456 return true;
457 }
458
459
460 /* Function vect_slp_analyze_data_ref_dependence.
461
 462 Return TRUE if there (might) exist a dependence between a memory-reference
 463 DRA and a memory-reference DRB that prohibits basic-block (SLP)
 464 vectorization.  Unlike the loop analysis above, there is no run-time
 465 versioning for alias and no *MAX_VF adjustment here. */
466
467 static bool
468 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
469 {
470 struct data_reference *dra = DDR_A (ddr);
471 struct data_reference *drb = DDR_B (ddr);
472
473 /* We need to check dependences of statements marked as unvectorizable
 474 as well; they can still prohibit vectorization. */
475
476 /* Independent data accesses. */
477 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
478 return false;
479
480 if (dra == drb)
481 return false;
482
483 /* Read-read is OK. */
484 if (DR_IS_READ (dra) && DR_IS_READ (drb))
485 return false;
486
487 /* If dra and drb are part of the same interleaving chain consider
488 them independent. */
489 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
490 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
491 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
492 return false;
493
494 /* Unknown data dependence. */
495 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
496 {
497 gimple earlier_stmt;
498
499 if (dump_enabled_p ())
500 {
501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
502 "can't determine dependence between ");
503 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
504 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
505 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
506 }
507
508 /* We do not vectorize basic blocks with write-write dependencies. */
509 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
510 return true;
511
512 /* Check that it's not a load-after-store dependence. */
513 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
514 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
515 return true;
516
517 return false;
518 }
519
520 if (dump_enabled_p ())
521 {
522 dump_printf_loc (MSG_NOTE, vect_location,
523 "determined dependence between ");
524 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
525 dump_printf (MSG_NOTE, " and ");
526 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
527 }
528
529 /* Do not vectorize basic blocks with write-write dependences. */
530 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
531 return true;
532
533 /* Check dependence between DRA and DRB for basic block vectorization.
 534 If the accesses share the same base and offset, we can compare their initial
 535 constant offsets to decide whether they differ or not. In case of a read-
536 write dependence we check that the load is before the store to ensure that
537 vectorization will not change the order of the accesses. */
538
539 HOST_WIDE_INT type_size_a, type_size_b, init_a, init_b;
540 gimple earlier_stmt;
541
542 /* Check that the data-refs have same bases and offsets. If not, we can't
543 determine if they are dependent. */
544 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
545 || !dr_equal_offsets_p (dra, drb))
546 return true;
547
548 /* Check the types. */
549 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
550 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
551
552 if (type_size_a != type_size_b
553 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
554 TREE_TYPE (DR_REF (drb))))
555 return true;
556
557 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
558 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
559
560 /* Two different locations - no dependence. */
561 if (init_a != init_b)
562 return false;
563
564 /* We have a read-write dependence. Check that the load is before the store.
 565 When we vectorize basic blocks, a vector load can only be placed before the
 566 corresponding scalar load, and a vector store can only be placed after its
 567 corresponding scalar store. So the order of the accesses is preserved in
 568 case the load is before the store. */
569 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
570 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
571 return false;
572
573 return true;
574 }
575
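/* Example for the constant-offset comparison above (statements assumed to
   sit in one basic block, 4-byte elements):

       x = a[0];     <-- load,  DR_INIT = 0
       a[1] = y;     <-- store, DR_INIT = 4

   Same base, compatible types, different DR_INIT values: the accesses are
   distinct and no dependence is recorded.  If both referred to a[0]
   instead, only the load-before-store order would be accepted.  */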
576
 577 /* Function vect_slp_analyze_data_ref_dependences.
 578 
 579 Examine all the data references in the basic-block, and make sure there
 580 do not exist any data dependences between them. Return FALSE as soon as
 581 a dependence that prohibits SLP vectorization is found. */
582
583 bool
584 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
585 {
586 struct data_dependence_relation *ddr;
587 unsigned int i;
588
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "=== vect_slp_analyze_data_ref_dependences ===");
592
593 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
594 &BB_VINFO_DDRS (bb_vinfo),
595 vNULL, true))
596 return false;
597
598 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
599 if (vect_slp_analyze_data_ref_dependence (ddr))
600 return false;
601
602 return true;
603 }
604
605
606 /* Function vect_compute_data_ref_alignment
607
608 Compute the misalignment of the data reference DR.
609
610 Output:
611 1. If during the misalignment computation it is found that the data reference
612 cannot be vectorized then false is returned.
613 2. DR_MISALIGNMENT (DR) is defined.
614
615 FOR NOW: No analysis is actually performed. Misalignment is calculated
616 only for trivial cases. TODO. */
617
618 static bool
619 vect_compute_data_ref_alignment (struct data_reference *dr)
620 {
621 gimple stmt = DR_STMT (dr);
622 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
623 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
624 struct loop *loop = NULL;
625 tree ref = DR_REF (dr);
626 tree vectype;
627 tree base, base_addr;
628 bool base_aligned;
629 tree misalign;
630 tree aligned_to, alignment;
631
632 if (dump_enabled_p ())
633 dump_printf_loc (MSG_NOTE, vect_location,
634 "vect_compute_data_ref_alignment:");
635
636 if (loop_vinfo)
637 loop = LOOP_VINFO_LOOP (loop_vinfo);
638
639 /* Initialize misalignment to unknown. */
640 SET_DR_MISALIGNMENT (dr, -1);
641
642 /* Strided loads perform only component accesses, misalignment information
643 is irrelevant for them. */
644 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
645 return true;
646
647 misalign = DR_INIT (dr);
648 aligned_to = DR_ALIGNED_TO (dr);
649 base_addr = DR_BASE_ADDRESS (dr);
650 vectype = STMT_VINFO_VECTYPE (stmt_info);
651
652 /* In case the dataref is in an inner-loop of the loop that is being
653 vectorized (LOOP), we use the base and misalignment information
654 relative to the outer-loop (LOOP). This is ok only if the misalignment
655 stays the same throughout the execution of the inner-loop, which is why
 656 we have to check that the stride of the dataref in the inner-loop is a
 657 multiple of the vector size. */
658 if (loop && nested_in_vect_loop_p (loop, stmt))
659 {
660 tree step = DR_STEP (dr);
661 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
662
663 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
664 {
665 if (dump_enabled_p ())
666 dump_printf_loc (MSG_NOTE, vect_location,
667 "inner step divides the vector-size.");
668 misalign = STMT_VINFO_DR_INIT (stmt_info);
669 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
670 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
671 }
672 else
673 {
674 if (dump_enabled_p ())
675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
676 "inner step doesn't divide the vector-size.");
677 misalign = NULL_TREE;
678 }
679 }
680
681 /* Similarly, if we're doing basic-block vectorization, we can only use
682 base and misalignment information relative to an innermost loop if the
683 misalignment stays the same throughout the execution of the loop.
 684 As above, this is the case if the stride of the dataref is a multiple of
 685 the vector size. */
686 if (!loop)
687 {
688 tree step = DR_STEP (dr);
689 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
690
691 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
692 {
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
695 "SLP: step doesn't divide the vector-size.");
696 misalign = NULL_TREE;
697 }
698 }
699
700 base = build_fold_indirect_ref (base_addr);
701 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
702
703 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
704 || !misalign)
705 {
706 if (dump_enabled_p ())
707 {
708 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
709 "Unknown alignment for access: ");
710 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
711 }
712 return true;
713 }
714
715 if ((DECL_P (base)
716 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
717 alignment) >= 0)
718 || (TREE_CODE (base_addr) == SSA_NAME
719 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
720 TREE_TYPE (base_addr)))),
721 alignment) >= 0)
722 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
723 base_aligned = true;
724 else
725 base_aligned = false;
726
727 if (!base_aligned)
728 {
729 /* Do not change the alignment of global variables here if
730 flag_section_anchors is enabled as we already generated
731 RTL for other functions. Most global variables should
732 have been aligned during the IPA increase_alignment pass. */
733 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
734 || (TREE_STATIC (base) && flag_section_anchors))
735 {
736 if (dump_enabled_p ())
737 {
738 dump_printf_loc (MSG_NOTE, vect_location,
739 "can't force alignment of ref: ");
740 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
741 }
742 return true;
743 }
744
745 /* Force the alignment of the decl.
746 NOTE: This is the only change to the code we make during
747 the analysis phase, before deciding to vectorize the loop. */
748 if (dump_enabled_p ())
749 {
750 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
751 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
752 }
753
754 DECL_ALIGN (base) = TYPE_ALIGN (vectype);
755 DECL_USER_ALIGN (base) = 1;
756 }
757
758 /* At this point we assume that the base is aligned. */
759 gcc_assert (base_aligned
760 || (TREE_CODE (base) == VAR_DECL
761 && DECL_ALIGN (base) >= TYPE_ALIGN (vectype)));
762
 763 /* If this is a backward running DR then the first access in the larger
 764 vectype actually is N-1 elements before the address in the DR.
765 Adjust misalign accordingly. */
766 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
767 {
768 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
769 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
770 otherwise we wouldn't be here. */
771 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
772 /* PLUS because DR_STEP was negative. */
773 misalign = size_binop (PLUS_EXPR, misalign, offset);
774 }
775
776 /* Modulo alignment. */
777 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
778
779 if (!host_integerp (misalign, 1))
780 {
781 /* Negative or overflowed misalignment value. */
782 if (dump_enabled_p ())
783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
784 "unexpected misalign value");
785 return false;
786 }
787
788 SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));
789
790 if (dump_enabled_p ())
791 {
792 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
793 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
794 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
795 }
796
797 return true;
798 }
799
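/* Numerical sketch of the computation above (all values assumed): with a
   V4SI vectype (TYPE_ALIGN = 128 bits, i.e. ALIGNMENT = 16 bytes), an
   aligned base and DR_INIT = 20, the final FLOOR_MOD_EXPR yields a
   misalignment of 20 % 16 = 4 bytes.  For a backward-running reference
   with step -4 the code first adds (4 - 1) * -4 = -12 to the misalignment,
   because the first vector element accessed lies three elements before
   DR_INIT.  */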
800
801 /* Function vect_compute_data_refs_alignment
802
803 Compute the misalignment of data references in the loop.
804 Return FALSE if a data reference is found that cannot be vectorized. */
805
806 static bool
807 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
808 bb_vec_info bb_vinfo)
809 {
810 vec<data_reference_p> datarefs;
811 struct data_reference *dr;
812 unsigned int i;
813
814 if (loop_vinfo)
815 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
816 else
817 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
818
819 FOR_EACH_VEC_ELT (datarefs, i, dr)
820 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
821 && !vect_compute_data_ref_alignment (dr))
822 {
823 if (bb_vinfo)
824 {
825 /* Mark unsupported statement as unvectorizable. */
826 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
827 continue;
828 }
829 else
830 return false;
831 }
832
833 return true;
834 }
835
836
837 /* Function vect_update_misalignment_for_peel
838
839 DR - the data reference whose misalignment is to be adjusted.
840 DR_PEEL - the data reference whose misalignment is being made
841 zero in the vector loop by the peel.
842 NPEEL - the number of iterations in the peel loop if the misalignment
843 of DR_PEEL is known at compile time. */
844
845 static void
846 vect_update_misalignment_for_peel (struct data_reference *dr,
847 struct data_reference *dr_peel, int npeel)
848 {
849 unsigned int i;
850 vec<dr_p> same_align_drs;
851 struct data_reference *current_dr;
852 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
853 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
854 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
855 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
856
857 /* For interleaved data accesses the step in the loop must be multiplied by
858 the size of the interleaving group. */
859 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
860 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
861 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
862 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
863
864 /* It can be assumed that the data refs with the same alignment as dr_peel
865 are aligned in the vector loop. */
866 same_align_drs
867 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
868 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
869 {
870 if (current_dr != dr)
871 continue;
872 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
873 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
874 SET_DR_MISALIGNMENT (dr, 0);
875 return;
876 }
877
878 if (known_alignment_for_access_p (dr)
879 && known_alignment_for_access_p (dr_peel))
880 {
881 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
882 int misal = DR_MISALIGNMENT (dr);
883 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
884 misal += negative ? -npeel * dr_size : npeel * dr_size;
885 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
886 SET_DR_MISALIGNMENT (dr, misal);
887 return;
888 }
889
890 if (dump_enabled_p ())
891 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.");
892 SET_DR_MISALIGNMENT (dr, -1);
893 }
894
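/* Example of the known-alignment update above (figures assumed): peeling
   NPEEL = 3 iterations for a forward 4-byte access whose misalignment was
   4 gives 4 + 3 * 4 = 16, which masked with (TYPE_ALIGN / BITS_PER_UNIT)
   - 1 = 15 becomes 0, i.e. the reference is aligned in the vector loop.  */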
895
896 /* Function vect_verify_datarefs_alignment
897
898 Return TRUE if all data references in the loop can be
899 handled with respect to alignment. */
900
901 bool
902 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
903 {
904 vec<data_reference_p> datarefs;
905 struct data_reference *dr;
906 enum dr_alignment_support supportable_dr_alignment;
907 unsigned int i;
908
909 if (loop_vinfo)
910 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
911 else
912 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
913
914 FOR_EACH_VEC_ELT (datarefs, i, dr)
915 {
916 gimple stmt = DR_STMT (dr);
917 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
918
919 if (!STMT_VINFO_RELEVANT_P (stmt_info))
920 continue;
921
922 /* For interleaving, only the alignment of the first access matters.
923 Skip statements marked as not vectorizable. */
924 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
925 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
926 || !STMT_VINFO_VECTORIZABLE (stmt_info))
927 continue;
928
929 /* Strided loads perform only component accesses, alignment is
930 irrelevant for them. */
931 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
932 continue;
933
934 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
935 if (!supportable_dr_alignment)
936 {
937 if (dump_enabled_p ())
938 {
939 if (DR_IS_READ (dr))
940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
941 "not vectorized: unsupported unaligned load.");
942 else
943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
944 "not vectorized: unsupported unaligned "
945 "store.");
946
947 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
948 DR_REF (dr));
949 }
950 return false;
951 }
952 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
953 dump_printf_loc (MSG_NOTE, vect_location,
954 "Vectorizing an unaligned access.");
955 }
956 return true;
957 }
958
 959 /* Given a memory reference EXP, return whether its alignment is less
960 than its size. */
961
962 static bool
963 not_size_aligned (tree exp)
964 {
965 if (!host_integerp (TYPE_SIZE (TREE_TYPE (exp)), 1))
966 return true;
967
968 return (TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (exp)))
969 > get_object_alignment (exp));
970 }
971
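/* E.g. (illustrative): a 4-byte int member of a packed struct has
   TYPE_SIZE = 32 bits while get_object_alignment may report only 8 bits,
   so the reference counts as not size-aligned ("packed") above.  */
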
972 /* Function vector_alignment_reachable_p
973
974 Return true if vector alignment for DR is reachable by peeling
975 a few loop iterations. Return false otherwise. */
976
977 static bool
978 vector_alignment_reachable_p (struct data_reference *dr)
979 {
980 gimple stmt = DR_STMT (dr);
981 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
982 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
983
984 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
985 {
 986 /* For interleaved accesses we peel only if the number of iterations in
 987 the prolog loop ({VF - misalignment}) is a multiple of the
 988 number of interleaved accesses. */
989 int elem_size, mis_in_elements;
990 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
991
992 /* FORNOW: handle only known alignment. */
993 if (!known_alignment_for_access_p (dr))
994 return false;
995
996 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
997 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
998
999 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1000 return false;
1001 }
1002
1003 /* If the misalignment is known at compile time then allow peeling
1004 only if natural alignment is reachable through peeling. */
1005 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1006 {
1007 HOST_WIDE_INT elmsize =
1008 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1009 if (dump_enabled_p ())
1010 {
1011 dump_printf_loc (MSG_NOTE, vect_location,
1012 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1013 dump_printf (MSG_NOTE,
1014 ". misalignment = %d. ", DR_MISALIGNMENT (dr));
1015 }
1016 if (DR_MISALIGNMENT (dr) % elmsize)
1017 {
1018 if (dump_enabled_p ())
1019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1020 "data size does not divide the misalignment.\n");
1021 return false;
1022 }
1023 }
1024
1025 if (!known_alignment_for_access_p (dr))
1026 {
1027 tree type = TREE_TYPE (DR_REF (dr));
1028 bool is_packed = not_size_aligned (DR_REF (dr));
1029 if (dump_enabled_p ())
1030 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1031 "Unknown misalignment, is_packed = %d",is_packed);
1032 if (targetm.vectorize.vector_alignment_reachable (type, is_packed))
1033 return true;
1034 else
1035 return false;
1036 }
1037
1038 return true;
1039 }
1040
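/* Interleaving example for the check above (figures assumed): with a V8HI
   vector (nelements = 8), a misalignment of 3 elements and GROUP_SIZE = 2,
   the prologue would need 8 - 3 = 5 peeled iterations; 5 is not a multiple
   of 2, so vector alignment is not considered reachable for that group.  */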
1041
1042 /* Calculate the cost of the memory access represented by DR. */
1043
1044 static void
1045 vect_get_data_access_cost (struct data_reference *dr,
1046 unsigned int *inside_cost,
1047 unsigned int *outside_cost,
1048 stmt_vector_for_cost *body_cost_vec)
1049 {
1050 gimple stmt = DR_STMT (dr);
1051 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1052 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1053 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1054 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1055 int ncopies = vf / nunits;
1056
1057 if (DR_IS_READ (dr))
1058 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1059 NULL, body_cost_vec, false);
1060 else
1061 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1062
1063 if (dump_enabled_p ())
1064 dump_printf_loc (MSG_NOTE, vect_location,
1065 "vect_get_data_access_cost: inside_cost = %d, "
1066 "outside_cost = %d.", *inside_cost, *outside_cost);
1067 }
1068
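/* For example (numbers assumed): with a vectorization factor of 8 and a
   V4SI data reference (nunits = 4), ncopies above is 8 / 4 = 2, so the
   load or store cost is accumulated for two vector accesses per vectorized
   iteration.  */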
1069
1070 static hashval_t
1071 vect_peeling_hash (const void *elem)
1072 {
1073 const struct _vect_peel_info *peel_info;
1074
1075 peel_info = (const struct _vect_peel_info *) elem;
1076 return (hashval_t) peel_info->npeel;
1077 }
1078
1079
1080 static int
1081 vect_peeling_hash_eq (const void *elem1, const void *elem2)
1082 {
1083 const struct _vect_peel_info *a, *b;
1084
1085 a = (const struct _vect_peel_info *) elem1;
1086 b = (const struct _vect_peel_info *) elem2;
1087 return (a->npeel == b->npeel);
1088 }
1089
1090
1091 /* Insert DR into peeling hash table with NPEEL as key. */
1092
1093 static void
1094 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1095 int npeel)
1096 {
1097 struct _vect_peel_info elem, *slot;
1098 void **new_slot;
1099 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1100
1101 elem.npeel = npeel;
1102 slot = (vect_peel_info) htab_find (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1103 &elem);
1104 if (slot)
1105 slot->count++;
1106 else
1107 {
1108 slot = XNEW (struct _vect_peel_info);
1109 slot->npeel = npeel;
1110 slot->dr = dr;
1111 slot->count = 1;
1112 new_slot = htab_find_slot (LOOP_VINFO_PEELING_HTAB (loop_vinfo), slot,
1113 INSERT);
1114 *new_slot = slot;
1115 }
1116
1117 if (!supportable_dr_alignment && !flag_vect_cost_model)
1118 slot->count += VECT_MAX_COST;
1119 }
1120
1121
1122 /* Traverse the peeling hash table to find the peeling option that aligns
1123 the maximum number of data accesses. */
1124
1125 static int
1126 vect_peeling_hash_get_most_frequent (void **slot, void *data)
1127 {
1128 vect_peel_info elem = (vect_peel_info) *slot;
1129 vect_peel_extended_info max = (vect_peel_extended_info) data;
1130
1131 if (elem->count > max->peel_info.count
1132 || (elem->count == max->peel_info.count
1133 && max->peel_info.npeel > elem->npeel))
1134 {
1135 max->peel_info.npeel = elem->npeel;
1136 max->peel_info.count = elem->count;
1137 max->peel_info.dr = elem->dr;
1138 }
1139
1140 return 1;
1141 }
1142
1143
1144 /* Traverse the peeling hash table and calculate the cost of each peeling
1145 option. Find the one with the lowest cost. */
1146
1147 static int
1148 vect_peeling_hash_get_lowest_cost (void **slot, void *data)
1149 {
1150 vect_peel_info elem = (vect_peel_info) *slot;
1151 vect_peel_extended_info min = (vect_peel_extended_info) data;
1152 int save_misalignment, dummy;
1153 unsigned int inside_cost = 0, outside_cost = 0, i;
1154 gimple stmt = DR_STMT (elem->dr);
1155 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1156 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1157 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1158 struct data_reference *dr;
1159 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1160 int single_iter_cost;
1161
1162 prologue_cost_vec.create (2);
1163 body_cost_vec.create (2);
1164 epilogue_cost_vec.create (2);
1165
1166 FOR_EACH_VEC_ELT (datarefs, i, dr)
1167 {
1168 stmt = DR_STMT (dr);
1169 stmt_info = vinfo_for_stmt (stmt);
1170 /* For interleaving, only the alignment of the first access
1171 matters. */
1172 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1173 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1174 continue;
1175
1176 save_misalignment = DR_MISALIGNMENT (dr);
1177 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1178 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1179 &body_cost_vec);
1180 SET_DR_MISALIGNMENT (dr, save_misalignment);
1181 }
1182
1183 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1184 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1185 &dummy, single_iter_cost,
1186 &prologue_cost_vec,
1187 &epilogue_cost_vec);
1188
1189 /* Prologue and epilogue costs are added to the target model later.
1190 These costs depend only on the scalar iteration cost, the
1191 number of peeling iterations finally chosen, and the number of
1192 misaligned statements. So discard the information found here. */
1193 prologue_cost_vec.release ();
1194 epilogue_cost_vec.release ();
1195
1196 if (inside_cost < min->inside_cost
1197 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1198 {
1199 min->inside_cost = inside_cost;
1200 min->outside_cost = outside_cost;
1201 min->body_cost_vec.release ();
1202 min->body_cost_vec = body_cost_vec;
1203 min->peel_info.dr = elem->dr;
1204 min->peel_info.npeel = elem->npeel;
1205 }
1206 else
1207 body_cost_vec.release ();
1208
1209 return 1;
1210 }
1211
1212
1213 /* Choose the best peeling option by traversing the peeling hash table,
1214 picking either the option with the lowest cost (if the cost model is
1215 enabled) or the option that aligns as many accesses as possible. */
1216
1217 static struct data_reference *
1218 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1219 unsigned int *npeel,
1220 stmt_vector_for_cost *body_cost_vec)
1221 {
1222 struct _vect_peel_extended_info res;
1223
1224 res.peel_info.dr = NULL;
1225 res.body_cost_vec = stmt_vector_for_cost();
1226
1227 if (flag_vect_cost_model)
1228 {
1229 res.inside_cost = INT_MAX;
1230 res.outside_cost = INT_MAX;
1231 htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1232 vect_peeling_hash_get_lowest_cost, &res);
1233 }
1234 else
1235 {
1236 res.peel_info.count = 0;
1237 htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1238 vect_peeling_hash_get_most_frequent, &res);
1239 }
1240
1241 *npeel = res.peel_info.npeel;
1242 *body_cost_vec = res.body_cost_vec;
1243 return res.peel_info.dr;
1244 }
1245
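/* Selection sketch (hypothetical table contents): if the hash table holds
   the options {npeel = 1, count = 3} and {npeel = 5, count = 3}, the
   frequency-based traversal picks npeel = 1 (equal counts, smaller peel),
   whereas with the cost model enabled the option with the lowest
   accumulated inside/outside cost wins instead.  */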
1246
1247 /* Function vect_enhance_data_refs_alignment
1248
1249 This pass will use loop versioning and loop peeling in order to enhance
1250 the alignment of data references in the loop.
1251
1252 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1253 original loop is to be vectorized. Any other loops that are created by
1254 the transformations performed in this pass are not supposed to be
1255 vectorized. This restriction will be relaxed.
1256
1257 This pass will require a cost model to guide it whether to apply peeling
1258 or versioning or a combination of the two. For example, the scheme that
1259 Intel uses when given a loop with several memory accesses, is as follows:
1260 choose one memory access ('p') whose alignment you want to force by doing
1261 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1262 other accesses are not necessarily aligned, or (2) use loop versioning to
1263 generate one loop in which all accesses are aligned, and another loop in
1264 which only 'p' is necessarily aligned.
1265
1266 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1267 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1268 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1269
1270 Devising a cost model is the most critical aspect of this work. It will
1271 guide us on which access to peel for, whether to use loop versioning, how
1272 many versions to create, etc. The cost model will probably consist of
1273 generic considerations as well as target specific considerations (on
1274 powerpc for example, misaligned stores are more painful than misaligned
1275 loads).
1276
1277 Here are the general steps involved in alignment enhancements:
1278
1279 -- original loop, before alignment analysis:
1280 for (i=0; i<N; i++){
1281 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1282 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1283 }
1284
1285 -- After vect_compute_data_refs_alignment:
1286 for (i=0; i<N; i++){
1287 x = q[i]; # DR_MISALIGNMENT(q) = 3
1288 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1289 }
1290
1291 -- Possibility 1: we do loop versioning:
1292 if (p is aligned) {
1293 for (i=0; i<N; i++){ # loop 1A
1294 x = q[i]; # DR_MISALIGNMENT(q) = 3
1295 p[i] = y; # DR_MISALIGNMENT(p) = 0
1296 }
1297 }
1298 else {
1299 for (i=0; i<N; i++){ # loop 1B
1300 x = q[i]; # DR_MISALIGNMENT(q) = 3
1301 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1302 }
1303 }
1304
1305 -- Possibility 2: we do loop peeling:
1306 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1307 x = q[i];
1308 p[i] = y;
1309 }
1310 for (i = 3; i < N; i++){ # loop 2A
1311 x = q[i]; # DR_MISALIGNMENT(q) = 0
1312 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1313 }
1314
1315 -- Possibility 3: combination of loop peeling and versioning:
1316 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1317 x = q[i];
1318 p[i] = y;
1319 }
1320 if (p is aligned) {
1321 for (i = 3; i<N; i++){ # loop 3A
1322 x = q[i]; # DR_MISALIGNMENT(q) = 0
1323 p[i] = y; # DR_MISALIGNMENT(p) = 0
1324 }
1325 }
1326 else {
1327 for (i = 3; i<N; i++){ # loop 3B
1328 x = q[i]; # DR_MISALIGNMENT(q) = 0
1329 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1330 }
1331 }
1332
1333 These loops are later passed to loop_transform to be vectorized. The
1334 vectorizer will use the alignment information to guide the transformation
1335 (whether to generate regular loads/stores, or with special handling for
1336 misalignment). */
1337
1338 bool
1339 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1340 {
1341 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1342 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1343 enum dr_alignment_support supportable_dr_alignment;
1344 struct data_reference *dr0 = NULL, *first_store = NULL;
1345 struct data_reference *dr;
1346 unsigned int i, j;
1347 bool do_peeling = false;
1348 bool do_versioning = false;
1349 bool stat;
1350 gimple stmt;
1351 stmt_vec_info stmt_info;
1352 int vect_versioning_for_alias_required;
1353 unsigned int npeel = 0;
1354 bool all_misalignments_unknown = true;
1355 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1356 unsigned possible_npeel_number = 1;
1357 tree vectype;
1358 unsigned int nelements, mis, same_align_drs_max = 0;
1359 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost();
1360
1361 if (dump_enabled_p ())
1362 dump_printf_loc (MSG_NOTE, vect_location,
1363 "=== vect_enhance_data_refs_alignment ===");
1364
1365 /* While cost model enhancements are expected in the future, the high level
1366 view of the code at this time is as follows:
1367
1368 A) If there is a misaligned access then see if peeling to align
1369 this access can make all data references satisfy
1370 vect_supportable_dr_alignment. If so, update data structures
1371 as needed and return true.
1372
1373 B) If peeling wasn't possible and there is a data reference with an
1374 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1375 then see if loop versioning checks can be used to make all data
1376 references satisfy vect_supportable_dr_alignment. If so, update
1377 data structures as needed and return true.
1378
1379 C) If neither peeling nor versioning were successful then return false if
1380 any data reference does not satisfy vect_supportable_dr_alignment.
1381
1382 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1383
1384 Note, Possibility 3 above (which is peeling and versioning together) is not
1385 being done at this time. */
1386
1387 /* (1) Peeling to force alignment. */
1388
1389 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1390 Considerations:
1391 + How many accesses will become aligned due to the peeling
1392 - How many accesses will become unaligned due to the peeling,
1393 and the cost of misaligned accesses.
1394 - The cost of peeling (the extra runtime checks, the increase
1395 in code size). */
1396
1397 FOR_EACH_VEC_ELT (datarefs, i, dr)
1398 {
1399 stmt = DR_STMT (dr);
1400 stmt_info = vinfo_for_stmt (stmt);
1401
1402 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1403 continue;
1404
1405 /* For interleaving, only the alignment of the first access
1406 matters. */
1407 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1408 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1409 continue;
1410
1411 /* For invariant accesses there is nothing to enhance. */
1412 if (integer_zerop (DR_STEP (dr)))
1413 continue;
1414
1415 /* Strided loads perform only component accesses, alignment is
1416 irrelevant for them. */
1417 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1418 continue;
1419
1420 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1421 do_peeling = vector_alignment_reachable_p (dr);
1422 if (do_peeling)
1423 {
1424 if (known_alignment_for_access_p (dr))
1425 {
1426 unsigned int npeel_tmp;
1427 bool negative = tree_int_cst_compare (DR_STEP (dr),
1428 size_zero_node) < 0;
1429
1430 /* Save info about DR in the hash table. */
1431 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1432 LOOP_VINFO_PEELING_HTAB (loop_vinfo) =
1433 htab_create (1, vect_peeling_hash,
1434 vect_peeling_hash_eq, free);
1435
1436 vectype = STMT_VINFO_VECTYPE (stmt_info);
1437 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1438 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1439 TREE_TYPE (DR_REF (dr))));
1440 npeel_tmp = (negative
1441 ? (mis - nelements) : (nelements - mis))
1442 & (nelements - 1);
1443
1444 /* For multiple types, it is possible that the bigger type access
1445 will have more than one peeling option. E.g., a loop with two
1446 types: one of size (vector size / 4), and the other one of
1447 size (vector size / 8). The vectorization factor will be 8. If both
1448 accesses are misaligned by 3, the first one needs one scalar
1449 iteration to be aligned, and the second one needs 5. But the
1450 first one will also be aligned by peeling 5 scalar
1451 iterations, and in that case both accesses will be aligned.
1452 Hence, except for the immediate peeling amount, we also want
1453 to try adding a full vector size, as long as we don't exceed the
1454 vectorization factor.
1455 We do this automatically for the cost model, since we calculate the cost
1456 for every peeling option. */
1457 if (!flag_vect_cost_model)
1458 possible_npeel_number = vf /nelements;
1459
1460 /* Handle the aligned case. We may decide to align some other
1461 access, making DR unaligned. */
1462 if (DR_MISALIGNMENT (dr) == 0)
1463 {
1464 npeel_tmp = 0;
1465 if (!flag_vect_cost_model)
1466 possible_npeel_number++;
1467 }
1468
1469 for (j = 0; j < possible_npeel_number; j++)
1470 {
1471 gcc_assert (npeel_tmp <= vf);
1472 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1473 npeel_tmp += nelements;
1474 }
1475
1476 all_misalignments_unknown = false;
1477 /* Data-ref that was chosen for the case that all the
1478 misalignments are unknown is not relevant anymore, since we
1479 have a data-ref with known alignment. */
1480 dr0 = NULL;
1481 }
1482 else
1483 {
1484 /* If we don't know all the misalignment values, we prefer
1485 peeling for the data-ref that has the maximum number of data-refs
1486 with the same alignment, unless the target prefers to align
1487 stores over loads. */
1488 if (all_misalignments_unknown)
1489 {
1490 if (same_align_drs_max
1491 < STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ()
1492 || !dr0)
1493 {
1494 same_align_drs_max
1495 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1496 dr0 = dr;
1497 }
1498
1499 if (!first_store && DR_IS_WRITE (dr))
1500 first_store = dr;
1501 }
1502
1503 /* If there are both known and unknown misaligned accesses in the
1504 loop, we choose peeling amount according to the known
1505 accesses. */
1506
1507
1508 if (!supportable_dr_alignment)
1509 {
1510 dr0 = dr;
1511 if (!first_store && DR_IS_WRITE (dr))
1512 first_store = dr;
1513 }
1514 }
1515 }
1516 else
1517 {
1518 if (!aligned_access_p (dr))
1519 {
1520 if (dump_enabled_p ())
1521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1522 "vector alignment may not be reachable");
1523 break;
1524 }
1525 }
1526 }
1527
1528 vect_versioning_for_alias_required
1529 = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
1530
1531 /* Temporarily, if versioning for alias is required, we disable peeling
1532 until we support peeling and versioning. Often peeling for alignment
1533 will require peeling for loop-bound, which in turn requires that we
1534 know how to adjust the loop ivs after the loop. */
1535 if (vect_versioning_for_alias_required
1536 || !vect_can_advance_ivs_p (loop_vinfo)
1537 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1538 do_peeling = false;
1539
1540 if (do_peeling && all_misalignments_unknown
1541 && vect_supportable_dr_alignment (dr0, false))
1542 {
1543
1544 /* Check whether the target prefers to align stores over loads, i.e., if
1545 misaligned stores are more expensive than misaligned loads (taking
1546 drs with the same alignment into account). */
1547 if (first_store && DR_IS_READ (dr0))
1548 {
1549 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1550 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1551 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1552 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1553 stmt_vector_for_cost dummy;
1554 dummy.create (2);
1555
1556 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1557 &dummy);
1558 vect_get_data_access_cost (first_store, &store_inside_cost,
1559 &store_outside_cost, &dummy);
1560
1561 dummy.release ();
1562
1563 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1564 aligning the load DR0). */
1565 load_inside_penalty = store_inside_cost;
1566 load_outside_penalty = store_outside_cost;
1567 for (i = 0;
1568 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1569 DR_STMT (first_store))).iterate (i, &dr);
1570 i++)
1571 if (DR_IS_READ (dr))
1572 {
1573 load_inside_penalty += load_inside_cost;
1574 load_outside_penalty += load_outside_cost;
1575 }
1576 else
1577 {
1578 load_inside_penalty += store_inside_cost;
1579 load_outside_penalty += store_outside_cost;
1580 }
1581
1582 /* Calculate the penalty for leaving DR0 unaligned (by
1583 aligning the FIRST_STORE). */
1584 store_inside_penalty = load_inside_cost;
1585 store_outside_penalty = load_outside_cost;
1586 for (i = 0;
1587 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1588 DR_STMT (dr0))).iterate (i, &dr);
1589 i++)
1590 if (DR_IS_READ (dr))
1591 {
1592 store_inside_penalty += load_inside_cost;
1593 store_outside_penalty += load_outside_cost;
1594 }
1595 else
1596 {
1597 store_inside_penalty += store_inside_cost;
1598 store_outside_penalty += store_outside_cost;
1599 }
1600
1601 if (load_inside_penalty > store_inside_penalty
1602 || (load_inside_penalty == store_inside_penalty
1603 && load_outside_penalty > store_outside_penalty))
1604 dr0 = first_store;
1605 }
1606
1607 /* In case there are only loads with different unknown misalignments, use
1608 peeling only if it may help to align other accesses in the loop. */
1609 if (!first_store
1610 && !STMT_VINFO_SAME_ALIGN_REFS (
1611 vinfo_for_stmt (DR_STMT (dr0))).length ()
1612 && vect_supportable_dr_alignment (dr0, false)
1613 != dr_unaligned_supported)
1614 do_peeling = false;
1615 }
1616
1617 if (do_peeling && !dr0)
1618 {
1619 /* Peeling is possible, but no data access strictly requires alignment in
1620 order to be vectorized. So we try to choose the best possible peeling. */
1621
1622 /* We should get here only if there are drs with known misalignment. */
1623 gcc_assert (!all_misalignments_unknown);
1624
1625 /* Choose the best peeling from the hash table. */
1626 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1627 &body_cost_vec);
1628 if (!dr0 || !npeel)
1629 do_peeling = false;
1630 }
1631
1632 if (do_peeling)
1633 {
1634 stmt = DR_STMT (dr0);
1635 stmt_info = vinfo_for_stmt (stmt);
1636 vectype = STMT_VINFO_VECTYPE (stmt_info);
1637 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1638
1639 if (known_alignment_for_access_p (dr0))
1640 {
1641 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1642 size_zero_node) < 0;
1643 if (!npeel)
1644 {
1645 /* Since it's known at compile time, compute the number of
1646 iterations in the peeled loop (the peeling factor) for use in
1647 updating DR_MISALIGNMENT values. The peeling factor is the
1648 vectorization factor minus the misalignment as an element
1649 count. */
1650 mis = DR_MISALIGNMENT (dr0);
1651 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1652 npeel = ((negative ? mis - nelements : nelements - mis)
1653 & (nelements - 1));
1654 }
1655
1656 /* For interleaved data access every iteration accesses all the
1657 members of the group, therefore we divide the number of iterations
1658 by the group size. */
1659 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1660 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1661 npeel /= GROUP_SIZE (stmt_info);
1662
1663 if (dump_enabled_p ())
1664 dump_printf_loc (MSG_NOTE, vect_location,
1665 "Try peeling by %d", npeel);
1666 }
1667
1668 /* Ensure that all data refs can be vectorized after the peel. */
1669 FOR_EACH_VEC_ELT (datarefs, i, dr)
1670 {
1671 int save_misalignment;
1672
1673 if (dr == dr0)
1674 continue;
1675
1676 stmt = DR_STMT (dr);
1677 stmt_info = vinfo_for_stmt (stmt);
1678 /* For interleaving, only the alignment of the first access
1679 matters. */
1680 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1681 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1682 continue;
1683
1684 /* Strided loads perform only component accesses, alignment is
1685 irrelevant for them. */
1686 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1687 continue;
1688
1689 save_misalignment = DR_MISALIGNMENT (dr);
1690 vect_update_misalignment_for_peel (dr, dr0, npeel);
1691 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1692 SET_DR_MISALIGNMENT (dr, save_misalignment);
1693
1694 if (!supportable_dr_alignment)
1695 {
1696 do_peeling = false;
1697 break;
1698 }
1699 }
1700
1701 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1702 {
1703 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1704 if (!stat)
1705 do_peeling = false;
1706 else
1707 {
1708 body_cost_vec.release ();
1709 return stat;
1710 }
1711 }
1712
1713 if (do_peeling)
1714 {
1715 stmt_info_for_cost *si;
1716 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1717
1718 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1719 If the misalignment of DR_i is identical to that of dr0 then set
1720 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1721 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1722 by the peeling factor times the element size of DR_i (MOD the
1723 vectorization factor times the size). Otherwise, the
1724 misalignment of DR_i must be set to unknown. */
1725 FOR_EACH_VEC_ELT (datarefs, i, dr)
1726 if (dr != dr0)
1727 vect_update_misalignment_for_peel (dr, dr0, npeel);
1728
1729 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1730 if (npeel)
1731 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1732 else
1733 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
1734 SET_DR_MISALIGNMENT (dr0, 0);
1735 if (dump_enabled_p ())
1736 {
1737 dump_printf_loc (MSG_NOTE, vect_location,
1738 "Alignment of access forced using peeling.");
1739 dump_printf_loc (MSG_NOTE, vect_location,
1740 "Peeling for alignment will be applied.");
1741 }
1742 /* We've delayed passing the inside-loop peeling costs to the
1743 target cost model until we were sure peeling would happen.
1744 Do so now. */
1745 if (body_cost_vec.exists ())
1746 {
1747 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1748 {
1749 struct _stmt_vec_info *stmt_info
1750 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1751 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1752 si->misalign, vect_body);
1753 }
1754 body_cost_vec.release ();
1755 }
1756
1757 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1758 gcc_assert (stat);
1759 return stat;
1760 }
1761 }
1762
1763 body_cost_vec.release ();
1764
1765 /* (2) Versioning to force alignment. */
1766
1767 /* Try versioning if:
1768 1) flag_tree_vect_loop_version is TRUE
1769 2) optimize loop for speed
1770 3) there is at least one unsupported misaligned data ref with an unknown
1771 misalignment, and
1772 4) all misaligned data refs with a known misalignment are supported, and
1773 5) the number of runtime alignment checks is within reason. */
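/* As an illustrative sketch (not the exact generated IL), the runtime
   check emitted for versioning with LOOP_VINFO_PTR_MASK == 15 tests
   something like

       if (((addr_a | addr_b) & 15) == 0)
         ... vectorized loop ...
       else
         ... scalar loop ...

   i.e. the ORed addresses of all may-misalign refs must have their low
   bits clear.  */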
1774
1775 do_versioning =
1776 flag_tree_vect_loop_version
1777 && optimize_loop_nest_for_speed_p (loop)
1778 && (!loop->inner); /* FORNOW */
1779
1780 if (do_versioning)
1781 {
1782 FOR_EACH_VEC_ELT (datarefs, i, dr)
1783 {
1784 stmt = DR_STMT (dr);
1785 stmt_info = vinfo_for_stmt (stmt);
1786
1787 /* For interleaving, only the alignment of the first access
1788 matters. */
1789 if (aligned_access_p (dr)
1790 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1791 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1792 continue;
1793
1794 /* Strided loads perform only component accesses; alignment is
1795 irrelevant for them. */
1796 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1797 continue;
1798
1799 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1800
1801 if (!supportable_dr_alignment)
1802 {
1803 gimple stmt;
1804 int mask;
1805 tree vectype;
1806
1807 if (known_alignment_for_access_p (dr)
1808 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1809 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1810 {
1811 do_versioning = false;
1812 break;
1813 }
1814
1815 stmt = DR_STMT (dr);
1816 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1817 gcc_assert (vectype);
1818
1819 /* The rightmost bits of an aligned address must be zeros.
1820 Construct the mask needed for this test. For example,
1821 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1822 mask must be 15 = 0xf. */
1823 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1824
1825 /* FORNOW: use the same mask to test all potentially unaligned
1826 references in the loop. The vectorizer currently supports
1827 a single vector size, see the reference to
1828 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1829 vectorization factor is computed. */
1830 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1831 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1832 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1833 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1834 DR_STMT (dr));
1835 }
1836 }
1837
1838 /* Versioning requires at least one misaligned data reference. */
1839 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1840 do_versioning = false;
1841 else if (!do_versioning)
1842 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1843 }
1844
1845 if (do_versioning)
1846 {
1847 vec<gimple> may_misalign_stmts
1848 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1849 gimple stmt;
1850
1851 /* It can now be assumed that the data references in the statements
1852 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1853 of the loop being vectorized. */
1854 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1855 {
1856 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1857 dr = STMT_VINFO_DATA_REF (stmt_info);
1858 SET_DR_MISALIGNMENT (dr, 0);
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_NOTE, vect_location,
1861 "Alignment of access forced using versioning.");
1862 }
1863
1864 if (dump_enabled_p ())
1865 dump_printf_loc (MSG_NOTE, vect_location,
1866 "Versioning for alignment will be applied.");
1867
1868 /* Peeling and versioning can't be done together at this time. */
1869 gcc_assert (! (do_peeling && do_versioning));
1870
1871 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1872 gcc_assert (stat);
1873 return stat;
1874 }
1875
1876 /* This point is reached if neither peeling nor versioning is being done. */
1877 gcc_assert (! (do_peeling || do_versioning));
1878
1879 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1880 return stat;
1881 }
1882
1883
1884 /* Function vect_find_same_alignment_drs.
1885
1886 Update group and alignment relations according to the chosen
1887 vectorization factor. */
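/* For example (hypothetical): with a vectorization factor of 4 and
   4-byte elements, accesses a[i] and a[i+4] have a dependence distance
   of 4, which is 0 modulo the VF, so they are recorded as having the
   same alignment.  */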
1888
1889 static void
1890 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1891 loop_vec_info loop_vinfo)
1892 {
1893 unsigned int i;
1894 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1895 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1896 struct data_reference *dra = DDR_A (ddr);
1897 struct data_reference *drb = DDR_B (ddr);
1898 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1899 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1900 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1901 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1902 lambda_vector dist_v;
1903 unsigned int loop_depth;
1904
1905 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1906 return;
1907
1908 if (dra == drb)
1909 return;
1910
1911 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1912 return;
1913
1914 /* Loop-based vectorization and known data dependence. */
1915 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1916 return;
1917
1918 /* Data-dependence analysis reports a distance vector of zero
1919 for data-references that overlap only in the first iteration
1920 but have different sign step (see PR45764).
1921 So as a sanity check require equal DR_STEP. */
1922 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1923 return;
1924
1925 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1926 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1927 {
1928 int dist = dist_v[loop_depth];
1929
1930 if (dump_enabled_p ())
1931 dump_printf_loc (MSG_NOTE, vect_location,
1932 "dependence distance = %d.", dist);
1933
1934 /* Same loop iteration. */
1935 if (dist == 0
1936 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1937 {
1938 /* Two references with distance zero have the same alignment. */
1939 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1940 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1941 if (dump_enabled_p ())
1942 {
1943 dump_printf_loc (MSG_NOTE, vect_location,
1944 "accesses have the same alignment.");
1945 dump_printf (MSG_NOTE,
1946 "dependence distance modulo vf == 0 between ");
1947 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1948 dump_printf (MSG_NOTE, " and ");
1949 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1950 }
1951 }
1952 }
1953 }
1954
1955
1956 /* Function vect_analyze_data_refs_alignment
1957
1958 Analyze the alignment of the data-references in the loop.
1959 Return FALSE if a data reference is found that cannot be vectorized. */
1960
1961 bool
1962 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1963 bb_vec_info bb_vinfo)
1964 {
1965 if (dump_enabled_p ())
1966 dump_printf_loc (MSG_NOTE, vect_location,
1967 "=== vect_analyze_data_refs_alignment ===");
1968
1969 /* Mark groups of data references with same alignment using
1970 data dependence information. */
1971 if (loop_vinfo)
1972 {
1973 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1974 struct data_dependence_relation *ddr;
1975 unsigned int i;
1976
1977 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1978 vect_find_same_alignment_drs (ddr, loop_vinfo);
1979 }
1980
1981 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1982 {
1983 if (dump_enabled_p ())
1984 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1985 "not vectorized: can't calculate alignment "
1986 "for data ref.");
1987 return false;
1988 }
1989
1990 return true;
1991 }
1992
1993
1994 /* Analyze groups of accesses: check that DR belongs to a group of
1995 accesses of legal size, step, etc. Detect gaps, single element
1996 interleaving, and other special cases. Set grouped access info.
1997 Collect groups of strided stores for further use in SLP analysis. */
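/* As a hypothetical illustration, for the loop

     for (i = 0; i < n; i++)
       {
         a[3*i]     = x;
         a[3*i + 1] = y;
         a[3*i + 2] = z;
       }

   each store has DR_STEP == 3 * sizeof (a[0]), so the three stores form
   a single interleaving group with GROUP_SIZE == 3 and no gaps.  */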
1998
1999 static bool
2000 vect_analyze_group_access (struct data_reference *dr)
2001 {
2002 tree step = DR_STEP (dr);
2003 tree scalar_type = TREE_TYPE (DR_REF (dr));
2004 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2005 gimple stmt = DR_STMT (dr);
2006 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2007 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2008 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2009 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2010 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2011 bool slp_impossible = false;
2012 struct loop *loop = NULL;
2013
2014 if (loop_vinfo)
2015 loop = LOOP_VINFO_LOOP (loop_vinfo);
2016
2017 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2018 size of the interleaving group (including gaps). */
2019 groupsize = dr_step / type_size;
2020
2021 /* A non-consecutive access is possible only if it is part of interleaving. */
2022 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2023 {
2024 /* Check whether this DR is part of an interleaving group, and is the
2025 single element of the group that is accessed in the loop. */
2026
2027 /* Gaps are supported only for loads. STEP must be a multiple of the type
2028 size. The size of the group must be a power of 2. */
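 	  /* E.g. (hypothetical): a load "... = a[2*i]" touching only every
 	     other element has DR_STEP == 2 * sizeof (a[0]) and no other member
 	     in its group, so it is detected as single element interleaving
 	     with GROUP_SIZE == 2 and a gap of one element.  */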
2029 if (DR_IS_READ (dr)
2030 && (dr_step % type_size) == 0
2031 && groupsize > 0
2032 && exact_log2 (groupsize) != -1)
2033 {
2034 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2035 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2036 if (dump_enabled_p ())
2037 {
2038 dump_printf_loc (MSG_NOTE, vect_location,
2039 "Detected single element interleaving ");
2040 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2041 dump_printf (MSG_NOTE, " step ");
2042 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2043 }
2044
2045 if (loop_vinfo)
2046 {
2047 if (dump_enabled_p ())
2048 dump_printf_loc (MSG_NOTE, vect_location,
2049 "Data access with gaps requires scalar "
2050 "epilogue loop");
2051 if (loop->inner)
2052 {
2053 if (dump_enabled_p ())
2054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2055 "Peeling for outer loop is not"
2056 " supported");
2057 return false;
2058 }
2059
2060 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2061 }
2062
2063 return true;
2064 }
2065
2066 if (dump_enabled_p ())
2067 {
2068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2069 "not consecutive access ");
2070 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2071 }
2072
2073 if (bb_vinfo)
2074 {
2075 /* Mark the statement as unvectorizable. */
2076 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2077 return true;
2078 }
2079
2080 return false;
2081 }
2082
2083 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2084 {
2085 /* First stmt in the interleaving chain. Check the chain. */
2086 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2087 struct data_reference *data_ref = dr;
2088 unsigned int count = 1;
2089 tree next_step;
2090 tree prev_init = DR_INIT (data_ref);
2091 gimple prev = stmt;
2092 HOST_WIDE_INT diff, count_in_bytes, gaps = 0;
2093
2094 while (next)
2095 {
2096 /* Skip same data-refs. In case two or more stmts share a
2097 data-ref (supported only for loads), we vectorize only the first
2098 stmt, and the rest get their vectorized loads from the first
2099 one. */
2100 if (!tree_int_cst_compare (DR_INIT (data_ref),
2101 DR_INIT (STMT_VINFO_DATA_REF (
2102 vinfo_for_stmt (next)))))
2103 {
2104 if (DR_IS_WRITE (data_ref))
2105 {
2106 if (dump_enabled_p ())
2107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2108 "Two store stmts share the same dr.");
2109 return false;
2110 }
2111
2112 /* Check that there are no load-store dependences for these loads,
2113 to prevent a load-store-load sequence to the same location. */
2114 if (GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (next))
2115 || GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (prev)))
2116 {
2117 if (dump_enabled_p ())
2118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2119 "READ_WRITE dependence in interleaving.");
2120 return false;
2121 }
2122
2123 /* For load use the same data-ref load. */
2124 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2125
2126 prev = next;
2127 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2128 continue;
2129 }
2130
2131 prev = next;
2132
2133 /* Check that all the accesses have the same STEP. */
2134 next_step = DR_STEP (STMT_VINFO_DATA_REF (vinfo_for_stmt (next)));
2135 if (tree_int_cst_compare (step, next_step))
2136 {
2137 if (dump_enabled_p ())
2138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2139 "not consecutive access in interleaving");
2140 return false;
2141 }
2142
2143 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2144 /* Check that the distance between two accesses is equal to the type
2145 size. Otherwise, we have gaps. */
2146 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2147 - TREE_INT_CST_LOW (prev_init)) / type_size;
2148 if (diff != 1)
2149 {
2150 /* FORNOW: SLP of accesses with gaps is not supported. */
2151 slp_impossible = true;
2152 if (DR_IS_WRITE (data_ref))
2153 {
2154 if (dump_enabled_p ())
2155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2156 "interleaved store with gaps");
2157 return false;
2158 }
2159
2160 gaps += diff - 1;
2161 }
2162
2163 last_accessed_element += diff;
2164
2165 /* Store the gap from the previous member of the group. If there is no
2166 gap in the access, GROUP_GAP is always 1. */
2167 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2168
2169 prev_init = DR_INIT (data_ref);
2170 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2171 /* Count the number of data-refs in the chain. */
2172 count++;
2173 }
2174
2175 /* COUNT is the number of accesses found; we multiply it by the size of
2176 the type to get COUNT_IN_BYTES. */
2177 count_in_bytes = type_size * count;
2178
2179 /* Check that the size of the interleaving (including gaps) is not
2180 greater than STEP. */
2181 if (dr_step && dr_step < count_in_bytes + gaps * type_size)
2182 {
2183 if (dump_enabled_p ())
2184 {
2185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2186 "interleaving size is greater than step for ");
2187 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dr));
2188 }
2189 return false;
2190 }
2191
2192 /* Check that the size of the interleaving is equal to STEP for stores,
2193 i.e., that there are no gaps. */
2194 if (dr_step && dr_step != count_in_bytes)
2195 {
2196 if (DR_IS_READ (dr))
2197 {
2198 slp_impossible = true;
2199 /* There is a gap after the last load in the group. This gap is a
2200 difference between the groupsize and the number of elements.
2201 When there is no gap, this difference should be 0. */
2202 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2203 }
2204 else
2205 {
2206 if (dump_enabled_p ())
2207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2208 "interleaved store with gaps");
2209 return false;
2210 }
2211 }
2212
2213 /* Check that STEP is a multiple of type size. */
2214 if (dr_step && (dr_step % type_size) != 0)
2215 {
2216 if (dump_enabled_p ())
2217 {
2218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2219 "step is not a multiple of type size: step ");
2220 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2221 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2222 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2223 TYPE_SIZE_UNIT (scalar_type));
2224 }
2225 return false;
2226 }
2227
2228 if (groupsize == 0)
2229 groupsize = count;
2230
2231 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2232 if (dump_enabled_p ())
2233 dump_printf_loc (MSG_NOTE, vect_location,
2234 "Detected interleaving of size %d", (int)groupsize);
2235
2236 /* SLP: create an SLP data structure for every interleaving group of
2237 stores for further analysis in vect_analyze_slp. */
2238 if (DR_IS_WRITE (dr) && !slp_impossible)
2239 {
2240 if (loop_vinfo)
2241 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2242 if (bb_vinfo)
2243 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2244 }
2245
2246 /* There is a gap at the end of the group. */
2247 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2248 {
2249 if (dump_enabled_p ())
2250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2251 "Data access with gaps requires scalar "
2252 "epilogue loop");
2253 if (loop->inner)
2254 {
2255 if (dump_enabled_p ())
2256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2257 "Peeling for outer loop is not supported");
2258 return false;
2259 }
2260
2261 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2262 }
2263 }
2264
2265 return true;
2266 }
2267
2268
2269 /* Analyze the access pattern of the data-reference DR.
2270 In case of non-consecutive accesses call vect_analyze_group_access() to
2271 analyze groups of accesses. */
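/* As a rough (hypothetical) illustration: with 4-byte elements, a[i] has
   DR_STEP == 4 and is consecutive; a[n-i] has DR_STEP == -4 and is
   treated as a reverse consecutive access; a[2*i] has DR_STEP == 8 and
   is handed to vect_analyze_group_access.  */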
2272
2273 static bool
2274 vect_analyze_data_ref_access (struct data_reference *dr)
2275 {
2276 tree step = DR_STEP (dr);
2277 tree scalar_type = TREE_TYPE (DR_REF (dr));
2278 gimple stmt = DR_STMT (dr);
2279 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2280 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2281 struct loop *loop = NULL;
2282
2283 if (loop_vinfo)
2284 loop = LOOP_VINFO_LOOP (loop_vinfo);
2285
2286 if (loop_vinfo && !step)
2287 {
2288 if (dump_enabled_p ())
2289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2290 "bad data-ref access in loop");
2291 return false;
2292 }
2293
2294 /* Allow invariant loads in loops. */
2295 if (loop_vinfo && integer_zerop (step))
2296 {
2297 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2298 return DR_IS_READ (dr);
2299 }
2300
2301 if (loop && nested_in_vect_loop_p (loop, stmt))
2302 {
2303 /* Interleaved accesses are not yet supported within outer-loop
2304 vectorization for references in the inner-loop. */
2305 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2306
2307 /* For the rest of the analysis we use the outer-loop step. */
2308 step = STMT_VINFO_DR_STEP (stmt_info);
2309 if (integer_zerop (step))
2310 {
2311 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_NOTE, vect_location,
2313 "zero step in outer loop.");
2314 if (DR_IS_READ (dr))
2315 return true;
2316 else
2317 return false;
2318 }
2319 }
2320
2321 /* Consecutive? */
2322 if (TREE_CODE (step) == INTEGER_CST)
2323 {
2324 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2325 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2326 || (dr_step < 0
2327 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2328 {
2329 /* Mark that it is not interleaving. */
2330 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2331 return true;
2332 }
2333 }
2334
2335 if (loop && nested_in_vect_loop_p (loop, stmt))
2336 {
2337 if (dump_enabled_p ())
2338 dump_printf_loc (MSG_NOTE, vect_location,
2339 "grouped access in outer loop.");
2340 return false;
2341 }
2342
2343 /* Assume this is a DR handled by the non-constant strided load case. */
2344 if (TREE_CODE (step) != INTEGER_CST)
2345 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2346
2347 /* Not consecutive access - check if it's a part of interleaving group. */
2348 return vect_analyze_group_access (dr);
2349 }
2350
2351 /* qsort comparison function for two data-references DRA and DRB, used to
2352 order them into chunks suitable for grouping. */
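/* For instance (a hypothetical case): data-refs a[i] and a[i+1] normally
   hash to the same base and offset, so they end up adjacent and are then
   ordered by DR_INIT (a[i] before a[i+1]), while references to a
   different base such as b[i] usually sort into a separate run.  */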
2353
2354 static int
2355 dr_group_sort_cmp (const void *dra_, const void *drb_)
2356 {
2357 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2358 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2359 hashval_t h1, h2;
2360 int cmp;
2361
2362 /* Stabilize sort. */
2363 if (dra == drb)
2364 return 0;
2365
2366 /* Ordering of DRs according to base. */
2367 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2368 {
2369 h1 = iterative_hash_expr (DR_BASE_ADDRESS (dra), 0);
2370 h2 = iterative_hash_expr (DR_BASE_ADDRESS (drb), 0);
2371 if (h1 != h2)
2372 return h1 < h2 ? -1 : 1;
2373 }
2374
2375 /* And according to DR_OFFSET. */
2376 if (!dr_equal_offsets_p (dra, drb))
2377 {
2378 h1 = iterative_hash_expr (DR_OFFSET (dra), 0);
2379 h2 = iterative_hash_expr (DR_OFFSET (drb), 0);
2380 if (h1 != h2)
2381 return h1 < h2 ? -1 : 1;
2382 }
2383
2384 /* Put reads before writes. */
2385 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2386 return DR_IS_READ (dra) ? -1 : 1;
2387
2388 /* Then sort after access size. */
2389 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2390 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2391 {
2392 h1 = iterative_hash_expr (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 0);
2393 h2 = iterative_hash_expr (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0);
2394 if (h1 != h2)
2395 return h1 < h2 ? -1 : 1;
2396 }
2397
2398 /* And after step. */
2399 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2400 {
2401 h1 = iterative_hash_expr (DR_STEP (dra), 0);
2402 h2 = iterative_hash_expr (DR_STEP (drb), 0);
2403 if (h1 != h2)
2404 return h1 < h2 ? -1 : 1;
2405 }
2406
2407 /* Then sort after DR_INIT. In case of identical DRs sort after stmt UID. */
2408 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2409 if (cmp == 0)
2410 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2411 return cmp;
2412 }
2413
2414 /* Function vect_analyze_data_ref_accesses.
2415
2416 Analyze the access pattern of all the data references in the loop.
2417
2418 FORNOW: the only access pattern that is considered vectorizable is a
2419 simple step 1 (consecutive) access.
2420
2421 FORNOW: handle only arrays and pointer accesses. */
2422
2423 bool
2424 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2425 {
2426 unsigned int i;
2427 vec<data_reference_p> datarefs;
2428 struct data_reference *dr;
2429
2430 if (dump_enabled_p ())
2431 dump_printf_loc (MSG_NOTE, vect_location,
2432 "=== vect_analyze_data_ref_accesses ===");
2433
2434 if (loop_vinfo)
2435 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2436 else
2437 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2438
2439 if (datarefs.is_empty ())
2440 return true;
2441
2442 /* Sort the array of datarefs to make building the interleaving chains
2443 linear. */
2444 qsort (datarefs.address(), datarefs.length (),
2445 sizeof (data_reference_p), dr_group_sort_cmp);
2446
2447 /* Build the interleaving chains. */
2448 for (i = 0; i < datarefs.length () - 1;)
2449 {
2450 data_reference_p dra = datarefs[i];
2451 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2452 stmt_vec_info lastinfo = NULL;
2453 for (i = i + 1; i < datarefs.length (); ++i)
2454 {
2455 data_reference_p drb = datarefs[i];
2456 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2457
2458 /* ??? Imperfect sorting (non-compatible types, non-modulo
2459 accesses, same accesses) can lead to a group being artificially
2460 split here as we don't just skip over those. If it really
2461 matters we can push those to a worklist and re-iterate
2462 over them. Then we can just skip ahead to the next DR here. */
2463
2464 /* Check that the data-refs have the same first location (except init)
2465 and that they are both either stores or loads (not a load and a store). */
2466 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2467 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2468 DR_BASE_ADDRESS (drb), 0)
2469 || !dr_equal_offsets_p (dra, drb))
2470 break;
2471
2472 /* Check that the data-refs have the same constant size and step. */
2473 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2474 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2475 if (!host_integerp (sza, 1)
2476 || !host_integerp (szb, 1)
2477 || !tree_int_cst_equal (sza, szb)
2478 || !host_integerp (DR_STEP (dra), 0)
2479 || !host_integerp (DR_STEP (drb), 0)
2480 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2481 break;
2482
2483 /* Do not place the same access in the interleaving chain twice. */
2484 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2485 break;
2486
2487 /* Check the types are compatible.
2488 ??? We don't distinguish this during sorting. */
2489 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2490 TREE_TYPE (DR_REF (drb))))
2491 break;
2492
2493 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2494 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2495 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2496 gcc_assert (init_a < init_b);
2497
2498 /* If init_b == init_a + the size of the type * k, we have an
2499 interleaving, and DRA is accessed before DRB. */
2500 HOST_WIDE_INT type_size_a = TREE_INT_CST_LOW (sza);
2501 if ((init_b - init_a) % type_size_a != 0)
2502 break;
2503
2504 /* To stay in one group, the step (if not zero) must be greater than the
2505 difference between the data-refs' inits; this splits groups into suitable sizes. */
2506 HOST_WIDE_INT step = TREE_INT_CST_LOW (DR_STEP (dra));
2507 if (step != 0 && step <= (init_b - init_a))
2508 break;
2509
2510 if (dump_enabled_p ())
2511 {
2512 dump_printf_loc (MSG_NOTE, vect_location,
2513 "Detected interleaving ");
2514 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2515 dump_printf (MSG_NOTE, " and ");
2516 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2517 }
2518
2519 /* Link the found element into the group list. */
2520 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2521 {
2522 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2523 lastinfo = stmtinfo_a;
2524 }
2525 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2526 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2527 lastinfo = stmtinfo_b;
2528 }
2529 }
2530
2531 FOR_EACH_VEC_ELT (datarefs, i, dr)
2532 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2533 && !vect_analyze_data_ref_access (dr))
2534 {
2535 if (dump_enabled_p ())
2536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2537 "not vectorized: complicated access pattern.");
2538
2539 if (bb_vinfo)
2540 {
2541 /* Mark the statement as not vectorizable. */
2542 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2543 continue;
2544 }
2545 else
2546 return false;
2547 }
2548
2549 return true;
2550 }
2551
2552 /* Function vect_prune_runtime_alias_test_list.
2553
2554 Prune a list of ddrs to be tested at run-time by versioning for alias.
2555 Return FALSE if the resulting list of ddrs is longer than allowed by
2556 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2557
2558 bool
2559 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2560 {
2561 vec<ddr_p> ddrs =
2562 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2563 unsigned i, j;
2564
2565 if (dump_enabled_p ())
2566 dump_printf_loc (MSG_NOTE, vect_location,
2567 "=== vect_prune_runtime_alias_test_list ===");
2568
2569 for (i = 0; i < ddrs.length (); )
2570 {
2571 bool found;
2572 ddr_p ddr_i;
2573
2574 ddr_i = ddrs[i];
2575 found = false;
2576
2577 for (j = 0; j < i; j++)
2578 {
2579 ddr_p ddr_j = ddrs[j];
2580
2581 if (vect_vfa_range_equal (ddr_i, ddr_j))
2582 {
2583 if (dump_enabled_p ())
2584 {
2585 dump_printf_loc (MSG_NOTE, vect_location,
2586 "found equal ranges ");
2587 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_i)));
2588 dump_printf (MSG_NOTE, ", ");
2589 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_i)));
2590 dump_printf (MSG_NOTE, " and ");
2591 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_j)));
2592 dump_printf (MSG_NOTE, ", ");
2593 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_j)));
2594 }
2595 found = true;
2596 break;
2597 }
2598 }
2599
2600 if (found)
2601 {
2602 ddrs.ordered_remove (i);
2603 continue;
2604 }
2605 i++;
2606 }
2607
2608 if (ddrs.length () >
2609 (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2610 {
2611 if (dump_enabled_p ())
2612 {
2613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2614 "disable versioning for alias - max number of "
2615 "generated checks exceeded.");
2616 }
2617
2618 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
2619
2620 return false;
2621 }
2622
2623 return true;
2624 }
2625
2626 /* Check whether a non-affine read in stmt is suitable for gather load
2627 and if so, return a builtin decl for that operation. */
2628
2629 tree
2630 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2631 tree *offp, int *scalep)
2632 {
2633 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2634 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2635 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2636 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2637 tree offtype = NULL_TREE;
2638 tree decl, base, off;
2639 enum machine_mode pmode;
2640 int punsignedp, pvolatilep;
2641
2642 /* The gather builtins need an address of the form
2643 loop_invariant + vector * {1, 2, 4, 8}
2644 or
2645 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2646 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2647 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2648 multiplications and additions in it. To get a vector, we need
2649 a single SSA_NAME that will be defined in the loop and will
2650 contain everything that is not loop invariant and that can be
2651 vectorized. The following code attempts to find such a preexisting
2652 SSA_NAME OFF and put the loop invariants into a tree BASE
2653 that can be gimplified before the loop. */
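  /* E.g. (hypothetical): for a read "x = f[idx[i]]" with "float f[]", the
     loop-invariant BASE ends up as &f plus any constant offset, OFF as
     the SSA_NAME holding the (possibly sign-extended) idx[i] value, and
     SCALE as 4, i.e. sizeof (float).  */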
2654 base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
2655 &pmode, &punsignedp, &pvolatilep, false);
2656 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2657
2658 if (TREE_CODE (base) == MEM_REF)
2659 {
2660 if (!integer_zerop (TREE_OPERAND (base, 1)))
2661 {
2662 if (off == NULL_TREE)
2663 {
2664 double_int moff = mem_ref_offset (base);
2665 off = double_int_to_tree (sizetype, moff);
2666 }
2667 else
2668 off = size_binop (PLUS_EXPR, off,
2669 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2670 }
2671 base = TREE_OPERAND (base, 0);
2672 }
2673 else
2674 base = build_fold_addr_expr (base);
2675
2676 if (off == NULL_TREE)
2677 off = size_zero_node;
2678
2679 /* If BASE is not loop invariant and OFF is 0, then we start with just
2680 the constant offset in the loop-invariant BASE and continue with base
2681 as OFF; otherwise give up.
2682 We could handle that case by gimplifying the addition of BASE + OFF
2683 into some SSA_NAME and using that as OFF, but for now punt. */
2684 if (!expr_invariant_in_loop_p (loop, base))
2685 {
2686 if (!integer_zerop (off))
2687 return NULL_TREE;
2688 off = base;
2689 base = size_int (pbitpos / BITS_PER_UNIT);
2690 }
2691 /* Otherwise put base + constant offset into the loop invariant BASE
2692 and continue with OFF. */
2693 else
2694 {
2695 base = fold_convert (sizetype, base);
2696 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
2697 }
2698
2699 /* OFF at this point may be either a SSA_NAME or some tree expression
2700 from get_inner_reference. Try to peel off loop invariants from it
2701 into BASE as long as possible. */
2702 STRIP_NOPS (off);
2703 while (offtype == NULL_TREE)
2704 {
2705 enum tree_code code;
2706 tree op0, op1, add = NULL_TREE;
2707
2708 if (TREE_CODE (off) == SSA_NAME)
2709 {
2710 gimple def_stmt = SSA_NAME_DEF_STMT (off);
2711
2712 if (expr_invariant_in_loop_p (loop, off))
2713 return NULL_TREE;
2714
2715 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
2716 break;
2717
2718 op0 = gimple_assign_rhs1 (def_stmt);
2719 code = gimple_assign_rhs_code (def_stmt);
2720 op1 = gimple_assign_rhs2 (def_stmt);
2721 }
2722 else
2723 {
2724 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
2725 return NULL_TREE;
2726 code = TREE_CODE (off);
2727 extract_ops_from_tree (off, &code, &op0, &op1);
2728 }
2729 switch (code)
2730 {
2731 case POINTER_PLUS_EXPR:
2732 case PLUS_EXPR:
2733 if (expr_invariant_in_loop_p (loop, op0))
2734 {
2735 add = op0;
2736 off = op1;
2737 do_add:
2738 add = fold_convert (sizetype, add);
2739 if (scale != 1)
2740 add = size_binop (MULT_EXPR, add, size_int (scale));
2741 base = size_binop (PLUS_EXPR, base, add);
2742 continue;
2743 }
2744 if (expr_invariant_in_loop_p (loop, op1))
2745 {
2746 add = op1;
2747 off = op0;
2748 goto do_add;
2749 }
2750 break;
2751 case MINUS_EXPR:
2752 if (expr_invariant_in_loop_p (loop, op1))
2753 {
2754 add = fold_convert (sizetype, op1);
2755 add = size_binop (MINUS_EXPR, size_zero_node, add);
2756 off = op0;
2757 goto do_add;
2758 }
2759 break;
2760 case MULT_EXPR:
2761 if (scale == 1 && host_integerp (op1, 0))
2762 {
2763 scale = tree_low_cst (op1, 0);
2764 off = op0;
2765 continue;
2766 }
2767 break;
2768 case SSA_NAME:
2769 off = op0;
2770 continue;
2771 CASE_CONVERT:
2772 if (!POINTER_TYPE_P (TREE_TYPE (op0))
2773 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2774 break;
2775 if (TYPE_PRECISION (TREE_TYPE (op0))
2776 == TYPE_PRECISION (TREE_TYPE (off)))
2777 {
2778 off = op0;
2779 continue;
2780 }
2781 if (TYPE_PRECISION (TREE_TYPE (op0))
2782 < TYPE_PRECISION (TREE_TYPE (off)))
2783 {
2784 off = op0;
2785 offtype = TREE_TYPE (off);
2786 STRIP_NOPS (off);
2787 continue;
2788 }
2789 break;
2790 default:
2791 break;
2792 }
2793 break;
2794 }
2795
2796 /* If at the end OFF still isn't a SSA_NAME or isn't
2797 defined in the loop, punt. */
2798 if (TREE_CODE (off) != SSA_NAME
2799 || expr_invariant_in_loop_p (loop, off))
2800 return NULL_TREE;
2801
2802 if (offtype == NULL_TREE)
2803 offtype = TREE_TYPE (off);
2804
2805 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
2806 offtype, scale);
2807 if (decl == NULL_TREE)
2808 return NULL_TREE;
2809
2810 if (basep)
2811 *basep = base;
2812 if (offp)
2813 *offp = off;
2814 if (scalep)
2815 *scalep = scale;
2816 return decl;
2817 }
2818
2819 /* Check whether a non-affine load in STMT (being in the loop referred to
2820 in LOOP_VINFO) is suitable for handling as a strided load, that is, if
2821 its address is a simple induction variable: the accessed object is loop
2822 invariant and the index (or base pointer) is a simple affine induction
2823 variable of the containing loop. Return TRUE if so, FALSE otherwise.
2824
2825 This handles ARRAY_REFs (with variant index) and MEM_REFs (with variant
2826 base pointer) only. */
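/* A hypothetical example of such an access, with STRIDE loop invariant
   but not a compile-time constant:

     for (i = 0; i < n; i++)
       {
         ... = a[j];
         j += stride;
       }

   The index j is a simple induction variable with a loop-invariant step,
   so the load can be handled as a strided load.  */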
2827
2828 static bool
2829 vect_check_strided_load (gimple stmt, loop_vec_info loop_vinfo)
2830 {
2831 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2832 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2833 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2834 tree base, off;
2835 affine_iv iv;
2836
2837 if (!DR_IS_READ (dr))
2838 return false;
2839
2840 base = DR_REF (dr);
2841
2842 if (TREE_CODE (base) == REALPART_EXPR
2843 || TREE_CODE (base) == IMAGPART_EXPR)
2844 base = TREE_OPERAND (base, 0);
2845
2846 if (TREE_CODE (base) == ARRAY_REF)
2847 {
2848 off = TREE_OPERAND (base, 1);
2849 base = TREE_OPERAND (base, 0);
2850 }
2851 else if (TREE_CODE (base) == MEM_REF)
2852 {
2853 off = TREE_OPERAND (base, 0);
2854 base = TREE_OPERAND (base, 1);
2855 }
2856 else
2857 return false;
2858
2859 if (TREE_CODE (off) != SSA_NAME)
2860 return false;
2861
2862 if (!expr_invariant_in_loop_p (loop, base)
2863 || !simple_iv (loop, loop_containing_stmt (stmt), off, &iv, true))
2864 return false;
2865
2866 return true;
2867 }
2868
2869 /* Function vect_analyze_data_refs.
2870
2871 Find all the data references in the loop or basic block.
2872
2873 The general structure of the analysis of data refs in the vectorizer is as
2874 follows:
2875 1- vect_analyze_data_refs(loop/bb): call
2876 compute_data_dependences_for_loop/bb to find and analyze all data-refs
2877 in the loop/bb and their dependences.
2878 2- vect_analyze_dependences(): apply dependence testing using ddrs.
2879 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
2880 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
2881
2882 */
2883
2884 bool
2885 vect_analyze_data_refs (loop_vec_info loop_vinfo,
2886 bb_vec_info bb_vinfo,
2887 int *min_vf)
2888 {
2889 struct loop *loop = NULL;
2890 basic_block bb = NULL;
2891 unsigned int i;
2892 vec<data_reference_p> datarefs;
2893 struct data_reference *dr;
2894 tree scalar_type;
2895
2896 if (dump_enabled_p ())
2897 dump_printf_loc (MSG_NOTE, vect_location,
2898 "=== vect_analyze_data_refs ===\n");
2899
2900 if (loop_vinfo)
2901 {
2902 loop = LOOP_VINFO_LOOP (loop_vinfo);
2903 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo))
2904 || find_data_references_in_loop
2905 (loop, &LOOP_VINFO_DATAREFS (loop_vinfo)))
2906 {
2907 if (dump_enabled_p ())
2908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2909 "not vectorized: loop contains function calls"
2910 " or data references that cannot be analyzed");
2911 return false;
2912 }
2913
2914 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2915 }
2916 else
2917 {
2918 gimple_stmt_iterator gsi;
2919
2920 bb = BB_VINFO_BB (bb_vinfo);
2921 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2922 {
2923 gimple stmt = gsi_stmt (gsi);
2924 if (!find_data_references_in_stmt (NULL, stmt,
2925 &BB_VINFO_DATAREFS (bb_vinfo)))
2926 {
2927 /* Mark the rest of the basic-block as unvectorizable. */
2928 for (; !gsi_end_p (gsi); gsi_next (&gsi))
2929 {
2930 stmt = gsi_stmt (gsi);
2931 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
2932 }
2933 break;
2934 }
2935 }
2936
2937 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2938 }
2939
2940 /* Go through the data-refs, check that the analysis succeeded. Update
2941 pointer from stmt_vec_info struct to DR and vectype. */
2942
2943 FOR_EACH_VEC_ELT (datarefs, i, dr)
2944 {
2945 gimple stmt;
2946 stmt_vec_info stmt_info;
2947 tree base, offset, init;
2948 bool gather = false;
2949 int vf;
2950
2951 if (!dr || !DR_REF (dr))
2952 {
2953 if (dump_enabled_p ())
2954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2955 "not vectorized: unhandled data-ref ");
2956 return false;
2957 }
2958
2959 stmt = DR_STMT (dr);
2960 stmt_info = vinfo_for_stmt (stmt);
2961
2962 /* Check that analysis of the data-ref succeeded. */
2963 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
2964 || !DR_STEP (dr))
2965 {
2966 /* If the target supports vector gather loads, see if they can
2967 be used. */
2968 if (loop_vinfo
2969 && DR_IS_READ (dr)
2970 && !TREE_THIS_VOLATILE (DR_REF (dr))
2971 && targetm.vectorize.builtin_gather != NULL
2972 && !nested_in_vect_loop_p (loop, stmt))
2973 {
2974 struct data_reference *newdr
2975 = create_data_ref (NULL, loop_containing_stmt (stmt),
2976 DR_REF (dr), stmt, true);
2977 gcc_assert (newdr != NULL && DR_REF (newdr));
2978 if (DR_BASE_ADDRESS (newdr)
2979 && DR_OFFSET (newdr)
2980 && DR_INIT (newdr)
2981 && DR_STEP (newdr)
2982 && integer_zerop (DR_STEP (newdr)))
2983 {
2984 dr = newdr;
2985 gather = true;
2986 }
2987 else
2988 free_data_ref (newdr);
2989 }
2990
2991 if (!gather)
2992 {
2993 if (dump_enabled_p ())
2994 {
2995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2996 "not vectorized: data ref analysis "
2997 "failed ");
2998 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2999 }
3000
3001 if (bb_vinfo)
3002 break;
3003
3004 return false;
3005 }
3006 }
3007
3008 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3009 {
3010 if (dump_enabled_p ())
3011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3012 "not vectorized: base addr of dr is a "
3013 "constant");
3014
3015 if (bb_vinfo)
3016 break;
3017
3018 if (gather)
3019 free_data_ref (dr);
3020 return false;
3021 }
3022
3023 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3024 {
3025 if (dump_enabled_p ())
3026 {
3027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3028 "not vectorized: volatile type ");
3029 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3030 }
3031
3032 if (bb_vinfo)
3033 break;
3034
3035 return false;
3036 }
3037
3038 if (stmt_can_throw_internal (stmt))
3039 {
3040 if (dump_enabled_p ())
3041 {
3042 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3043 "not vectorized: statement can throw an "
3044 "exception ");
3045 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3046 }
3047
3048 if (bb_vinfo)
3049 break;
3050
3051 if (gather)
3052 free_data_ref (dr);
3053 return false;
3054 }
3055
3056 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3057 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3058 {
3059 if (dump_enabled_p ())
3060 {
3061 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3062 "not vectorized: statement is bitfield "
3063 "access ");
3064 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3065 }
3066
3067 if (bb_vinfo)
3068 break;
3069
3070 if (gather)
3071 free_data_ref (dr);
3072 return false;
3073 }
3074
3075 base = unshare_expr (DR_BASE_ADDRESS (dr));
3076 offset = unshare_expr (DR_OFFSET (dr));
3077 init = unshare_expr (DR_INIT (dr));
3078
3079 if (is_gimple_call (stmt))
3080 {
3081 if (dump_enabled_p ())
3082 {
3083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3084 "not vectorized: dr in a call ");
3085 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3086 }
3087
3088 if (bb_vinfo)
3089 break;
3090
3091 if (gather)
3092 free_data_ref (dr);
3093 return false;
3094 }
3095
3096 /* Update DR field in stmt_vec_info struct. */
3097
3098 /* If the dataref is in an inner-loop of the loop that is considered
3099 for vectorization, we also want to analyze the access relative to
3100 the outer-loop (DR contains information only relative to the
3101 inner-most enclosing loop). We do that by building a reference to the
3102 first location accessed by the inner-loop, and analyze it relative to
3103 the outer-loop. */
3104 if (loop && nested_in_vect_loop_p (loop, stmt))
3105 {
3106 tree outer_step, outer_base, outer_init;
3107 HOST_WIDE_INT pbitsize, pbitpos;
3108 tree poffset;
3109 enum machine_mode pmode;
3110 int punsignedp, pvolatilep;
3111 affine_iv base_iv, offset_iv;
3112 tree dinit;
3113
3114 /* Build a reference to the first location accessed by the
3115 inner-loop: *(BASE+INIT). (The first location is actually
3116 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3117 tree inner_base = build_fold_indirect_ref
3118 (fold_build_pointer_plus (base, init));
3119
3120 if (dump_enabled_p ())
3121 {
3122 dump_printf_loc (MSG_NOTE, vect_location,
3123 "analyze in outer-loop: ");
3124 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3125 }
3126
3127 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3128 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3129 gcc_assert (outer_base != NULL_TREE);
3130
3131 if (pbitpos % BITS_PER_UNIT != 0)
3132 {
3133 if (dump_enabled_p ())
3134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3135 "failed: bit offset alignment.\n");
3136 return false;
3137 }
3138
3139 outer_base = build_fold_addr_expr (outer_base);
3140 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3141 &base_iv, false))
3142 {
3143 if (dump_enabled_p ())
3144 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3145 "failed: evolution of base is not affine.\n");
3146 return false;
3147 }
3148
3149 if (offset)
3150 {
3151 if (poffset)
3152 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3153 poffset);
3154 else
3155 poffset = offset;
3156 }
3157
3158 if (!poffset)
3159 {
3160 offset_iv.base = ssize_int (0);
3161 offset_iv.step = ssize_int (0);
3162 }
3163 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3164 &offset_iv, false))
3165 {
3166 if (dump_enabled_p ())
3167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3168 "evolution of offset is not affine.\n");
3169 return false;
3170 }
3171
3172 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3173 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3174 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3175 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3176 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3177
3178 outer_step = size_binop (PLUS_EXPR,
3179 fold_convert (ssizetype, base_iv.step),
3180 fold_convert (ssizetype, offset_iv.step));
3181
3182 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3183 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3184 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3185 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3186 STMT_VINFO_DR_OFFSET (stmt_info) =
3187 fold_convert (ssizetype, offset_iv.base);
3188 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3189 size_int (highest_pow2_factor (offset_iv.base));
3190
3191 if (dump_enabled_p ())
3192 {
3193 dump_printf_loc (MSG_NOTE, vect_location,
3194 "\touter base_address: ");
3195 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3196 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3197 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3198 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3199 STMT_VINFO_DR_OFFSET (stmt_info));
3200 dump_printf (MSG_NOTE,
3201 "\n\touter constant offset from base address: ");
3202 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3203 STMT_VINFO_DR_INIT (stmt_info));
3204 dump_printf (MSG_NOTE, "\n\touter step: ");
3205 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3206 STMT_VINFO_DR_STEP (stmt_info));
3207 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3208 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3209 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3210 }
3211 }
3212
3213 if (STMT_VINFO_DATA_REF (stmt_info))
3214 {
3215 if (dump_enabled_p ())
3216 {
3217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3218 "not vectorized: more than one data ref "
3219 "in stmt: ");
3220 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3221 }
3222
3223 if (bb_vinfo)
3224 break;
3225
3226 if (gather)
3227 free_data_ref (dr);
3228 return false;
3229 }
3230
3231 STMT_VINFO_DATA_REF (stmt_info) = dr;
3232
3233 /* Set vectype for STMT. */
3234 scalar_type = TREE_TYPE (DR_REF (dr));
3235 STMT_VINFO_VECTYPE (stmt_info) =
3236 get_vectype_for_scalar_type (scalar_type);
3237 if (!STMT_VINFO_VECTYPE (stmt_info))
3238 {
3239 if (dump_enabled_p ())
3240 {
3241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3242 "not vectorized: no vectype for stmt: ");
3243 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3244 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3245 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3246 scalar_type);
3247 }
3248
3249 if (bb_vinfo)
3250 break;
3251
3252 if (gather)
3253 {
3254 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3255 free_data_ref (dr);
3256 }
3257 return false;
3258 }
3259
3260 /* Adjust the minimal vectorization factor according to the
3261 vector type. */
3262 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3263 if (vf > *min_vf)
3264 *min_vf = vf;
3265
3266 if (gather)
3267 {
3268 tree off;
3269
3270 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3271 if (gather
3272 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3273 gather = false;
3274 if (!gather)
3275 {
3276 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3277 free_data_ref (dr);
3278 if (dump_enabled_p ())
3279 {
3280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3281 "not vectorized: not suitable for gather "
3282 "load ");
3283 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3284 }
3285 return false;
3286 }
3287
3288 datarefs[i] = dr;
3289 STMT_VINFO_GATHER_P (stmt_info) = true;
3290 }
3291 else if (loop_vinfo
3292 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3293 {
3294 bool strided_load = false;
3295 if (!nested_in_vect_loop_p (loop, stmt))
3296 strided_load = vect_check_strided_load (stmt, loop_vinfo);
3297 if (!strided_load)
3298 {
3299 if (dump_enabled_p ())
3300 {
3301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3302 "not vectorized: not suitable for strided "
3303 "load ");
3304 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3305 }
3306 return false;
3307 }
3308 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3309 }
3310 }
3311
3312 /* If, when trying to vectorize a basic-block, we stopped analysis at the
3313 first dataref we could not analyze, mark the rest of the datarefs as
3314 not vectorizable and truncate the vector of datarefs. That avoids
3315 spending useless time analyzing their dependences. */
3316 if (i != datarefs.length ())
3317 {
3318 gcc_assert (bb_vinfo != NULL);
3319 for (unsigned j = i; j < datarefs.length (); ++j)
3320 {
3321 data_reference_p dr = datarefs[j];
3322 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3323 free_data_ref (dr);
3324 }
3325 datarefs.truncate (i);
3326 }
3327
3328 return true;
3329 }
3330
3331
3332 /* Function vect_get_new_vect_var.
3333
3334 Returns a name for a new variable. The current naming scheme prepends
3335 the prefix "vect_", "stmp_" or "vect_p" (depending on the value of
3336 VAR_KIND) to the name of the vectorizer generated variable, using NAME
3337 as the suffix if provided. */
3338
3339 tree
3340 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3341 {
3342 const char *prefix;
3343 tree new_vect_var;
3344
3345 switch (var_kind)
3346 {
3347 case vect_simple_var:
3348 prefix = "vect_";
3349 break;
3350 case vect_scalar_var:
3351 prefix = "stmp_";
3352 break;
3353 case vect_pointer_var:
3354 prefix = "vect_p";
3355 break;
3356 default:
3357 gcc_unreachable ();
3358 }
3359
3360 if (name)
3361 {
3362 char* tmp = concat (prefix, name, NULL);
3363 new_vect_var = create_tmp_reg (type, tmp);
3364 free (tmp);
3365 }
3366 else
3367 new_vect_var = create_tmp_reg (type, prefix);
3368
3369 return new_vect_var;
3370 }
3371
3372
3373 /* Function vect_create_addr_base_for_vector_ref.
3374
3375 Create an expression that computes the address of the first memory location
3376 that will be accessed for a data reference.
3377
3378 Input:
3379 STMT: The statement containing the data reference.
3380 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3381 OFFSET: Optional. If supplied, it is added to the initial address.
3382 LOOP: Specify relative to which loop-nest should the address be computed.
3383 For example, when the dataref is in an inner-loop nested in an
3384 outer-loop that is now being vectorized, LOOP can be either the
3385 outer-loop, or the inner-loop. The first memory location accessed
3386 by the following dataref ('in' points to short):
3387
3388 for (i=0; i<N; i++)
3389 for (j=0; j<M; j++)
3390 s += in[i+j]
3391
3392 is as follows:
3393 if LOOP=i_loop: &in (relative to i_loop)
3394 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3395
3396 Output:
3397 1. Return an SSA_NAME whose value is the address of the memory location of
3398 the first vector of the data reference.
3399 2. If NEW_STMT_LIST is not NULL_TREE after return, then the caller must
3400 insert these statements, which define the returned SSA_NAME.
3401
3402 FORNOW: We are only handling array accesses with step 1. */
3403
3404 tree
3405 vect_create_addr_base_for_vector_ref (gimple stmt,
3406 gimple_seq *new_stmt_list,
3407 tree offset,
3408 struct loop *loop)
3409 {
3410 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3411 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3412 tree data_ref_base;
3413 const char *base_name;
3414 tree addr_base;
3415 tree dest;
3416 gimple_seq seq = NULL;
3417 tree base_offset;
3418 tree init;
3419 tree vect_ptr_type;
3420 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3421 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3422
3423 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3424 {
3425 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3426
3427 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3428
3429 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3430 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3431 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3432 }
3433 else
3434 {
3435 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3436 base_offset = unshare_expr (DR_OFFSET (dr));
3437 init = unshare_expr (DR_INIT (dr));
3438 }
3439
3440 if (loop_vinfo)
3441 base_name = get_name (data_ref_base);
3442 else
3443 {
3444 base_offset = ssize_int (0);
3445 init = ssize_int (0);
3446 base_name = get_name (DR_REF (dr));
3447 }
3448
3449 /* Create base_offset */
3450 base_offset = size_binop (PLUS_EXPR,
3451 fold_convert (sizetype, base_offset),
3452 fold_convert (sizetype, init));
3453
3454 if (offset)
3455 {
3456 offset = fold_build2 (MULT_EXPR, sizetype,
3457 fold_convert (sizetype, offset), step);
3458 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3459 base_offset, offset);
3460 }
3461
3462 /* base + base_offset */
3463 if (loop_vinfo)
3464 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3465 else
3466 {
3467 addr_base = build1 (ADDR_EXPR,
3468 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3469 unshare_expr (DR_REF (dr)));
3470 }
3471
3472 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3473 addr_base = fold_convert (vect_ptr_type, addr_base);
3474 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3475 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3476 gimple_seq_add_seq (new_stmt_list, seq);
3477
3478 if (DR_PTR_INFO (dr)
3479 && TREE_CODE (addr_base) == SSA_NAME)
3480 {
3481 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3482 if (offset)
3483 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3484 }
3485
3486 if (dump_enabled_p ())
3487 {
3488 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3489 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3490 }
3491
3492 return addr_base;
3493 }
3494
3495
3496 /* Function vect_create_data_ref_ptr.
3497
3498 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3499 location accessed in the loop by STMT, along with the def-use update
3500 chain to appropriately advance the pointer through the loop iterations.
3501 Also set aliasing information for the pointer. This pointer is used by
3502 the callers to this function to create a memory reference expression for
3503 vector load/store access.
3504
3505 Input:
3506 1. STMT: a stmt that references memory. Expected to be of the form
3507 GIMPLE_ASSIGN <name, data-ref> or
3508 GIMPLE_ASSIGN <data-ref, name>.
3509 2. AGGR_TYPE: the type of the reference, which should be either a vector
3510 or an array.
3511 3. AT_LOOP: the loop where the vector memref is to be created.
3512 4. OFFSET (optional): an offset to be added to the initial address accessed
3513 by the data-ref in STMT.
3514 5. GSI: location where the new stmts are to be placed if there is no loop
3515 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3516 pointing to the initial address.
3517
3518 Output:
3519 1. Declare a new ptr to vector_type, and have it point to the base of the
3520 data reference (the initial address accessed by the data reference).
3521 For example, for vector of type V8HI, the following code is generated:
3522
3523 v8hi *ap;
3524 ap = (v8hi *)initial_address;
3525
3526 if OFFSET is not supplied:
3527 initial_address = &a[init];
3528 if OFFSET is supplied:
3529 initial_address = &a[init + OFFSET];
3530
3531 Return the initial_address in INITIAL_ADDRESS.
3532
3533 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3534 update the pointer in each iteration of the loop.
3535
3536 Return the increment stmt that updates the pointer in PTR_INCR.
3537
3538 3. Set INV_P to true if the access pattern of the data reference in the
3539 vectorized loop is invariant. Set it to false otherwise.
3540
3541 4. Return the pointer. */
3542
3543 tree
3544 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3545 tree offset, tree *initial_address,
3546 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3547 bool only_init, bool *inv_p)
3548 {
3549 const char *base_name;
3550 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3551 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3552 struct loop *loop = NULL;
3553 bool nested_in_vect_loop = false;
3554 struct loop *containing_loop = NULL;
3555 tree aggr_ptr_type;
3556 tree aggr_ptr;
3557 tree new_temp;
3558 gimple vec_stmt;
3559 gimple_seq new_stmt_list = NULL;
3560 edge pe = NULL;
3561 basic_block new_bb;
3562 tree aggr_ptr_init;
3563 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3564 tree aptr;
3565 gimple_stmt_iterator incr_gsi;
3566 bool insert_after;
3567 bool negative;
3568 tree indx_before_incr, indx_after_incr;
3569 gimple incr;
3570 tree step;
3571 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3572
3573 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3574 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3575
3576 if (loop_vinfo)
3577 {
3578 loop = LOOP_VINFO_LOOP (loop_vinfo);
3579 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3580 containing_loop = (gimple_bb (stmt))->loop_father;
3581 pe = loop_preheader_edge (loop);
3582 }
3583 else
3584 {
3585 gcc_assert (bb_vinfo);
3586 only_init = true;
3587 *ptr_incr = NULL;
3588 }
3589
3590 /* Check the step (evolution) of the load in LOOP, and record
3591 whether it's invariant. */
3592 if (nested_in_vect_loop)
3593 step = STMT_VINFO_DR_STEP (stmt_info);
3594 else
3595 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
3596
3597 if (tree_int_cst_compare (step, size_zero_node) == 0)
3598 *inv_p = true;
3599 else
3600 *inv_p = false;
3601 negative = tree_int_cst_compare (step, size_zero_node) < 0;
3602
3603 /* Create an expression for the first address accessed by this load
3604 in LOOP. */
3605 base_name = get_name (DR_BASE_ADDRESS (dr));
3606
3607 if (dump_enabled_p ())
3608 {
3609 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
3610 dump_printf_loc (MSG_NOTE, vect_location,
3611 "create %s-pointer variable to type: ",
3612 tree_code_name[(int) TREE_CODE (aggr_type)]);
3613 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
3614 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
3615 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
3616 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
3617 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
3618 else
3619 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
3620 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
3621 }
3622
3623 /* (1) Create the new aggregate-pointer variable.
3624 Vector and array types inherit the alias set of their component
3625 type by default so we need to use a ref-all pointer if the data
3626 reference does not conflict with the created aggregated data
3627 reference because it is not addressable. */
3628 bool need_ref_all = false;
3629 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3630 get_alias_set (DR_REF (dr))))
3631 need_ref_all = true;
3632 /* Likewise for any of the data references in the stmt group. */
3633 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
3634 {
3635 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
3636 do
3637 {
3638 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
3639 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
3640 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3641 get_alias_set (DR_REF (sdr))))
3642 {
3643 need_ref_all = true;
3644 break;
3645 }
3646 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
3647 }
3648 while (orig_stmt);
3649 }
3650 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
3651 need_ref_all);
3652 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
3653
3654
3655 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
3656 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
3657 def-use update cycles for the pointer: one relative to the outer-loop
3658 (LOOP), which is what steps (3) and (4) below do. The other is relative
3659 to the inner-loop (which is the inner-most loop containing the dataref),
3660 and this is done by step (5) below.
3661
3662 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
3663 inner-most loop, and so steps (3),(4) work the same, and step (5) is
3664 redundant. Steps (3),(4) create the following:
3665
3666 vp0 = &base_addr;
3667 LOOP: vp1 = phi(vp0,vp2)
3668 ...
3669 ...
3670 vp2 = vp1 + step
3671 goto LOOP
3672
3673 If there is an inner-loop nested in loop, then step (5) will also be
3674 applied, and an additional update in the inner-loop will be created:
3675
3676 vp0 = &base_addr;
3677 LOOP: vp1 = phi(vp0,vp2)
3678 ...
3679 inner: vp3 = phi(vp1,vp4)
3680 vp4 = vp3 + inner_step
3681 if () goto inner
3682 ...
3683 vp2 = vp1 + step
3684 if () goto LOOP */
3685
3686 /* (2) Calculate the initial address of the aggregate-pointer, and set
3687 the aggregate-pointer to point to it before the loop. */
3688
3689 /* Create: &(base[init_val+offset]) in the loop preheader. */
3690
3691 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
3692 offset, loop);
3693 if (new_stmt_list)
3694 {
3695 if (pe)
3696 {
3697 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
3698 gcc_assert (!new_bb);
3699 }
3700 else
3701 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
3702 }
3703
3704 *initial_address = new_temp;
3705
3706 /* Create: p = (aggr_type *) initial_base */
3707 if (TREE_CODE (new_temp) != SSA_NAME
3708 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
3709 {
3710 vec_stmt = gimple_build_assign (aggr_ptr,
3711 fold_convert (aggr_ptr_type, new_temp));
3712 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
3713 /* Copy the points-to information if it exists. */
3714 if (DR_PTR_INFO (dr))
3715 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
3716 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
3717 if (pe)
3718 {
3719 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
3720 gcc_assert (!new_bb);
3721 }
3722 else
3723 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
3724 }
3725 else
3726 aggr_ptr_init = new_temp;
3727
3728 /* (3) Handle the updating of the aggregate-pointer inside the loop.
3729 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
3730 inner-loop nested in LOOP (during outer-loop vectorization). */
3731
3732 /* No update in loop is required. */
3733 if (only_init && (!loop_vinfo || at_loop == loop))
3734 aptr = aggr_ptr_init;
3735 else
3736 {
3737 /* The step of the aggregate pointer is the type size. */
3738 tree step = TYPE_SIZE_UNIT (aggr_type);
3739 /* One exception to the above is when the scalar step of the load in
3740 LOOP is zero. In this case the step here is also zero. */
3741 if (*inv_p)
3742 step = size_zero_node;
3743 else if (negative)
3744 step = fold_build1 (NEGATE_EXPR, TREE_TYPE (step), step);
3745
3746 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
3747
3748 create_iv (aggr_ptr_init,
3749 fold_convert (aggr_ptr_type, step),
3750 aggr_ptr, loop, &incr_gsi, insert_after,
3751 &indx_before_incr, &indx_after_incr);
3752 incr = gsi_stmt (incr_gsi);
3753 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3754
3755 /* Copy the points-to information if it exists. */
3756 if (DR_PTR_INFO (dr))
3757 {
3758 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3759 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3760 }
3761 if (ptr_incr)
3762 *ptr_incr = incr;
3763
3764 aptr = indx_before_incr;
3765 }
3766
3767 if (!nested_in_vect_loop || only_init)
3768 return aptr;
3769
3770
3771 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
3772 nested in LOOP, if it exists. */
3773
3774 gcc_assert (nested_in_vect_loop);
3775 if (!only_init)
3776 {
3777 standard_iv_increment_position (containing_loop, &incr_gsi,
3778 &insert_after);
3779 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
3780 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
3781 &indx_after_incr);
3782 incr = gsi_stmt (incr_gsi);
3783 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3784
3785 /* Copy the points-to information if it exists. */
3786 if (DR_PTR_INFO (dr))
3787 {
3788 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3789 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3790 }
3791 if (ptr_incr)
3792 *ptr_incr = incr;
3793
3794 return indx_before_incr;
3795 }
3796 else
3797 gcc_unreachable ();
3798 }
3799
3800
3801 /* Function bump_vector_ptr.
3802
3803 Increment a pointer (to a vector type) by vector-size. If requested,
3804 i.e. if PTR_INCR is given, then also connect the new increment stmt
3805 to the existing def-use update-chain of the pointer, by modifying
3806 the PTR_INCR as illustrated below:
3807
3808 The pointer def-use update-chain before this function:
3809 DATAREF_PTR = phi (p_0, p_2)
3810 ....
3811 PTR_INCR: p_2 = DATAREF_PTR + step
3812
3813 The pointer def-use update-chain after this function:
3814 DATAREF_PTR = phi (p_0, p_2)
3815 ....
3816 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
3817 ....
3818 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
3819
3820 Input:
3821 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
3822 in the loop.
3823 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
3824 the loop. The increment amount across iterations is expected
3825 to be vector_size.
3826 BSI - location where the new update stmt is to be placed.
3827 STMT - the original scalar memory-access stmt that is being vectorized.
3828 BUMP - optional. The offset by which to bump the pointer. If not given,
3829 the offset is assumed to be vector_size.
3830
3831 Output: Return NEW_DATAREF_PTR as illustrated above.
3832
3833 */
3834
3835 tree
3836 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
3837 gimple stmt, tree bump)
3838 {
3839 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3840 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3841 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3842 tree update = TYPE_SIZE_UNIT (vectype);
3843 gimple incr_stmt;
3844 ssa_op_iter iter;
3845 use_operand_p use_p;
3846 tree new_dataref_ptr;
3847
3848 if (bump)
3849 update = bump;
3850
3851 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
3852 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
3853 dataref_ptr, update);
3854 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
3855
3856 /* Copy the points-to information if it exists. */
3857 if (DR_PTR_INFO (dr))
3858 {
3859 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
3860 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
3861 }
3862
3863 if (!ptr_incr)
3864 return new_dataref_ptr;
3865
3866 /* Update the vector-pointer's cross-iteration increment. */
3867 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
3868 {
3869 tree use = USE_FROM_PTR (use_p);
3870
3871 if (use == dataref_ptr)
3872 SET_USE (use_p, new_dataref_ptr);
3873 else
3874 gcc_assert (tree_int_cst_compare (use, update) == 0);
3875 }
3876
3877 return new_dataref_ptr;
3878 }
3879
3880
3881 /* Function vect_create_destination_var.
3882
3883 Create a new temporary of type VECTYPE. */
3884
3885 tree
3886 vect_create_destination_var (tree scalar_dest, tree vectype)
3887 {
3888 tree vec_dest;
3889 const char *new_name;
3890 tree type;
3891 enum vect_var_kind kind;
3892
3893 kind = vectype ? vect_simple_var : vect_scalar_var;
3894 type = vectype ? vectype : TREE_TYPE (scalar_dest);
3895
3896 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
3897
3898 new_name = get_name (scalar_dest);
3899 if (!new_name)
3900 new_name = "var_";
3901 vec_dest = vect_get_new_vect_var (type, kind, new_name);
3902
3903 return vec_dest;
3904 }
3905
3906 /* Function vect_grouped_store_supported.
3907
3908 Returns TRUE if interleave high and interleave low permutations
3909 are supported, and FALSE otherwise. */
3910
3911 bool
3912 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
3913 {
3914 enum machine_mode mode = TYPE_MODE (vectype);
3915
3916 /* vect_permute_store_chain requires the group size to be a power of two. */
3917 if (exact_log2 (count) == -1)
3918 {
3919 if (dump_enabled_p ())
3920 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3921 "the size of the group of accesses"
3922 " is not a power of 2");
3923 return false;
3924 }
3925
3926 /* Check that the permutation is supported. */
3927 if (VECTOR_MODE_P (mode))
3928 {
3929 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3930 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3931 for (i = 0; i < nelt / 2; i++)
3932 {
3933 sel[i * 2] = i;
3934 sel[i * 2 + 1] = i + nelt;
3935 }
3936 if (can_vec_perm_p (mode, false, sel))
3937 {
3938 for (i = 0; i < nelt; i++)
3939 sel[i] += nelt / 2;
3940 if (can_vec_perm_p (mode, false, sel))
3941 return true;
3942 }
3943 }
3944
3945 if (dump_enabled_p ())
3946 dump_printf (MSG_MISSED_OPTIMIZATION,
3947 "interleave op not supported by target.");
3948 return false;
3949 }
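
/* Illustrative sketch only (not part of GCC): the two selectors the check
   above builds for a hypothetical 8-element vector.  Selector values below
   nelt pick from the first input vector, values nelt..2*nelt-1 from the
   second.  */
#if 0
#include <stdio.h>

int
main (void)
{
  unsigned int nelt = 8, sel[8], i;

  for (i = 0; i < nelt / 2; i++)
    {
      sel[i * 2] = i;             /* element i of the first vector */
      sel[i * 2 + 1] = i + nelt;  /* element i of the second vector */
    }
  for (i = 0; i < nelt; i++)      /* "interleave high": 0 8 1 9 2 10 3 11 */
    printf ("%u ", sel[i]);
  printf ("\n");

  for (i = 0; i < nelt; i++)
    sel[i] += nelt / 2;
  for (i = 0; i < nelt; i++)      /* "interleave low": 4 12 5 13 6 14 7 15 */
    printf ("%u ", sel[i]);
  printf ("\n");
  return 0;
}
#endif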
3950
3951
3952 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
3953 type VECTYPE. */
3954
3955 bool
3956 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
3957 {
3958 return vect_lanes_optab_supported_p ("vec_store_lanes",
3959 vec_store_lanes_optab,
3960 vectype, count);
3961 }
3962
3963
3964 /* Function vect_permute_store_chain.
3965
3966 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
3967 a power of 2, generate interleave_high/low stmts to reorder the data
3968 correctly for the stores. Return the final references for stores in
3969 RESULT_CHAIN.
3970
3971 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3972 The input is 4 vectors each containing 8 elements. We assign a number to
3973 each element, the input sequence is:
3974
3975 1st vec: 0 1 2 3 4 5 6 7
3976 2nd vec: 8 9 10 11 12 13 14 15
3977 3rd vec: 16 17 18 19 20 21 22 23
3978 4th vec: 24 25 26 27 28 29 30 31
3979
3980 The output sequence should be:
3981
3982 1st vec: 0 8 16 24 1 9 17 25
3983 2nd vec: 2 10 18 26 3 11 19 27
3984 3rd vec: 4 12 20 28 5 13 21 29
3985 4th vec: 6 14 22 30 7 15 23 31
3986
3987 i.e., we interleave the contents of the four vectors in their order.
3988
3989 We use interleave_high/low instructions to create such output. The input of
3990 each interleave_high/low operation is two vectors:
3991 1st vec 2nd vec
3992 0 1 2 3 4 5 6 7
3993 the even elements of the result vector are obtained left-to-right from the
3994 high/low elements of the first vector. The odd elements of the result are
3995 obtained left-to-right from the high/low elements of the second vector.
3996 The output of interleave_high will be: 0 4 1 5
3997 and of interleave_low: 2 6 3 7
3998
3999
4000 The permutation is done in log LENGTH stages. In each stage interleave_high
4001 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4002 where the first argument is taken from the first half of DR_CHAIN and the
4003 second argument from its second half.
4004 In our example,
4005
4006 I1: interleave_high (1st vec, 3rd vec)
4007 I2: interleave_low (1st vec, 3rd vec)
4008 I3: interleave_high (2nd vec, 4th vec)
4009 I4: interleave_low (2nd vec, 4th vec)
4010
4011 The output for the first stage is:
4012
4013 I1: 0 16 1 17 2 18 3 19
4014 I2: 4 20 5 21 6 22 7 23
4015 I3: 8 24 9 25 10 26 11 27
4016 I4: 12 28 13 29 14 30 15 31
4017
4018 The output of the second stage, i.e. the final result is:
4019
4020 I1: 0 8 16 24 1 9 17 25
4021 I2: 2 10 18 26 3 11 19 27
4022 I3: 4 12 20 28 5 13 21 29
4023 I4: 6 14 22 30 7 15 23 31. */
4024
4025 void
4026 vect_permute_store_chain (vec<tree> dr_chain,
4027 unsigned int length,
4028 gimple stmt,
4029 gimple_stmt_iterator *gsi,
4030 vec<tree> *result_chain)
4031 {
4032 tree vect1, vect2, high, low;
4033 gimple perm_stmt;
4034 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4035 tree perm_mask_low, perm_mask_high;
4036 unsigned int i, n;
4037 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4038 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4039
4040 result_chain->quick_grow (length);
4041 memcpy (result_chain->address (), dr_chain.address (),
4042 length * sizeof (tree));
4043
4044 for (i = 0, n = nelt / 2; i < n; i++)
4045 {
4046 sel[i * 2] = i;
4047 sel[i * 2 + 1] = i + nelt;
4048 }
4049 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4050 gcc_assert (perm_mask_high != NULL);
4051
4052 for (i = 0; i < nelt; i++)
4053 sel[i] += nelt / 2;
4054 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4055 gcc_assert (perm_mask_low != NULL);
4056
4057 for (i = 0, n = exact_log2 (length); i < n; i++)
4058 {
4059 for (j = 0; j < length/2; j++)
4060 {
4061 vect1 = dr_chain[j];
4062 vect2 = dr_chain[j+length/2];
4063
4064 /* Create interleaving stmt:
4065 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4066 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4067 perm_stmt
4068 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4069 vect1, vect2, perm_mask_high);
4070 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4071 (*result_chain)[2*j] = high;
4072
4073 /* Create interleaving stmt:
4074 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4075 nelt*3/2+1, ...}> */
4076 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4077 perm_stmt
4078 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4079 vect1, vect2, perm_mask_low);
4080 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4081 (*result_chain)[2*j+1] = low;
4082 }
4083 memcpy (dr_chain.address (), result_chain->address (),
4084 length * sizeof (tree));
4085 }
4086 }
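
/* Illustrative sketch only (not part of GCC): a scalar model of the single
   interleave step each VEC_PERM_EXPR above performs.  NELT and the element
   type are arbitrary choices for the example.  */
#if 0
#define NELT 8

static void
example_interleave_step (const short *vect1, const short *vect2,
                         short *high, short *low)
{
  unsigned int i;
  for (i = 0; i < NELT / 2; i++)
    {
      /* perm_mask_high = {0, NELT, 1, NELT+1, ...}  */
      high[2 * i] = vect1[i];
      high[2 * i + 1] = vect2[i];
      /* perm_mask_low = {NELT/2, NELT*3/2, NELT/2+1, ...}  */
      low[2 * i] = vect1[i + NELT / 2];
      low[2 * i + 1] = vect2[i + NELT / 2];
    }
}
#endif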
4087
4088 /* Function vect_setup_realignment.
4089
4090 This function is called when vectorizing an unaligned load using
4091 the dr_explicit_realign[_optimized] scheme.
4092 This function generates the following code at the loop prolog:
4093
4094 p = initial_addr;
4095 x msq_init = *(floor(p)); # prolog load
4096 realignment_token = call target_builtin;
4097 loop:
4098 x msq = phi (msq_init, ---)
4099
4100 The stmts marked with x are generated only for the case of
4101 dr_explicit_realign_optimized.
4102
4103 The code above sets up a new (vector) pointer, pointing to the first
4104 location accessed by STMT, and a "floor-aligned" load using that pointer.
4105 It also generates code to compute the "realignment-token" (if the relevant
4106 target hook was defined), and creates a phi-node at the loop-header bb
4107 whose arguments are the result of the prolog-load (created by this
4108 function) and the result of a load that takes place in the loop (to be
4109 created by the caller to this function).
4110
4111 For the case of dr_explicit_realign_optimized:
4112 The caller to this function uses the phi-result (msq) to create the
4113 realignment code inside the loop, and sets up the missing phi argument,
4114 as follows:
4115 loop:
4116 msq = phi (msq_init, lsq)
4117 lsq = *(floor(p')); # load in loop
4118 result = realign_load (msq, lsq, realignment_token);
4119
4120 For the case of dr_explicit_realign:
4121 loop:
4122 msq = *(floor(p)); # load in loop
4123 p' = p + (VS-1);
4124 lsq = *(floor(p')); # load in loop
4125 result = realign_load (msq, lsq, realignment_token);
4126
4127 Input:
4128 STMT - (scalar) load stmt to be vectorized. This load accesses
4129 a memory location that may be unaligned.
4130 BSI - place where new code is to be inserted.
4131 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4132 is used.
4133
4134 Output:
4135 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4136 target hook, if defined.
4137 Return value - the result of the loop-header phi node. */
4138
4139 tree
4140 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4141 tree *realignment_token,
4142 enum dr_alignment_support alignment_support_scheme,
4143 tree init_addr,
4144 struct loop **at_loop)
4145 {
4146 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4147 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4148 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4149 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4150 struct loop *loop = NULL;
4151 edge pe = NULL;
4152 tree scalar_dest = gimple_assign_lhs (stmt);
4153 tree vec_dest;
4154 gimple inc;
4155 tree ptr;
4156 tree data_ref;
4157 gimple new_stmt;
4158 basic_block new_bb;
4159 tree msq_init = NULL_TREE;
4160 tree new_temp;
4161 gimple phi_stmt;
4162 tree msq = NULL_TREE;
4163 gimple_seq stmts = NULL;
4164 bool inv_p;
4165 bool compute_in_loop = false;
4166 bool nested_in_vect_loop = false;
4167 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4168 struct loop *loop_for_initial_load = NULL;
4169
4170 if (loop_vinfo)
4171 {
4172 loop = LOOP_VINFO_LOOP (loop_vinfo);
4173 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4174 }
4175
4176 gcc_assert (alignment_support_scheme == dr_explicit_realign
4177 || alignment_support_scheme == dr_explicit_realign_optimized);
4178
4179 /* We need to generate three things:
4180 1. the misalignment computation
4181 2. the extra vector load (for the optimized realignment scheme).
4182 3. the phi node for the two vectors from which the realignment is
4183 done (for the optimized realignment scheme). */
4184
4185 /* 1. Determine where to generate the misalignment computation.
4186
4187 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4188 calculation will be generated by this function, outside the loop (in the
4189 preheader). Otherwise, INIT_ADDR has already been computed for us by the
4190 caller, inside the loop.
4191
4192 Background: If the misalignment remains fixed throughout the iterations of
4193 the loop, then both realignment schemes are applicable, and also the
4194 misalignment computation can be done outside LOOP. This is because we are
4195 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4196 are a multiple of VS (the Vector Size), and therefore the misalignment in
4197 different vectorized LOOP iterations is always the same.
4198 The problem arises only if the memory access is in an inner-loop nested
4199 inside LOOP, which is now being vectorized using outer-loop vectorization.
4200 This is the only case when the misalignment of the memory access may not
4201 remain fixed throughout the iterations of the inner-loop (as explained in
4202 detail in vect_supportable_dr_alignment). In this case, not only is the
4203 optimized realignment scheme not applicable, but also the misalignment
4204 computation (and generation of the realignment token that is passed to
4205 REALIGN_LOAD) have to be done inside the loop.
4206
4207 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4208 or not, which in turn determines if the misalignment is computed inside
4209 the inner-loop, or outside LOOP. */
4210
4211 if (init_addr != NULL_TREE || !loop_vinfo)
4212 {
4213 compute_in_loop = true;
4214 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4215 }
4216
4217
4218 /* 2. Determine where to generate the extra vector load.
4219
4220 For the optimized realignment scheme, instead of generating two vector
4221 loads in each iteration, we generate a single extra vector load in the
4222 preheader of the loop, and in each iteration reuse the result of the
4223 vector load from the previous iteration. In case the memory access is in
4224 an inner-loop nested inside LOOP, which is now being vectorized using
4225 outer-loop vectorization, we need to determine whether this initial vector
4226 load should be generated at the preheader of the inner-loop, or can be
4227 generated at the preheader of LOOP. If the memory access has no evolution
4228 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4229 to be generated inside LOOP (in the preheader of the inner-loop). */
4230
4231 if (nested_in_vect_loop)
4232 {
4233 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4234 bool invariant_in_outerloop =
4235 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4236 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4237 }
4238 else
4239 loop_for_initial_load = loop;
4240 if (at_loop)
4241 *at_loop = loop_for_initial_load;
4242
4243 if (loop_for_initial_load)
4244 pe = loop_preheader_edge (loop_for_initial_load);
4245
4246 /* 3. For the case of the optimized realignment, create the first vector
4247 load at the loop preheader. */
4248
4249 if (alignment_support_scheme == dr_explicit_realign_optimized)
4250 {
4251 /* Create msq_init = *(floor(p1)) in the loop preheader */
4252
4253 gcc_assert (!compute_in_loop);
4254 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4255 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4256 NULL_TREE, &init_addr, NULL, &inc,
4257 true, &inv_p);
4258 new_temp = copy_ssa_name (ptr, NULL);
4259 new_stmt = gimple_build_assign_with_ops
4260 (BIT_AND_EXPR, new_temp, ptr,
4261 build_int_cst (TREE_TYPE (ptr),
4262 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4263 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4264 gcc_assert (!new_bb);
4265 data_ref
4266 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4267 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4268 new_stmt = gimple_build_assign (vec_dest, data_ref);
4269 new_temp = make_ssa_name (vec_dest, new_stmt);
4270 gimple_assign_set_lhs (new_stmt, new_temp);
4271 if (pe)
4272 {
4273 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4274 gcc_assert (!new_bb);
4275 }
4276 else
4277 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4278
4279 msq_init = gimple_assign_lhs (new_stmt);
4280 }
4281
4282 /* 4. Create realignment token using a target builtin, if available.
4283 It is done either inside the containing loop, or before LOOP (as
4284 determined above). */
4285
4286 if (targetm.vectorize.builtin_mask_for_load)
4287 {
4288 tree builtin_decl;
4289
4290 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4291 if (!init_addr)
4292 {
4293 /* Generate the INIT_ADDR computation outside LOOP. */
4294 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4295 NULL_TREE, loop);
4296 if (loop)
4297 {
4298 pe = loop_preheader_edge (loop);
4299 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4300 gcc_assert (!new_bb);
4301 }
4302 else
4303 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4304 }
4305
4306 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4307 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4308 vec_dest =
4309 vect_create_destination_var (scalar_dest,
4310 gimple_call_return_type (new_stmt));
4311 new_temp = make_ssa_name (vec_dest, new_stmt);
4312 gimple_call_set_lhs (new_stmt, new_temp);
4313
4314 if (compute_in_loop)
4315 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4316 else
4317 {
4318 /* Generate the misalignment computation outside LOOP. */
4319 pe = loop_preheader_edge (loop);
4320 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4321 gcc_assert (!new_bb);
4322 }
4323
4324 *realignment_token = gimple_call_lhs (new_stmt);
4325
4326 /* The result of the CALL_EXPR to this builtin is determined from
4327 the value of the parameter, and no global variables are touched,
4328 which makes the builtin a "const" function. Requiring the
4329 builtin to have the "const" attribute makes it unnecessary
4330 to call mark_call_clobbered. */
4331 gcc_assert (TREE_READONLY (builtin_decl));
4332 }
4333
4334 if (alignment_support_scheme == dr_explicit_realign)
4335 return msq;
4336
4337 gcc_assert (!compute_in_loop);
4338 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4339
4340
4341 /* 5. Create msq = phi <msq_init, lsq> in loop */
4342
4343 pe = loop_preheader_edge (containing_loop);
4344 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4345 msq = make_ssa_name (vec_dest, NULL);
4346 phi_stmt = create_phi_node (msq, containing_loop->header);
4347 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4348
4349 return msq;
4350 }
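
/* Illustrative sketch only (not part of GCC): a byte-level model of what
   REALIGN_LOAD <msq, lsq, realignment_token> computes on typical targets -
   the two floor-aligned loads are concatenated and the window starting at
   the original misalignment is extracted.  The exact encoding of the token
   is target-specific; VS and the function name are assumptions made for
   this example only.  */
#if 0
#define VS 16  /* vector size in bytes */

static void
example_realign_load (const unsigned char *msq, const unsigned char *lsq,
                      unsigned int misalign /* 0 .. VS-1 */,
                      unsigned char *result)
{
  unsigned int i;
  for (i = 0; i < VS; i++)
    result[i] = (misalign + i < VS
                 ? msq[misalign + i]
                 : lsq[misalign + i - VS]);
}
#endif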
4351
4352
4353 /* Function vect_grouped_load_supported.
4354
4355 Returns TRUE if even and odd permutations are supported,
4356 and FALSE otherwise. */
4357
4358 bool
4359 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4360 {
4361 enum machine_mode mode = TYPE_MODE (vectype);
4362
4363 /* vect_permute_load_chain requires the group size to be a power of two. */
4364 if (exact_log2 (count) == -1)
4365 {
4366 if (dump_enabled_p ())
4367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4368 "the size of the group of accesses"
4369 " is not a power of 2");
4370 return false;
4371 }
4372
4373 /* Check that the permutation is supported. */
4374 if (VECTOR_MODE_P (mode))
4375 {
4376 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4377 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4378
4379 for (i = 0; i < nelt; i++)
4380 sel[i] = i * 2;
4381 if (can_vec_perm_p (mode, false, sel))
4382 {
4383 for (i = 0; i < nelt; i++)
4384 sel[i] = i * 2 + 1;
4385 if (can_vec_perm_p (mode, false, sel))
4386 return true;
4387 }
4388 }
4389
4390 if (dump_enabled_p ())
4391 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4392 "extract even/odd not supported by target");
4393 return false;
4394 }
4395
4396 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4397 type VECTYPE. */
4398
4399 bool
4400 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4401 {
4402 return vect_lanes_optab_supported_p ("vec_load_lanes",
4403 vec_load_lanes_optab,
4404 vectype, count);
4405 }
4406
4407 /* Function vect_permute_load_chain.
4408
4409 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4410 a power of 2, generate extract_even/odd stmts to reorder the input data
4411 correctly. Return the final references for loads in RESULT_CHAIN.
4412
4413 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4414 The input is 4 vectors each containing 8 elements. We assign a number to each
4415 element, the input sequence is:
4416
4417 1st vec: 0 1 2 3 4 5 6 7
4418 2nd vec: 8 9 10 11 12 13 14 15
4419 3rd vec: 16 17 18 19 20 21 22 23
4420 4th vec: 24 25 26 27 28 29 30 31
4421
4422 The output sequence should be:
4423
4424 1st vec: 0 4 8 12 16 20 24 28
4425 2nd vec: 1 5 9 13 17 21 25 29
4426 3rd vec: 2 6 10 14 18 22 26 30
4427 4th vec: 3 7 11 15 19 23 27 31
4428
4429 i.e., the first output vector should contain the first elements of each
4430 interleaving group, etc.
4431
4432 We use extract_even/odd instructions to create such output. The input of
4433 each extract_even/odd operation is two vectors
4434 1st vec 2nd vec
4435 0 1 2 3 4 5 6 7
4436
4437 and the output is the vector of extracted even/odd elements. The output of
4438 extract_even will be: 0 2 4 6
4439 and of extract_odd: 1 3 5 7
4440
4441
4442 The permutation is done in log LENGTH stages. In each stage extract_even
4443 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4444 their order. In our example,
4445
4446 E1: extract_even (1st vec, 2nd vec)
4447 E2: extract_odd (1st vec, 2nd vec)
4448 E3: extract_even (3rd vec, 4th vec)
4449 E4: extract_odd (3rd vec, 4th vec)
4450
4451 The output for the first stage will be:
4452
4453 E1: 0 2 4 6 8 10 12 14
4454 E2: 1 3 5 7 9 11 13 15
4455 E3: 16 18 20 22 24 26 28 30
4456 E4: 17 19 21 23 25 27 29 31
4457
4458 In order to proceed and create the correct sequence for the next stage (or
4459 for the correct output, if the second stage is the last one, as in our
4460 example), we first put the output of the extract_even operation and then the
4461 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4462 The input for the second stage is:
4463
4464 1st vec (E1): 0 2 4 6 8 10 12 14
4465 2nd vec (E3): 16 18 20 22 24 26 28 30
4466 3rd vec (E2): 1 3 5 7 9 11 13 15
4467 4th vec (E4): 17 19 21 23 25 27 29 31
4468
4469 The output of the second stage:
4470
4471 E1: 0 4 8 12 16 20 24 28
4472 E2: 2 6 10 14 18 22 26 30
4473 E3: 1 5 9 13 17 21 25 29
4474 E4: 3 7 11 15 19 23 27 31
4475
4476 And RESULT_CHAIN after reordering:
4477
4478 1st vec (E1): 0 4 8 12 16 20 24 28
4479 2nd vec (E3): 1 5 9 13 17 21 25 29
4480 3rd vec (E2): 2 6 10 14 18 22 26 30
4481 4th vec (E4): 3 7 11 15 19 23 27 31. */
4482
4483 static void
4484 vect_permute_load_chain (vec<tree> dr_chain,
4485 unsigned int length,
4486 gimple stmt,
4487 gimple_stmt_iterator *gsi,
4488 vec<tree> *result_chain)
4489 {
4490 tree data_ref, first_vect, second_vect;
4491 tree perm_mask_even, perm_mask_odd;
4492 gimple perm_stmt;
4493 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4494 unsigned int i, j, log_length = exact_log2 (length);
4495 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4496 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4497
4498 result_chain->quick_grow (length);
4499 memcpy (result_chain->address (), dr_chain.address (),
4500 length * sizeof (tree));
4501
4502 for (i = 0; i < nelt; ++i)
4503 sel[i] = i * 2;
4504 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4505 gcc_assert (perm_mask_even != NULL);
4506
4507 for (i = 0; i < nelt; ++i)
4508 sel[i] = i * 2 + 1;
4509 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4510 gcc_assert (perm_mask_odd != NULL);
4511
4512 for (i = 0; i < log_length; i++)
4513 {
4514 for (j = 0; j < length; j += 2)
4515 {
4516 first_vect = dr_chain[j];
4517 second_vect = dr_chain[j+1];
4518
4519 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4520 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4521 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4522 first_vect, second_vect,
4523 perm_mask_even);
4524 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4525 (*result_chain)[j/2] = data_ref;
4526
4527 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4528 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4529 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4530 first_vect, second_vect,
4531 perm_mask_odd);
4532 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4533 (*result_chain)[j/2+length/2] = data_ref;
4534 }
4535 memcpy (dr_chain.address (), result_chain->address (),
4536 length * sizeof (tree));
4537 }
4538 }
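
/* Illustrative sketch only (not part of GCC): a scalar model of the single
   extract-even / extract-odd step each VEC_PERM_EXPR above performs.  NELT
   and the element type are arbitrary choices for the example.  */
#if 0
#define NELT 8

static void
example_extract_even_odd (const short *first_vect, const short *second_vect,
                          short *even, short *odd)
{
  unsigned int i;
  for (i = 0; i < NELT; i++)
    {
      /* perm_mask_even = {0, 2, 4, ...}, perm_mask_odd = {1, 3, 5, ...};
         indices >= NELT select from the second input vector.  */
      unsigned int e = 2 * i, o = 2 * i + 1;
      even[i] = e < NELT ? first_vect[e] : second_vect[e - NELT];
      odd[i] = o < NELT ? first_vect[o] : second_vect[o - NELT];
    }
}
#endif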
4539
4540
4541 /* Function vect_transform_grouped_load.
4542
4543 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4544 to perform their permutation and ascribe the resulting vectorized statements to
4545 the scalar statements.
4546 */
4547
4548 void
4549 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4550 gimple_stmt_iterator *gsi)
4551 {
4552 vec<tree> result_chain = vNULL;
4553
4554 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4555 RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted
4556 vectors that are ready for vector computation. */
4557 result_chain.create (size);
4558 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4559 vect_record_grouped_load_vectors (stmt, result_chain);
4560 result_chain.release ();
4561 }
4562
4563 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4564 generated as part of the vectorization of STMT. Assign the statement
4565 for each vector to the associated scalar statement. */
4566
4567 void
4568 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
4569 {
4570 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4571 gimple next_stmt, new_stmt;
4572 unsigned int i, gap_count;
4573 tree tmp_data_ref;
4574
4575 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4576 Since we scan the chain starting from its first node, their order
4577 corresponds to the order of data-refs in RESULT_CHAIN. */
4578 next_stmt = first_stmt;
4579 gap_count = 1;
4580 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
4581 {
4582 if (!next_stmt)
4583 break;
4584
4585 /* Skip the gaps. Loads created for the gaps will be removed by the dead
4586 code elimination pass later. No need to check for the first stmt in
4587 the group, since it always exists.
4588 GROUP_GAP is the number of steps in elements from the previous
4589 access (if there is no gap GROUP_GAP is 1). We skip loads that
4590 correspond to the gaps. */
4591 if (next_stmt != first_stmt
4592 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
4593 {
4594 gap_count++;
4595 continue;
4596 }
4597
4598 while (next_stmt)
4599 {
4600 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4601 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4602 copies, and we put the new vector statement in the first available
4603 RELATED_STMT. */
4604 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4605 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4606 else
4607 {
4608 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4609 {
4610 gimple prev_stmt =
4611 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
4612 gimple rel_stmt =
4613 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
4614 while (rel_stmt)
4615 {
4616 prev_stmt = rel_stmt;
4617 rel_stmt =
4618 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
4619 }
4620
4621 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
4622 new_stmt;
4623 }
4624 }
4625
4626 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
4627 gap_count = 1;
4628 /* If NEXT_STMT accesses the same DR as the previous statement,
4629 put the same TMP_DATA_REF as its vectorized statement; otherwise
4630 get the next data-ref from RESULT_CHAIN. */
4631 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4632 break;
4633 }
4634 }
4635 }
4636
4637 /* Function vect_can_force_dr_alignment_p.
4638
4639 Returns whether the alignment of a DECL can be forced to be aligned
4640 on ALIGNMENT bit boundary. */
4641
4642 bool
4643 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
4644 {
4645 if (TREE_CODE (decl) != VAR_DECL)
4646 return false;
4647
4648 /* We cannot change alignment of common or external symbols as another
4649 translation unit may contain a definition with lower alignment.
4650 The rules of common symbol linking mean that the definition
4651 will override the common symbol. The same is true for constant
4652 pool entries which may be shared and are not properly merged
4653 by LTO. */
4654 if (DECL_EXTERNAL (decl)
4655 || DECL_COMMON (decl)
4656 || DECL_IN_CONSTANT_POOL (decl))
4657 return false;
4658
4659 if (TREE_ASM_WRITTEN (decl))
4660 return false;
4661
4662 /* Do not override the alignment as specified by the ABI when the used
4663 attribute is set. */
4664 if (DECL_PRESERVE_P (decl))
4665 return false;
4666
4667 /* Do not override explicit alignment set by the user when an explicit
4668 section name is also used. This is a common idiom used by many
4669 software projects. */
4670 if (DECL_SECTION_NAME (decl) != NULL_TREE
4671 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
4672 return false;
4673
4674 if (TREE_STATIC (decl))
4675 return (alignment <= MAX_OFILE_ALIGNMENT);
4676 else
4677 return (alignment <= MAX_STACK_ALIGNMENT);
4678 }
4679
4680
4681 /* Return whether the data reference DR is supported with respect to its
4682 alignment.
4683 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
4684 it is aligned, i.e., check if it is possible to vectorize it with different
4685 alignment. */
4686
4687 enum dr_alignment_support
4688 vect_supportable_dr_alignment (struct data_reference *dr,
4689 bool check_aligned_accesses)
4690 {
4691 gimple stmt = DR_STMT (dr);
4692 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4693 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4694 enum machine_mode mode = TYPE_MODE (vectype);
4695 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4696 struct loop *vect_loop = NULL;
4697 bool nested_in_vect_loop = false;
4698
4699 if (aligned_access_p (dr) && !check_aligned_accesses)
4700 return dr_aligned;
4701
4702 if (loop_vinfo)
4703 {
4704 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
4705 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
4706 }
4707
4708 /* Possibly unaligned access. */
4709
4710 /* We can choose between using the implicit realignment scheme (generating
4711 a misaligned_move stmt) and the explicit realignment scheme (generating
4712 aligned loads with a REALIGN_LOAD). There are two variants to the
4713 explicit realignment scheme: optimized, and unoptimized.
4714 We can optimize the realignment only if the step between consecutive
4715 vector loads is equal to the vector size. Since the vector memory
4716 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
4717 is guaranteed that the misalignment amount remains the same throughout the
4718 execution of the vectorized loop. Therefore, we can create the
4719 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
4720 at the loop preheader.
4721
4722 However, in the case of outer-loop vectorization, when vectorizing a
4723 memory access in the inner-loop nested within the LOOP that is now being
4724 vectorized, while it is guaranteed that the misalignment of the
4725 vectorized memory access will remain the same in different outer-loop
4726 iterations, it is *not* guaranteed that it will remain the same throughout
4727 the execution of the inner-loop. This is because the inner-loop advances
4728 with the original scalar step (and not in steps of VS). If the inner-loop
4729 step happens to be a multiple of VS, then the misalignment remains fixed
4730 and we can use the optimized realignment scheme. For example:
4731
4732 for (i=0; i<N; i++)
4733 for (j=0; j<M; j++)
4734 s += a[i+j];
4735
4736 When vectorizing the i-loop in the above example, the step between
4737 consecutive vector loads is 1, and so the misalignment does not remain
4738 fixed across the execution of the inner-loop, and the realignment cannot
4739 be optimized (as illustrated in the following pseudo vectorized loop):
4740
4741 for (i=0; i<N; i+=4)
4742 for (j=0; j<M; j++){
4743 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
4744 // when j is {0,1,2,3,4,5,6,7,...} respectively.
4745 // (assuming that we start from an aligned address).
4746 }
4747
4748 We therefore have to use the unoptimized realignment scheme:
4749
4750 for (i=0; i<N; i+=4)
4751 for (j=k; j<M; j+=4)
4752 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
4753 // that the misalignment of the initial address is
4754 // 0).
4755
4756 The loop can then be vectorized as follows:
4757
4758 for (k=0; k<4; k++){
4759 rt = get_realignment_token (&vp[k]);
4760 for (i=0; i<N; i+=4){
4761 v1 = vp[i+k];
4762 for (j=k; j<M; j+=4){
4763 v2 = vp[i+j+VS-1];
4764 va = REALIGN_LOAD <v1,v2,rt>;
4765 vs += va;
4766 v1 = v2;
4767 }
4768 }
4769 } */
4770
4771 if (DR_IS_READ (dr))
4772 {
4773 bool is_packed = false;
4774 tree type = (TREE_TYPE (DR_REF (dr)));
4775
4776 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
4777 && (!targetm.vectorize.builtin_mask_for_load
4778 || targetm.vectorize.builtin_mask_for_load ()))
4779 {
4780 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4781 if ((nested_in_vect_loop
4782 && (TREE_INT_CST_LOW (DR_STEP (dr))
4783 != GET_MODE_SIZE (TYPE_MODE (vectype))))
4784 || !loop_vinfo)
4785 return dr_explicit_realign;
4786 else
4787 return dr_explicit_realign_optimized;
4788 }
4789 if (!known_alignment_for_access_p (dr))
4790 is_packed = not_size_aligned (DR_REF (dr));
4791
4792 if (targetm.vectorize.
4793 support_vector_misalignment (mode, type,
4794 DR_MISALIGNMENT (dr), is_packed))
4795 /* Can't software pipeline the loads, but can at least do them. */
4796 return dr_unaligned_supported;
4797 }
4798 else
4799 {
4800 bool is_packed = false;
4801 tree type = (TREE_TYPE (DR_REF (dr)));
4802
4803 if (!known_alignment_for_access_p (dr))
4804 is_packed = not_size_aligned (DR_REF (dr));
4805
4806 if (targetm.vectorize.
4807 support_vector_misalignment (mode, type,
4808 DR_MISALIGNMENT (dr), is_packed))
4809 return dr_unaligned_supported;
4810 }
4811
4812 /* Unsupported. */
4813 return dr_unaligned_unsupported;
4814 }
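
/* Illustrative sketch only (not part of GCC): how a caller might branch on
   the value returned above.  The wrapper name is hypothetical; the enum
   values are the ones used in this function.  */
#if 0
static bool
example_dr_alignment_ok (struct data_reference *dr)
{
  switch (vect_supportable_dr_alignment (dr, false))
    {
    case dr_aligned:
    case dr_unaligned_supported:
    case dr_explicit_realign:
    case dr_explicit_realign_optimized:
      return true;   /* some (re)alignment strategy is available */
    case dr_unaligned_unsupported:
    default:
      return false;  /* the access cannot be vectorized with this alignment */
    }
}
#endif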