gcc/tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-flow.h"
34 #include "dumpfile.h"
35 #include "cfgloop.h"
36 #include "tree-chrec.h"
37 #include "tree-scalar-evolution.h"
38 #include "tree-vectorizer.h"
39 #include "diagnostic-core.h"
40
41 /* Need to include rtl.h, expr.h, etc. for optabs. */
42 #include "expr.h"
43 #include "optabs.h"
44
45 /* Return true if load- or store-lanes optab OPTAB is implemented for
46 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
47
48 static bool
49 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
50 tree vectype, unsigned HOST_WIDE_INT count)
51 {
52 enum machine_mode mode, array_mode;
53 bool limit_p;
54
55 mode = TYPE_MODE (vectype);
56 limit_p = !targetm.array_mode_supported_p (mode, count);
57 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
58 MODE_INT, limit_p);
59
60 if (array_mode == BLKmode)
61 {
62 if (dump_enabled_p ())
63 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
64 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]",
65 GET_MODE_NAME (mode), count);
66 return false;
67 }
68
69 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
70 {
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
73 "cannot use %s<%s><%s>", name,
74 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
75 return false;
76 }
77
78 if (dump_enabled_p ())
79 dump_printf_loc (MSG_NOTE, vect_location,
80 "can use %s<%s><%s>", name, GET_MODE_NAME (array_mode),
81 GET_MODE_NAME (mode));
82
83 return true;
84 }
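
/* Worked example (illustrative note, not part of the original source): on a
target where VECTYPE has mode V4SImode (128 bits) and COUNT is 2, the code
above asks mode_for_size for a 256-bit integer mode (OImode, if the target
provides one) to serve as the array mode, and then checks whether the target
implements the conversion, e.g. convert_optab_handler (vec_load_lanes_optab,
OImode, V4SImode). The concrete modes are target-dependent. */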
85
86
87 /* Return the smallest scalar part of STMT.
88 This is used to determine the vectype of the stmt. We generally set the
89 vectype according to the type of the result (lhs). For stmts whose
90 result-type is different than the type of the arguments (e.g., demotion,
91 promotion), vectype will be reset appropriately (later). Note that we have
92 to visit the smallest datatype in this function, because that determines the
93 VF. If the smallest datatype in the loop is present only as the rhs of a
94 promotion operation, we'd miss it.
95 Such a case, where a variable of this datatype does not appear in the lhs
96 anywhere in the loop, can only occur if it's an invariant: e.g.:
97 'int_x = (int) short_inv', which we'd expect to have been optimized away by
98 invariant motion. However, we cannot rely on invariant motion to always
99 take invariants out of the loop, and so in the case of promotion we also
100 have to check the rhs.
101 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
102 types. */
103
104 tree
105 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
106 HOST_WIDE_INT *rhs_size_unit)
107 {
108 tree scalar_type = gimple_expr_type (stmt);
109 HOST_WIDE_INT lhs, rhs;
110
111 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
112
113 if (is_gimple_assign (stmt)
114 && (gimple_assign_cast_p (stmt)
115 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
116 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
117 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
118 {
119 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
120
121 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
122 if (rhs < lhs)
123 scalar_type = rhs_type;
124 }
125
126 *lhs_size_unit = lhs;
127 *rhs_size_unit = rhs;
128 return scalar_type;
129 }
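
/* Worked example (illustrative): for the promotion statement
int_x = (int) short_y, the lhs size is 4 bytes and the rhs size is 2 bytes,
so *LHS_SIZE_UNIT is 4, *RHS_SIZE_UNIT is 2 and the function returns the
rhs type 'short int'. */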
130
131
132 /* Find the place of the data-ref in STMT in the interleaving chain that starts
133 from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */
134
135 int
136 vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
137 {
138 gimple next_stmt = first_stmt;
139 int result = 0;
140
141 if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
142 return -1;
143
144 while (next_stmt && next_stmt != stmt)
145 {
146 result++;
147 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
148 }
149
150 if (next_stmt)
151 return result;
152 else
153 return -1;
154 }
155
156
157 /* Check if the data references pointed to by DR_I and DR_J are the same or
158 belong to the same interleaving group. Return FALSE if the drs are
159 different, otherwise return TRUE. */
160
161 static bool
162 vect_same_range_drs (data_reference_p dr_i, data_reference_p dr_j)
163 {
164 gimple stmt_i = DR_STMT (dr_i);
165 gimple stmt_j = DR_STMT (dr_j);
166
167 if (operand_equal_p (DR_REF (dr_i), DR_REF (dr_j), 0)
168 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
169 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j))
170 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
171 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)))))
172 return true;
173 else
174 return false;
175 }
176
177 /* If address ranges represented by DDR_I and DDR_J are equal,
178 return TRUE, otherwise return FALSE. */
179
180 static bool
181 vect_vfa_range_equal (ddr_p ddr_i, ddr_p ddr_j)
182 {
183 if ((vect_same_range_drs (DDR_A (ddr_i), DDR_A (ddr_j))
184 && vect_same_range_drs (DDR_B (ddr_i), DDR_B (ddr_j)))
185 || (vect_same_range_drs (DDR_A (ddr_i), DDR_B (ddr_j))
186 && vect_same_range_drs (DDR_B (ddr_i), DDR_A (ddr_j))))
187 return true;
188 else
189 return false;
190 }
191
192 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
193 tested at run-time. Return TRUE if DDR was successfully inserted.
194 Return false if versioning is not supported. */
195
196 static bool
197 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
198 {
199 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
200
201 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
202 return false;
203
204 if (dump_enabled_p ())
205 {
206 dump_printf_loc (MSG_NOTE, vect_location,
207 "mark for run-time aliasing test between ");
208 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
209 dump_printf (MSG_NOTE, " and ");
210 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
211 }
212
213 if (optimize_loop_nest_for_size_p (loop))
214 {
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
217 "versioning not supported when optimizing for size.");
218 return false;
219 }
220
221 /* FORNOW: We don't support versioning with outer-loop vectorization. */
222 if (loop->inner)
223 {
224 if (dump_enabled_p ())
225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
226 "versioning not yet supported for outer-loops.");
227 return false;
228 }
229
230 /* FORNOW: We don't support creating runtime alias tests for non-constant
231 step. */
232 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
233 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
234 {
235 if (dump_enabled_p ())
236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
237 "versioning not yet supported for non-constant "
238 "step");
239 return false;
240 }
241
242 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
243 return true;
244 }
245
246
247 /* Function vect_analyze_data_ref_dependence.
248
249 Return TRUE if there (might) exist a dependence between a memory-reference
250 DRA and a memory-reference DRB. If the dependence can instead be checked
251 at run-time by versioning for alias, return FALSE. Adjust *MAX_VF according
252 to the data dependence. */
253
254 static bool
255 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
256 loop_vec_info loop_vinfo, int *max_vf)
257 {
258 unsigned int i;
259 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
260 struct data_reference *dra = DDR_A (ddr);
261 struct data_reference *drb = DDR_B (ddr);
262 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
263 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
264 lambda_vector dist_v;
265 unsigned int loop_depth;
266
267 /* In loop analysis all data references should be vectorizable. */
268 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
269 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
270 gcc_unreachable ();
271
272 /* Independent data accesses. */
273 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
274 return false;
275
276 if (dra == drb
277 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
278 return false;
279
280 /* Unknown data dependence. */
281 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
282 {
283 if (STMT_VINFO_GATHER_P (stmtinfo_a)
284 || STMT_VINFO_GATHER_P (stmtinfo_b))
285 {
286 if (dump_enabled_p ())
287 {
288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
289 "versioning for alias not supported for: "
290 "can't determine dependence between ");
291 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
292 DR_REF (dra));
293 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
294 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
295 DR_REF (drb));
296 }
297 return false;
298 }
299
300 if (dump_enabled_p ())
301 {
302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
303 "versioning for alias required: "
304 "can't determine dependence between ");
305 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
306 DR_REF (dra));
307 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
308 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
309 DR_REF (drb));
310 }
311
312 /* Add to list of ddrs that need to be tested at run-time. */
313 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
314 }
315
316 /* Known data dependence. */
317 if (DDR_NUM_DIST_VECTS (ddr) == 0)
318 {
319 if (STMT_VINFO_GATHER_P (stmtinfo_a)
320 || STMT_VINFO_GATHER_P (stmtinfo_b))
321 {
322 if (dump_enabled_p ())
323 {
324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
325 "versioning for alias not supported for: "
326 "bad dist vector for ");
327 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
328 DR_REF (dra));
329 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
330 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
331 DR_REF (drb));
332 }
333 return false;
334 }
335
336 if (dump_enabled_p ())
337 {
338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
339 "versioning for alias required: "
340 "bad dist vector for ");
341 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
342 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
343 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
344 }
345 /* Add to list of ddrs that need to be tested at run-time. */
346 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
347 }
348
349 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
350 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
351 {
352 int dist = dist_v[loop_depth];
353
354 if (dump_enabled_p ())
355 dump_printf_loc (MSG_NOTE, vect_location,
356 "dependence distance = %d.", dist);
357
358 if (dist == 0)
359 {
360 if (dump_enabled_p ())
361 {
362 dump_printf_loc (MSG_NOTE, vect_location,
363 "dependence distance == 0 between ");
364 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
365 dump_printf (MSG_NOTE, " and ");
366 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
367 }
368
369 /* For interleaving, mark that there is a read-write dependency if
370 necessary. We have already checked that one of the data-refs is a store. */
371 if (DR_IS_READ (dra))
372 GROUP_READ_WRITE_DEPENDENCE (stmtinfo_a) = true;
373 else
374 {
375 if (DR_IS_READ (drb))
376 GROUP_READ_WRITE_DEPENDENCE (stmtinfo_b) = true;
377 }
378
379 continue;
380 }
381
382 if (dist > 0 && DDR_REVERSED_P (ddr))
383 {
384 /* If DDR_REVERSED_P the order of the data-refs in DDR was
385 reversed (to make distance vector positive), and the actual
386 distance is negative. */
387 if (dump_enabled_p ())
388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
389 "dependence distance negative.");
390 continue;
391 }
392
393 if (abs (dist) >= 2
394 && abs (dist) < *max_vf)
395 {
396 /* The dependence distance requires reduction of the maximal
397 vectorization factor. */
398 *max_vf = abs (dist);
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location,
401 "adjusting maximal vectorization factor to %i",
402 *max_vf);
403 }
404
405 if (abs (dist) >= *max_vf)
406 {
407 /* Dependence distance does not create dependence, as far as
408 vectorization is concerned, in this case. */
409 if (dump_enabled_p ())
410 dump_printf_loc (MSG_NOTE, vect_location,
411 "dependence distance >= VF.");
412 continue;
413 }
414
415 if (dump_enabled_p ())
416 {
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "not vectorized, possible dependence "
419 "between data-refs ");
420 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
421 dump_printf (MSG_NOTE, " and ");
422 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
423 }
424
425 return true;
426 }
427
428 return false;
429 }
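
/* Worked example (illustrative): a dependence distance of 2 with *MAX_VF
initially 4 makes the loop above reduce *MAX_VF to 2; a distance of 0 only
records a read-write dependence for interleaving; and a distance whose
absolute value is already >= *MAX_VF does not constrain vectorization. */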
430
431 /* Function vect_analyze_data_ref_dependences.
432
433 Examine all the data references in the loop, and make sure there do not
434 exist any data dependences between them. Set *MAX_VF according to
435 the maximum vectorization factor the data dependences allow. */
436
437 bool
438 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
439 {
440 unsigned int i;
441 struct data_dependence_relation *ddr;
442
443 if (dump_enabled_p ())
444 dump_printf_loc (MSG_NOTE, vect_location,
445 "=== vect_analyze_data_ref_dependences ===");
446
447 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
448 &LOOP_VINFO_DDRS (loop_vinfo),
449 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
450 return false;
451
452 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
453 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
454 return false;
455
456 return true;
457 }
458
459
460 /* Function vect_slp_analyze_data_ref_dependence.
461
462 Return TRUE if there (might) exist a dependence between a memory-reference
463 DRA and a memory-reference DRB that prevents basic-block vectorization,
464 and FALSE otherwise. Unlike the loop analysis above, there is no run-time
465 alias versioning here and no *MAX_VF to adjust. */
466
467 static bool
468 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
469 {
470 struct data_reference *dra = DDR_A (ddr);
471 struct data_reference *drb = DDR_B (ddr);
472
473 /* We need to check dependences of statements marked as unvectorizable
474 as well; they can still prohibit vectorization. */
475
476 /* Independent data accesses. */
477 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
478 return false;
479
480 if (dra == drb)
481 return false;
482
483 /* Read-read is OK. */
484 if (DR_IS_READ (dra) && DR_IS_READ (drb))
485 return false;
486
487 /* If dra and drb are part of the same interleaving chain consider
488 them independent. */
489 if (STMT_VINFO_GROUPED_ACCESS (vinfo_for_stmt (DR_STMT (dra)))
490 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (dra)))
491 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (DR_STMT (drb)))))
492 return false;
493
494 /* Unknown data dependence. */
495 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
496 {
497 gimple earlier_stmt;
498
499 if (dump_enabled_p ())
500 {
501 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
502 "can't determine dependence between ");
503 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
504 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
505 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
506 }
507
508 /* We do not vectorize basic blocks with write-write dependencies. */
509 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
510 return true;
511
512 /* Check that it's not a load-after-store dependence. */
513 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
514 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
515 return true;
516
517 return false;
518 }
519
520 if (dump_enabled_p ())
521 {
522 dump_printf_loc (MSG_NOTE, vect_location,
523 "determined dependence between ");
524 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
525 dump_printf (MSG_NOTE, " and ");
526 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
527 }
528
529 /* Do not vectorize basic blocks with write-write dependences. */
530 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
531 return true;
532
533 /* Check dependence between DRA and DRB for basic block vectorization.
534 If the accesses share the same bases and offsets, we can compare their initial
535 constant offsets to decide whether they differ or not. In case of a read-
536 write dependence we check that the load is before the store to ensure that
537 vectorization will not change the order of the accesses. */
538
539 HOST_WIDE_INT type_size_a, type_size_b, init_a, init_b;
540 gimple earlier_stmt;
541
542 /* Check that the data-refs have the same bases and offsets. If not, we can't
543 determine if they are dependent. */
544 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
545 || !dr_equal_offsets_p (dra, drb))
546 return true;
547
548 /* Check the types. */
549 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
550 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
551
552 if (type_size_a != type_size_b
553 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
554 TREE_TYPE (DR_REF (drb))))
555 return true;
556
557 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
558 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
559
560 /* Two different locations - no dependence. */
561 if (init_a != init_b)
562 return false;
563
564 /* We have a read-write dependence. Check that the load is before the store.
565 When we vectorize basic blocks, vector load can be only before
566 corresponding scalar load, and vector store can be only after its
567 corresponding scalar store. So the order of the accesses is preserved in
568 case the load is before the store. */
569 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
570 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
571 return false;
572
573 return true;
574 }
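
/* Worked example (illustrative): for two 4-byte accesses with the same base
and offset, DR_INITs of 0 and 4 describe distinct locations and no dependence
is recorded; with equal DR_INITs the locations overlap and the code above only
accepts the case where the earlier of the two statements is the load. */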
575
576
577 /* Function vect_slp_analyze_data_ref_dependences.
578
579 Examine all the data references in the basic-block, and make sure there
580 do not exist any data dependences between them that would prevent
581 vectorization. */
582
583 bool
584 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
585 {
586 struct data_dependence_relation *ddr;
587 unsigned int i;
588
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "=== vect_slp_analyze_data_ref_dependences ===");
592
593 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
594 &BB_VINFO_DDRS (bb_vinfo),
595 vNULL, true))
596 return false;
597
598 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
599 if (vect_slp_analyze_data_ref_dependence (ddr))
600 return false;
601
602 return true;
603 }
604
605
606 /* Function vect_compute_data_ref_alignment
607
608 Compute the misalignment of the data reference DR.
609
610 Output:
611 1. If during the misalignment computation it is found that the data reference
612 cannot be vectorized then false is returned.
613 2. DR_MISALIGNMENT (DR) is defined.
614
615 FOR NOW: No analysis is actually performed. Misalignment is calculated
616 only for trivial cases. TODO. */
617
618 static bool
619 vect_compute_data_ref_alignment (struct data_reference *dr)
620 {
621 gimple stmt = DR_STMT (dr);
622 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
623 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
624 struct loop *loop = NULL;
625 tree ref = DR_REF (dr);
626 tree vectype;
627 tree base, base_addr;
628 bool base_aligned;
629 tree misalign;
630 tree aligned_to, alignment;
631
632 if (dump_enabled_p ())
633 dump_printf_loc (MSG_NOTE, vect_location,
634 "vect_compute_data_ref_alignment:");
635
636 if (loop_vinfo)
637 loop = LOOP_VINFO_LOOP (loop_vinfo);
638
639 /* Initialize misalignment to unknown. */
640 SET_DR_MISALIGNMENT (dr, -1);
641
642 /* Strided loads perform only component accesses, misalignment information
643 is irrelevant for them. */
644 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
645 return true;
646
647 misalign = DR_INIT (dr);
648 aligned_to = DR_ALIGNED_TO (dr);
649 base_addr = DR_BASE_ADDRESS (dr);
650 vectype = STMT_VINFO_VECTYPE (stmt_info);
651
652 /* In case the dataref is in an inner-loop of the loop that is being
653 vectorized (LOOP), we use the base and misalignment information
654 relative to the outer-loop (LOOP). This is ok only if the misalignment
655 stays the same throughout the execution of the inner-loop, which is why
656 we have to check that the stride of the dataref in the inner-loop is a
657 multiple of the vector size. */
658 if (loop && nested_in_vect_loop_p (loop, stmt))
659 {
660 tree step = DR_STEP (dr);
661 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
662
663 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
664 {
665 if (dump_enabled_p ())
666 dump_printf_loc (MSG_NOTE, vect_location,
667 "inner step divides the vector-size.");
668 misalign = STMT_VINFO_DR_INIT (stmt_info);
669 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
670 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
671 }
672 else
673 {
674 if (dump_enabled_p ())
675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
676 "inner step doesn't divide the vector-size.");
677 misalign = NULL_TREE;
678 }
679 }
680
681 /* Similarly, if we're doing basic-block vectorization, we can only use
682 base and misalignment information relative to an innermost loop if the
683 misalignment stays the same throughout the execution of the loop.
684 As above, this is the case if the stride of the dataref is a multiple of
685 the vector size. */
686 if (!loop)
687 {
688 tree step = DR_STEP (dr);
689 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
690
691 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
692 {
693 if (dump_enabled_p ())
694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
695 "SLP: step doesn't divide the vector-size.");
696 misalign = NULL_TREE;
697 }
698 }
699
700 base = build_fold_indirect_ref (base_addr);
701 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
702
703 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
704 || !misalign)
705 {
706 if (dump_enabled_p ())
707 {
708 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
709 "Unknown alignment for access: ");
710 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
711 }
712 return true;
713 }
714
715 if ((DECL_P (base)
716 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
717 alignment) >= 0)
718 || (TREE_CODE (base_addr) == SSA_NAME
719 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
720 TREE_TYPE (base_addr)))),
721 alignment) >= 0)
722 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
723 base_aligned = true;
724 else
725 base_aligned = false;
726
727 if (!base_aligned)
728 {
729 /* Do not change the alignment of global variables here if
730 flag_section_anchors is enabled as we already generated
731 RTL for other functions. Most global variables should
732 have been aligned during the IPA increase_alignment pass. */
733 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
734 || (TREE_STATIC (base) && flag_section_anchors))
735 {
736 if (dump_enabled_p ())
737 {
738 dump_printf_loc (MSG_NOTE, vect_location,
739 "can't force alignment of ref: ");
740 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
741 }
742 return true;
743 }
744
745 /* Force the alignment of the decl.
746 NOTE: This is the only change to the code we make during
747 the analysis phase, before deciding to vectorize the loop. */
748 if (dump_enabled_p ())
749 {
750 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
751 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
752 }
753
754 DECL_ALIGN (base) = TYPE_ALIGN (vectype);
755 DECL_USER_ALIGN (base) = 1;
756 }
757
758 /* At this point we assume that the base is aligned. */
759 gcc_assert (base_aligned
760 || (TREE_CODE (base) == VAR_DECL
761 && DECL_ALIGN (base) >= TYPE_ALIGN (vectype)));
762
763 /* If this is a backward running DR then first access in the larger
764 vectype actually is N-1 elements before the address in the DR.
765 Adjust misalign accordingly. */
766 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
767 {
768 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
769 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
770 otherwise we wouldn't be here. */
771 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
772 /* PLUS because DR_STEP was negative. */
773 misalign = size_binop (PLUS_EXPR, misalign, offset);
774 }
775
776 /* Modulo alignment. */
777 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
778
779 if (!host_integerp (misalign, 1))
780 {
781 /* Negative or overflowed misalignment value. */
782 if (dump_enabled_p ())
783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
784 "unexpected misalign value");
785 return false;
786 }
787
788 SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));
789
790 if (dump_enabled_p ())
791 {
792 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
793 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
794 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
795 }
796
797 return true;
798 }
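
/* Worked example (illustrative, assuming a 128-bit V4SI vectype so that
'alignment' above is 16 bytes): with a sufficiently aligned base and
DR_INIT = 20, the FLOOR_MOD gives DR_MISALIGNMENT = 4. For a backward-running
DR with step -4 the offset of the remaining 3 vector elements is added first,
giving (20 - 12) mod 16 = 8. */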
799
800
801 /* Function vect_compute_data_refs_alignment
802
803 Compute the misalignment of data references in the loop.
804 Return FALSE if a data reference is found that cannot be vectorized. */
805
806 static bool
807 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
808 bb_vec_info bb_vinfo)
809 {
810 vec<data_reference_p> datarefs;
811 struct data_reference *dr;
812 unsigned int i;
813
814 if (loop_vinfo)
815 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
816 else
817 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
818
819 FOR_EACH_VEC_ELT (datarefs, i, dr)
820 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
821 && !vect_compute_data_ref_alignment (dr))
822 {
823 if (bb_vinfo)
824 {
825 /* Mark unsupported statement as unvectorizable. */
826 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
827 continue;
828 }
829 else
830 return false;
831 }
832
833 return true;
834 }
835
836
837 /* Function vect_update_misalignment_for_peel
838
839 DR - the data reference whose misalignment is to be adjusted.
840 DR_PEEL - the data reference whose misalignment is being made
841 zero in the vector loop by the peel.
842 NPEEL - the number of iterations in the peel loop if the misalignment
843 of DR_PEEL is known at compile time. */
844
845 static void
846 vect_update_misalignment_for_peel (struct data_reference *dr,
847 struct data_reference *dr_peel, int npeel)
848 {
849 unsigned int i;
850 vec<dr_p> same_align_drs;
851 struct data_reference *current_dr;
852 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
853 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
854 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
855 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
856
857 /* For interleaved data accesses the step in the loop must be multiplied by
858 the size of the interleaving group. */
859 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
860 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
861 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
862 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
863
864 /* It can be assumed that the data refs with the same alignment as dr_peel
865 are aligned in the vector loop. */
866 same_align_drs
867 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
868 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
869 {
870 if (current_dr != dr)
871 continue;
872 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
873 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
874 SET_DR_MISALIGNMENT (dr, 0);
875 return;
876 }
877
878 if (known_alignment_for_access_p (dr)
879 && known_alignment_for_access_p (dr_peel))
880 {
881 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
882 int misal = DR_MISALIGNMENT (dr);
883 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
884 misal += negative ? -npeel * dr_size : npeel * dr_size;
885 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
886 SET_DR_MISALIGNMENT (dr, misal);
887 return;
888 }
889
890 if (dump_enabled_p ())
891 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.");
892 SET_DR_MISALIGNMENT (dr, -1);
893 }
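
/* Worked example (illustrative): for a non-grouped 4-byte access with
DR_MISALIGNMENT 4, a vectype aligned to 16 bytes and NPEEL = 3, the
known-alignment case above computes (4 + 3*4) & 15 = 0, so DR becomes aligned
by the peel; with NPEEL = 2 it would become (4 + 2*4) & 15 = 12. */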
894
895
896 /* Function vect_verify_datarefs_alignment
897
898 Return TRUE if all data references in the loop can be
899 handled with respect to alignment. */
900
901 bool
902 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
903 {
904 vec<data_reference_p> datarefs;
905 struct data_reference *dr;
906 enum dr_alignment_support supportable_dr_alignment;
907 unsigned int i;
908
909 if (loop_vinfo)
910 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
911 else
912 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
913
914 FOR_EACH_VEC_ELT (datarefs, i, dr)
915 {
916 gimple stmt = DR_STMT (dr);
917 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
918
919 if (!STMT_VINFO_RELEVANT_P (stmt_info))
920 continue;
921
922 /* For interleaving, only the alignment of the first access matters.
923 Skip statements marked as not vectorizable. */
924 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
925 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
926 || !STMT_VINFO_VECTORIZABLE (stmt_info))
927 continue;
928
929 /* Strided loads perform only component accesses, alignment is
930 irrelevant for them. */
931 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
932 continue;
933
934 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
935 if (!supportable_dr_alignment)
936 {
937 if (dump_enabled_p ())
938 {
939 if (DR_IS_READ (dr))
940 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
941 "not vectorized: unsupported unaligned load.");
942 else
943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
944 "not vectorized: unsupported unaligned "
945 "store.");
946
947 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
948 DR_REF (dr));
949 }
950 return false;
951 }
952 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
953 dump_printf_loc (MSG_NOTE, vect_location,
954 "Vectorizing an unaligned access.");
955 }
956 return true;
957 }
958
959 /* Given a memory reference EXP return whether its alignment is less
960 than its size. */
961
962 static bool
963 not_size_aligned (tree exp)
964 {
965 if (!host_integerp (TYPE_SIZE (TREE_TYPE (exp)), 1))
966 return true;
967
968 return (TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (exp)))
969 > get_object_alignment (exp));
970 }
971
972 /* Function vector_alignment_reachable_p
973
974 Return true if vector alignment for DR is reachable by peeling
975 a few loop iterations. Return false otherwise. */
976
977 static bool
978 vector_alignment_reachable_p (struct data_reference *dr)
979 {
980 gimple stmt = DR_STMT (dr);
981 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
982 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
983
984 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
985 {
986 /* For interleaved accesses we peel only if the number of iterations in
987 the prolog loop ({VF - misalignment}) is a multiple of the number of
988 interleaved accesses. */
989 int elem_size, mis_in_elements;
990 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
991
992 /* FORNOW: handle only known alignment. */
993 if (!known_alignment_for_access_p (dr))
994 return false;
995
996 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
997 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
998
999 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1000 return false;
1001 }
1002
1003 /* If the misalignment is known at compile time then allow peeling
1004 only if natural alignment is reachable through peeling. */
1005 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1006 {
1007 HOST_WIDE_INT elmsize =
1008 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1009 if (dump_enabled_p ())
1010 {
1011 dump_printf_loc (MSG_NOTE, vect_location,
1012 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1013 dump_printf (MSG_NOTE,
1014 ". misalignment = %d. ", DR_MISALIGNMENT (dr));
1015 }
1016 if (DR_MISALIGNMENT (dr) % elmsize)
1017 {
1018 if (dump_enabled_p ())
1019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1020 "data size does not divide the misalignment.\n");
1021 return false;
1022 }
1023 }
1024
1025 if (!known_alignment_for_access_p (dr))
1026 {
1027 tree type = TREE_TYPE (DR_REF (dr));
1028 bool is_packed = not_size_aligned (DR_REF (dr));
1029 if (dump_enabled_p ())
1030 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1031 "Unknown misalignment, is_packed = %d",is_packed);
1032 if (targetm.vectorize.vector_alignment_reachable (type, is_packed))
1033 return true;
1034 else
1035 return false;
1036 }
1037
1038 return true;
1039 }
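
/* Worked example (illustrative): for an interleaved group with GROUP_SIZE 2,
a V4SI vectype (nelements = 4, elem_size = 4) and a known misalignment of
8 bytes, mis_in_elements is 2 and (4 - 2) % 2 == 0, so peeling remains a
candidate; with GROUP_SIZE 3 the prolog iteration count could not be a
multiple of the group size and the function returns false. */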
1040
1041
1042 /* Calculate the cost of the memory access represented by DR. */
1043
1044 static void
1045 vect_get_data_access_cost (struct data_reference *dr,
1046 unsigned int *inside_cost,
1047 unsigned int *outside_cost,
1048 stmt_vector_for_cost *body_cost_vec)
1049 {
1050 gimple stmt = DR_STMT (dr);
1051 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1052 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1053 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1054 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1055 int ncopies = vf / nunits;
1056
1057 if (DR_IS_READ (dr))
1058 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1059 NULL, body_cost_vec, false);
1060 else
1061 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1062
1063 if (dump_enabled_p ())
1064 dump_printf_loc (MSG_NOTE, vect_location,
1065 "vect_get_data_access_cost: inside_cost = %d, "
1066 "outside_cost = %d.", *inside_cost, *outside_cost);
1067 }
1068
1069
1070 static hashval_t
1071 vect_peeling_hash (const void *elem)
1072 {
1073 const struct _vect_peel_info *peel_info;
1074
1075 peel_info = (const struct _vect_peel_info *) elem;
1076 return (hashval_t) peel_info->npeel;
1077 }
1078
1079
1080 static int
1081 vect_peeling_hash_eq (const void *elem1, const void *elem2)
1082 {
1083 const struct _vect_peel_info *a, *b;
1084
1085 a = (const struct _vect_peel_info *) elem1;
1086 b = (const struct _vect_peel_info *) elem2;
1087 return (a->npeel == b->npeel);
1088 }
1089
1090
1091 /* Insert DR into peeling hash table with NPEEL as key. */
1092
1093 static void
1094 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1095 int npeel)
1096 {
1097 struct _vect_peel_info elem, *slot;
1098 void **new_slot;
1099 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1100
1101 elem.npeel = npeel;
1102 slot = (vect_peel_info) htab_find (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1103 &elem);
1104 if (slot)
1105 slot->count++;
1106 else
1107 {
1108 slot = XNEW (struct _vect_peel_info);
1109 slot->npeel = npeel;
1110 slot->dr = dr;
1111 slot->count = 1;
1112 new_slot = htab_find_slot (LOOP_VINFO_PEELING_HTAB (loop_vinfo), slot,
1113 INSERT);
1114 *new_slot = slot;
1115 }
1116
1117 if (!supportable_dr_alignment && !flag_vect_cost_model)
1118 slot->count += VECT_MAX_COST;
1119 }
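
/* Illustrative note (not part of the original source): the table is keyed by
the peeling amount, so two data-refs that both become aligned after peeling
2 scalar iterations share one slot with count 2. When the cost model is
disabled, a data-ref whose alignment is not supportable adds VECT_MAX_COST to
its slot, strongly biasing vect_peeling_hash_get_most_frequent towards the
peeling that aligns it. */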
1120
1121
1122 /* Traverse the peeling hash table to find the peeling option that aligns
1123 the maximum number of data accesses. */
1124
1125 static int
1126 vect_peeling_hash_get_most_frequent (void **slot, void *data)
1127 {
1128 vect_peel_info elem = (vect_peel_info) *slot;
1129 vect_peel_extended_info max = (vect_peel_extended_info) data;
1130
1131 if (elem->count > max->peel_info.count
1132 || (elem->count == max->peel_info.count
1133 && max->peel_info.npeel > elem->npeel))
1134 {
1135 max->peel_info.npeel = elem->npeel;
1136 max->peel_info.count = elem->count;
1137 max->peel_info.dr = elem->dr;
1138 }
1139
1140 return 1;
1141 }
1142
1143
1144 /* Traverse peeling hash table and calculate cost for each peeling option.
1145 Find the one with the lowest cost. */
1146
1147 static int
1148 vect_peeling_hash_get_lowest_cost (void **slot, void *data)
1149 {
1150 vect_peel_info elem = (vect_peel_info) *slot;
1151 vect_peel_extended_info min = (vect_peel_extended_info) data;
1152 int save_misalignment, dummy;
1153 unsigned int inside_cost = 0, outside_cost = 0, i;
1154 gimple stmt = DR_STMT (elem->dr);
1155 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1156 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1157 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1158 struct data_reference *dr;
1159 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1160 int single_iter_cost;
1161
1162 prologue_cost_vec.create (2);
1163 body_cost_vec.create (2);
1164 epilogue_cost_vec.create (2);
1165
1166 FOR_EACH_VEC_ELT (datarefs, i, dr)
1167 {
1168 stmt = DR_STMT (dr);
1169 stmt_info = vinfo_for_stmt (stmt);
1170 /* For interleaving, only the alignment of the first access
1171 matters. */
1172 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1173 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1174 continue;
1175
1176 save_misalignment = DR_MISALIGNMENT (dr);
1177 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1178 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1179 &body_cost_vec);
1180 SET_DR_MISALIGNMENT (dr, save_misalignment);
1181 }
1182
1183 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1184 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1185 &dummy, single_iter_cost,
1186 &prologue_cost_vec,
1187 &epilogue_cost_vec);
1188
1189 /* Prologue and epilogue costs are added to the target model later.
1190 These costs depend only on the scalar iteration cost, the
1191 number of peeling iterations finally chosen, and the number of
1192 misaligned statements. So discard the information found here. */
1193 prologue_cost_vec.release ();
1194 epilogue_cost_vec.release ();
1195
1196 if (inside_cost < min->inside_cost
1197 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1198 {
1199 min->inside_cost = inside_cost;
1200 min->outside_cost = outside_cost;
1201 min->body_cost_vec.release ();
1202 min->body_cost_vec = body_cost_vec;
1203 min->peel_info.dr = elem->dr;
1204 min->peel_info.npeel = elem->npeel;
1205 }
1206 else
1207 body_cost_vec.release ();
1208
1209 return 1;
1210 }
1211
1212
1213 /* Choose best peeling option by traversing peeling hash table and either
1214 choosing an option with the lowest cost (if cost model is enabled) or the
1215 option that aligns as many accesses as possible. */
1216
1217 static struct data_reference *
1218 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1219 unsigned int *npeel,
1220 stmt_vector_for_cost *body_cost_vec)
1221 {
1222 struct _vect_peel_extended_info res;
1223
1224 res.peel_info.dr = NULL;
1225 res.body_cost_vec = stmt_vector_for_cost();
1226
1227 if (flag_vect_cost_model)
1228 {
1229 res.inside_cost = INT_MAX;
1230 res.outside_cost = INT_MAX;
1231 htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1232 vect_peeling_hash_get_lowest_cost, &res);
1233 }
1234 else
1235 {
1236 res.peel_info.count = 0;
1237 htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1238 vect_peeling_hash_get_most_frequent, &res);
1239 }
1240
1241 *npeel = res.peel_info.npeel;
1242 *body_cost_vec = res.body_cost_vec;
1243 return res.peel_info.dr;
1244 }
1245
1246
1247 /* Function vect_enhance_data_refs_alignment
1248
1249 This pass will use loop versioning and loop peeling in order to enhance
1250 the alignment of data references in the loop.
1251
1252 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1253 original loop is to be vectorized. Any other loops that are created by
1254 the transformations performed in this pass - are not supposed to be
1255 vectorized. This restriction will be relaxed.
1256
1257 This pass will require a cost model to guide it whether to apply peeling
1258 or versioning or a combination of the two. For example, the scheme that
1259 Intel uses when given a loop with several memory accesses is as follows:
1260 choose one memory access ('p') whose alignment you want to force by doing
1261 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1262 other accesses are not necessarily aligned, or (2) use loop versioning to
1263 generate one loop in which all accesses are aligned, and another loop in
1264 which only 'p' is necessarily aligned.
1265
1266 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1267 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1268 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1269
1270 Devising a cost model is the most critical aspect of this work. It will
1271 guide us on which access to peel for, whether to use loop versioning, how
1272 many versions to create, etc. The cost model will probably consist of
1273 generic considerations as well as target specific considerations (on
1274 powerpc for example, misaligned stores are more painful than misaligned
1275 loads).
1276
1277 Here are the general steps involved in alignment enhancements:
1278
1279 -- original loop, before alignment analysis:
1280 for (i=0; i<N; i++){
1281 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1282 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1283 }
1284
1285 -- After vect_compute_data_refs_alignment:
1286 for (i=0; i<N; i++){
1287 x = q[i]; # DR_MISALIGNMENT(q) = 3
1288 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1289 }
1290
1291 -- Possibility 1: we do loop versioning:
1292 if (p is aligned) {
1293 for (i=0; i<N; i++){ # loop 1A
1294 x = q[i]; # DR_MISALIGNMENT(q) = 3
1295 p[i] = y; # DR_MISALIGNMENT(p) = 0
1296 }
1297 }
1298 else {
1299 for (i=0; i<N; i++){ # loop 1B
1300 x = q[i]; # DR_MISALIGNMENT(q) = 3
1301 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1302 }
1303 }
1304
1305 -- Possibility 2: we do loop peeling:
1306 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1307 x = q[i];
1308 p[i] = y;
1309 }
1310 for (i = 3; i < N; i++){ # loop 2A
1311 x = q[i]; # DR_MISALIGNMENT(q) = 0
1312 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1313 }
1314
1315 -- Possibility 3: combination of loop peeling and versioning:
1316 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1317 x = q[i];
1318 p[i] = y;
1319 }
1320 if (p is aligned) {
1321 for (i = 3; i<N; i++){ # loop 3A
1322 x = q[i]; # DR_MISALIGNMENT(q) = 0
1323 p[i] = y; # DR_MISALIGNMENT(p) = 0
1324 }
1325 }
1326 else {
1327 for (i = 3; i<N; i++){ # loop 3B
1328 x = q[i]; # DR_MISALIGNMENT(q) = 0
1329 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1330 }
1331 }
1332
1333 These loops are later passed to loop_transform to be vectorized. The
1334 vectorizer will use the alignment information to guide the transformation
1335 (whether to generate regular loads/stores, or with special handling for
1336 misalignment). */
1337
1338 bool
1339 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1340 {
1341 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1342 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1343 enum dr_alignment_support supportable_dr_alignment;
1344 struct data_reference *dr0 = NULL, *first_store = NULL;
1345 struct data_reference *dr;
1346 unsigned int i, j;
1347 bool do_peeling = false;
1348 bool do_versioning = false;
1349 bool stat;
1350 gimple stmt;
1351 stmt_vec_info stmt_info;
1352 int vect_versioning_for_alias_required;
1353 unsigned int npeel = 0;
1354 bool all_misalignments_unknown = true;
1355 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1356 unsigned possible_npeel_number = 1;
1357 tree vectype;
1358 unsigned int nelements, mis, same_align_drs_max = 0;
1359 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost();
1360
1361 if (dump_enabled_p ())
1362 dump_printf_loc (MSG_NOTE, vect_location,
1363 "=== vect_enhance_data_refs_alignment ===");
1364
1365 /* While cost model enhancements are expected in the future, the high level
1366 view of the code at this time is as follows:
1367
1368 A) If there is a misaligned access then see if peeling to align
1369 this access can make all data references satisfy
1370 vect_supportable_dr_alignment. If so, update data structures
1371 as needed and return true.
1372
1373 B) If peeling wasn't possible and there is a data reference with an
1374 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1375 then see if loop versioning checks can be used to make all data
1376 references satisfy vect_supportable_dr_alignment. If so, update
1377 data structures as needed and return true.
1378
1379 C) If neither peeling nor versioning were successful then return false if
1380 any data reference does not satisfy vect_supportable_dr_alignment.
1381
1382 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1383
1384 Note, Possibility 3 above (which is peeling and versioning together) is not
1385 being done at this time. */
1386
1387 /* (1) Peeling to force alignment. */
1388
1389 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1390 Considerations:
1391 + How many accesses will become aligned due to the peeling
1392 - How many accesses will become unaligned due to the peeling,
1393 and the cost of misaligned accesses.
1394 - The cost of peeling (the extra runtime checks, the increase
1395 in code size). */
1396
1397 FOR_EACH_VEC_ELT (datarefs, i, dr)
1398 {
1399 stmt = DR_STMT (dr);
1400 stmt_info = vinfo_for_stmt (stmt);
1401
1402 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1403 continue;
1404
1405 /* For interleaving, only the alignment of the first access
1406 matters. */
1407 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1408 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1409 continue;
1410
1411 /* For invariant accesses there is nothing to enhance. */
1412 if (integer_zerop (DR_STEP (dr)))
1413 continue;
1414
1415 /* Strided loads perform only component accesses, alignment is
1416 irrelevant for them. */
1417 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1418 continue;
1419
1420 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1421 do_peeling = vector_alignment_reachable_p (dr);
1422 if (do_peeling)
1423 {
1424 if (known_alignment_for_access_p (dr))
1425 {
1426 unsigned int npeel_tmp;
1427 bool negative = tree_int_cst_compare (DR_STEP (dr),
1428 size_zero_node) < 0;
1429
1430 /* Save info about DR in the hash table. */
1431 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1432 LOOP_VINFO_PEELING_HTAB (loop_vinfo) =
1433 htab_create (1, vect_peeling_hash,
1434 vect_peeling_hash_eq, free);
1435
1436 vectype = STMT_VINFO_VECTYPE (stmt_info);
1437 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1438 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1439 TREE_TYPE (DR_REF (dr))));
1440 npeel_tmp = (negative
1441 ? (mis - nelements) : (nelements - mis))
1442 & (nelements - 1);
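
/* Worked example (illustrative): for a V4SI vectype (nelements = 4, 4-byte
elements) and DR_MISALIGNMENT of 12 bytes, mis is 3 elements and, for a
forward step, npeel_tmp = (4 - 3) & 3 = 1 scalar iteration; for a negative
step it would be (3 - 4) & 3 = 3. */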
1443
1444 /* For multiple types, it is possible that the bigger type access
1445 will have more than one peeling option. E.g., a loop with two
1446 types: one of size (vector size / 4), and the other one of
1447 size (vector size / 8). The vectorization factor will be 8. If both
1448 accesses are misaligned by 3, the first one needs one scalar
1449 iteration to be aligned, and the second one needs 5. But the
1450 first one will also be aligned by peeling 5 scalar
1451 iterations, and in that case both accesses will be aligned.
1452 Hence, except for the immediate peeling amount, we also want
1453 to try adding a full vector size, as long as we don't exceed the
1454 vectorization factor.
1455 We do this automatically when the cost model is used, since we
1456 calculate the cost for every peeling option. */
1457 if (!flag_vect_cost_model)
1458 possible_npeel_number = vf / nelements;
1459
1460 /* Handle the aligned case. We may decide to align some other
1461 access, making DR unaligned. */
1462 if (DR_MISALIGNMENT (dr) == 0)
1463 {
1464 npeel_tmp = 0;
1465 if (!flag_vect_cost_model)
1466 possible_npeel_number++;
1467 }
1468
1469 for (j = 0; j < possible_npeel_number; j++)
1470 {
1471 gcc_assert (npeel_tmp <= vf);
1472 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1473 npeel_tmp += nelements;
1474 }
1475
1476 all_misalignments_unknown = false;
1477 /* Data-ref that was chosen for the case that all the
1478 misalignments are unknown is not relevant anymore, since we
1479 have a data-ref with known alignment. */
1480 dr0 = NULL;
1481 }
1482 else
1483 {
1484 /* If we don't know all the misalignment values, we prefer
1485 peeling for the data-ref that has the maximum number of data-refs
1486 with the same alignment, unless the target prefers to align
1487 stores over loads. */
1488 if (all_misalignments_unknown)
1489 {
1490 if (same_align_drs_max
1491 < STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ()
1492 || !dr0)
1493 {
1494 same_align_drs_max
1495 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1496 dr0 = dr;
1497 }
1498
1499 if (!first_store && DR_IS_WRITE (dr))
1500 first_store = dr;
1501 }
1502
1503 /* If there are both known and unknown misaligned accesses in the
1504 loop, we choose the peeling amount according to the known
1505 accesses. */
1506
1507
1508 if (!supportable_dr_alignment)
1509 {
1510 dr0 = dr;
1511 if (!first_store && DR_IS_WRITE (dr))
1512 first_store = dr;
1513 }
1514 }
1515 }
1516 else
1517 {
1518 if (!aligned_access_p (dr))
1519 {
1520 if (dump_enabled_p ())
1521 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1522 "vector alignment may not be reachable");
1523 break;
1524 }
1525 }
1526 }
1527
1528 vect_versioning_for_alias_required
1529 = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
1530
1531 /* Temporarily, if versioning for alias is required, we disable peeling
1532 until we support peeling and versioning. Often peeling for alignment
1533 will require peeling for loop-bound, which in turn requires that we
1534 know how to adjust the loop ivs after the loop. */
1535 if (vect_versioning_for_alias_required
1536 || !vect_can_advance_ivs_p (loop_vinfo)
1537 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1538 do_peeling = false;
1539
1540 if (do_peeling && all_misalignments_unknown
1541 && vect_supportable_dr_alignment (dr0, false))
1542 {
1543
1544 /* Check if the target prefers to align stores over loads, i.e., if
1545 misaligned stores are more expensive than misaligned loads (taking
1546 drs with same alignment into account). */
1547 if (first_store && DR_IS_READ (dr0))
1548 {
1549 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1550 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1551 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1552 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1553 stmt_vector_for_cost dummy;
1554 dummy.create (2);
1555
1556 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1557 &dummy);
1558 vect_get_data_access_cost (first_store, &store_inside_cost,
1559 &store_outside_cost, &dummy);
1560
1561 dummy.release ();
1562
1563 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1564 aligning the load DR0). */
1565 load_inside_penalty = store_inside_cost;
1566 load_outside_penalty = store_outside_cost;
1567 for (i = 0;
1568 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1569 DR_STMT (first_store))).iterate (i, &dr);
1570 i++)
1571 if (DR_IS_READ (dr))
1572 {
1573 load_inside_penalty += load_inside_cost;
1574 load_outside_penalty += load_outside_cost;
1575 }
1576 else
1577 {
1578 load_inside_penalty += store_inside_cost;
1579 load_outside_penalty += store_outside_cost;
1580 }
1581
1582 /* Calculate the penalty for leaving DR0 unaligned (by
1583 aligning the FIRST_STORE). */
1584 store_inside_penalty = load_inside_cost;
1585 store_outside_penalty = load_outside_cost;
1586 for (i = 0;
1587 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1588 DR_STMT (dr0))).iterate (i, &dr);
1589 i++)
1590 if (DR_IS_READ (dr))
1591 {
1592 store_inside_penalty += load_inside_cost;
1593 store_outside_penalty += load_outside_cost;
1594 }
1595 else
1596 {
1597 store_inside_penalty += store_inside_cost;
1598 store_outside_penalty += store_outside_cost;
1599 }
1600
1601 if (load_inside_penalty > store_inside_penalty
1602 || (load_inside_penalty == store_inside_penalty
1603 && load_outside_penalty > store_outside_penalty))
1604 dr0 = first_store;
1605 }
1606
1607 /* In case there are only loads with different unknown misalignments, use
1608 peeling only if it may help to align other accesses in the loop. */
1609 if (!first_store
1610 && !STMT_VINFO_SAME_ALIGN_REFS (
1611 vinfo_for_stmt (DR_STMT (dr0))).length ()
1612 && vect_supportable_dr_alignment (dr0, false)
1613 != dr_unaligned_supported)
1614 do_peeling = false;
1615 }
1616
1617 if (do_peeling && !dr0)
1618 {
1619 /* Peeling is possible, but there is no data access that is not supported
1620 unless aligned. So we try to choose the best possible peeling. */
1621
1622 /* We should get here only if there are drs with known misalignment. */
1623 gcc_assert (!all_misalignments_unknown);
1624
1625 /* Choose the best peeling from the hash table. */
1626 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1627 &body_cost_vec);
1628 if (!dr0 || !npeel)
1629 do_peeling = false;
1630 }
1631
1632 if (do_peeling)
1633 {
1634 stmt = DR_STMT (dr0);
1635 stmt_info = vinfo_for_stmt (stmt);
1636 vectype = STMT_VINFO_VECTYPE (stmt_info);
1637 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1638
1639 if (known_alignment_for_access_p (dr0))
1640 {
1641 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1642 size_zero_node) < 0;
1643 if (!npeel)
1644 {
1645 /* Since it's known at compile time, compute the number of
1646 iterations in the peeled loop (the peeling factor) for use in
1647 updating DR_MISALIGNMENT values. The peeling factor is the
1648 vectorization factor minus the misalignment as an element
1649 count. */
1650 mis = DR_MISALIGNMENT (dr0);
1651 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1652 npeel = ((negative ? mis - nelements : nelements - mis)
1653 & (nelements - 1));
1654 }
1655
1656 /* For interleaved data access every iteration accesses all the
1657 members of the group, therefore we divide the number of iterations
1658 by the group size. */
1659 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1660 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1661 npeel /= GROUP_SIZE (stmt_info);
1662
1663 if (dump_enabled_p ())
1664 dump_printf_loc (MSG_NOTE, vect_location,
1665 "Try peeling by %d", npeel);
1666 }
1667
1668 /* Ensure that all data refs can be vectorized after the peel. */
1669 FOR_EACH_VEC_ELT (datarefs, i, dr)
1670 {
1671 int save_misalignment;
1672
1673 if (dr == dr0)
1674 continue;
1675
1676 stmt = DR_STMT (dr);
1677 stmt_info = vinfo_for_stmt (stmt);
1678 /* For interleaving, only the alignment of the first access
1679 matters. */
1680 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1681 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1682 continue;
1683
1684 /* Strided loads perform only component accesses, alignment is
1685 irrelevant for them. */
1686 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1687 continue;
1688
1689 save_misalignment = DR_MISALIGNMENT (dr);
1690 vect_update_misalignment_for_peel (dr, dr0, npeel);
1691 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1692 SET_DR_MISALIGNMENT (dr, save_misalignment);
1693
1694 if (!supportable_dr_alignment)
1695 {
1696 do_peeling = false;
1697 break;
1698 }
1699 }
1700
1701 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1702 {
1703 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1704 if (!stat)
1705 do_peeling = false;
1706 else
1707 {
1708 body_cost_vec.release ();
1709 return stat;
1710 }
1711 }
1712
1713 if (do_peeling)
1714 {
1715 stmt_info_for_cost *si;
1716 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1717
1718 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1719 If the misalignment of DR_i is identical to that of dr0 then set
1720 DR_MISALIGNMENT (DR_i) to zero. If the misalignments of DR_i and
1721 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1722 by the peeling factor times the element size of DR_i (MOD the
1723 vectorization factor times the size). Otherwise, the
1724 misalignment of DR_i must be set to unknown. */
1725 FOR_EACH_VEC_ELT (datarefs, i, dr)
1726 if (dr != dr0)
1727 vect_update_misalignment_for_peel (dr, dr0, npeel);
1728
1729 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1730 if (npeel)
1731 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1732 else
1733 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
1734 SET_DR_MISALIGNMENT (dr0, 0);
1735 if (dump_enabled_p ())
1736 {
1737 dump_printf_loc (MSG_NOTE, vect_location,
1738 "Alignment of access forced using peeling.");
1739 dump_printf_loc (MSG_NOTE, vect_location,
1740 "Peeling for alignment will be applied.");
1741 }
1742 /* We've delayed passing the inside-loop peeling costs to the
1743 target cost model until we were sure peeling would happen.
1744 Do so now. */
1745 if (body_cost_vec.exists ())
1746 {
1747 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1748 {
1749 struct _stmt_vec_info *stmt_info
1750 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1751 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1752 si->misalign, vect_body);
1753 }
1754 body_cost_vec.release ();
1755 }
1756
1757 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1758 gcc_assert (stat);
1759 return stat;
1760 }
1761 }
1762
1763 body_cost_vec.release ();
1764
1765 /* (2) Versioning to force alignment. */
1766
1767 /* Try versioning if:
1768 1) flag_tree_vect_loop_version is TRUE
1769 2) optimize loop for speed
1770 3) there is at least one unsupported misaligned data ref with an unknown
1771 misalignment, and
1772 4) all misaligned data refs with a known misalignment are supported, and
1773 5) the number of runtime alignment checks is within reason. */
1774
1775 do_versioning =
1776 flag_tree_vect_loop_version
1777 && optimize_loop_nest_for_speed_p (loop)
1778 && (!loop->inner); /* FORNOW */
1779
1780 if (do_versioning)
1781 {
1782 FOR_EACH_VEC_ELT (datarefs, i, dr)
1783 {
1784 stmt = DR_STMT (dr);
1785 stmt_info = vinfo_for_stmt (stmt);
1786
1787 /* For interleaving, only the alignment of the first access
1788 matters. */
1789 if (aligned_access_p (dr)
1790 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1791 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1792 continue;
1793
1794 /* Strided loads perform only component accesses, alignment is
1795 irrelevant for them. */
1796 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1797 continue;
1798
1799 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1800
1801 if (!supportable_dr_alignment)
1802 {
1803 gimple stmt;
1804 int mask;
1805 tree vectype;
1806
1807 if (known_alignment_for_access_p (dr)
1808 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1809 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1810 {
1811 do_versioning = false;
1812 break;
1813 }
1814
1815 stmt = DR_STMT (dr);
1816 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1817 gcc_assert (vectype);
1818
1819 /* The rightmost bits of an aligned address must be zeros.
1820 Construct the mask needed for this test. For example,
1821 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1822 mask must be 15 = 0xf. */
1823 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
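/* Illustrative note (a sketch of how the recorded data is used, not a
   new mechanism): the loop-versioning code later builds a runtime test
   of roughly the form
     ((addr_1 | addr_2 | ... | addr_n) & mask) == 0
   over the addresses of all statements pushed onto
   LOOP_VINFO_MAY_MISALIGN_STMTS (see vect_create_cond_for_align_checks
   in tree-vect-loop-manip.c); the aligned loop version is entered only
   when that test succeeds.  */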
1824
1825 /* FORNOW: use the same mask to test all potentially unaligned
1826 references in the loop. The vectorizer currently supports
1827 a single vector size, see the reference to
1828 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1829 vectorization factor is computed. */
1830 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1831 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1832 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1833 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1834 DR_STMT (dr));
1835 }
1836 }
1837
1838 /* Versioning requires at least one misaligned data reference. */
1839 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1840 do_versioning = false;
1841 else if (!do_versioning)
1842 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1843 }
1844
1845 if (do_versioning)
1846 {
1847 vec<gimple> may_misalign_stmts
1848 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1849 gimple stmt;
1850
1851 /* It can now be assumed that the data references in the statements
1852 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1853 of the loop being vectorized. */
1854 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1855 {
1856 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1857 dr = STMT_VINFO_DATA_REF (stmt_info);
1858 SET_DR_MISALIGNMENT (dr, 0);
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_NOTE, vect_location,
1861 "Alignment of access forced using versioning.");
1862 }
1863
1864 if (dump_enabled_p ())
1865 dump_printf_loc (MSG_NOTE, vect_location,
1866 "Versioning for alignment will be applied.");
1867
1868 /* Peeling and versioning can't be done together at this time. */
1869 gcc_assert (! (do_peeling && do_versioning));
1870
1871 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1872 gcc_assert (stat);
1873 return stat;
1874 }
1875
1876 /* This point is reached if neither peeling nor versioning is being done. */
1877 gcc_assert (! (do_peeling || do_versioning));
1878
1879 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1880 return stat;
1881 }
1882
1883
1884 /* Function vect_find_same_alignment_drs.
1885
1886 Update group and alignment relations according to the chosen
1887 vectorization factor. */
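/* Illustrative example (assumed, not from the sources): with a
   vectorization factor of 4, the references a[i] and a[i+4] in the same
   loop have a dependence distance of 4, which is divisible by the
   vectorization factor, so both references are recorded below as having
   the same alignment.  */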
1888
1889 static void
1890 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1891 loop_vec_info loop_vinfo)
1892 {
1893 unsigned int i;
1894 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1895 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1896 struct data_reference *dra = DDR_A (ddr);
1897 struct data_reference *drb = DDR_B (ddr);
1898 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1899 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1900 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1901 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1902 lambda_vector dist_v;
1903 unsigned int loop_depth;
1904
1905 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1906 return;
1907
1908 if (dra == drb)
1909 return;
1910
1911 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1912 return;
1913
1914 /* Loop-based vectorization and known data dependence. */
1915 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1916 return;
1917
1918 /* Data-dependence analysis reports a distance vector of zero
1919 for data-references that overlap only in the first iteration
1920 but have different sign step (see PR45764).
1921 So as a sanity check require equal DR_STEP. */
1922 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1923 return;
1924
1925 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1926 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1927 {
1928 int dist = dist_v[loop_depth];
1929
1930 if (dump_enabled_p ())
1931 dump_printf_loc (MSG_NOTE, vect_location,
1932 "dependence distance = %d.", dist);
1933
1934 /* Same loop iteration. */
1935 if (dist == 0
1936 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1937 {
1938 /* Two references with distance zero have the same alignment. */
1939 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1940 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1941 if (dump_enabled_p ())
1942 {
1943 dump_printf_loc (MSG_NOTE, vect_location,
1944 "accesses have the same alignment.");
1945 dump_printf (MSG_NOTE,
1946 "dependence distance modulo vf == 0 between ");
1947 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1948 dump_printf (MSG_NOTE, " and ");
1949 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1950 }
1951 }
1952 }
1953 }
1954
1955
1956 /* Function vect_analyze_data_refs_alignment
1957
1958 Analyze the alignment of the data-references in the loop.
1959 Return FALSE if a data reference is found that cannot be vectorized. */
1960
1961 bool
1962 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1963 bb_vec_info bb_vinfo)
1964 {
1965 if (dump_enabled_p ())
1966 dump_printf_loc (MSG_NOTE, vect_location,
1967 "=== vect_analyze_data_refs_alignment ===");
1968
1969 /* Mark groups of data references with same alignment using
1970 data dependence information. */
1971 if (loop_vinfo)
1972 {
1973 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1974 struct data_dependence_relation *ddr;
1975 unsigned int i;
1976
1977 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1978 vect_find_same_alignment_drs (ddr, loop_vinfo);
1979 }
1980
1981 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1982 {
1983 if (dump_enabled_p ())
1984 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1985 "not vectorized: can't calculate alignment "
1986 "for data ref.");
1987 return false;
1988 }
1989
1990 return true;
1991 }
1992
1993
1994 /* Analyze groups of accesses: check that DR belongs to a group of
1995 accesses of legal size, step, etc. Detect gaps, single element
1996 interleaving, and other special cases. Set grouped access info.
1997 Collect groups of strided stores for further use in SLP analysis. */
1998
1999 static bool
2000 vect_analyze_group_access (struct data_reference *dr)
2001 {
2002 tree step = DR_STEP (dr);
2003 tree scalar_type = TREE_TYPE (DR_REF (dr));
2004 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2005 gimple stmt = DR_STMT (dr);
2006 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2007 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2008 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2009 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2010 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2011 bool slp_impossible = false;
2012 struct loop *loop = NULL;
2013
2014 if (loop_vinfo)
2015 loop = LOOP_VINFO_LOOP (loop_vinfo);
2016
2017 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2018 size of the interleaving group (including gaps). */
2019 groupsize = dr_step / type_size;
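/* For example (assumed values): a load of a 4-byte int with
   DR_STEP == 8 gives groupsize == 2, i.e. the access touches one
   element out of every two-element group.  */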
2020
2021 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2022 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2023 {
2024 /* Check whether this DR is part of an interleaving group and is a single
2025 element of the group that is accessed in the loop. */
2026
2027 /* Gaps are supported only for loads. STEP must be a multiple of the type
2028 size. The size of the group must be a power of 2. */
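/* A sketch of the kind of loop this detects (illustrative only):

     for (i = 0; i < n; i++)
       sum += a[2 * i];

   Only element 0 of each two-element group is read, so the access is
   treated as single element interleaving with GROUP_SIZE 2 and a gap,
   which in turn requires a scalar epilogue loop.  */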
2029 if (DR_IS_READ (dr)
2030 && (dr_step % type_size) == 0
2031 && groupsize > 0
2032 && exact_log2 (groupsize) != -1)
2033 {
2034 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2035 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2036 if (dump_enabled_p ())
2037 {
2038 dump_printf_loc (MSG_NOTE, vect_location,
2039 "Detected single element interleaving ");
2040 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2041 dump_printf (MSG_NOTE, " step ");
2042 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2043 }
2044
2045 if (loop_vinfo)
2046 {
2047 if (dump_enabled_p ())
2048 dump_printf_loc (MSG_NOTE, vect_location,
2049 "Data access with gaps requires scalar "
2050 "epilogue loop");
2051 if (loop->inner)
2052 {
2053 if (dump_enabled_p ())
2054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2055 "Peeling for outer loop is not"
2056 " supported");
2057 return false;
2058 }
2059
2060 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2061 }
2062
2063 return true;
2064 }
2065
2066 if (dump_enabled_p ())
2067 {
2068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2069 "not consecutive access ");
2070 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2071 }
2072
2073 if (bb_vinfo)
2074 {
2075 /* Mark the statement as unvectorizable. */
2076 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2077 return true;
2078 }
2079
2080 return false;
2081 }
2082
2083 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2084 {
2085 /* First stmt in the interleaving chain. Check the chain. */
2086 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2087 struct data_reference *data_ref = dr;
2088 unsigned int count = 1;
2089 tree next_step;
2090 tree prev_init = DR_INIT (data_ref);
2091 gimple prev = stmt;
2092 HOST_WIDE_INT diff, count_in_bytes, gaps = 0;
2093
2094 while (next)
2095 {
2096 /* Skip identical data-refs. In case two or more stmts share a
2097 data-ref (supported only for loads), we vectorize only the first
2098 stmt, and the rest get their vectorized loads from the first
2099 one. */
2100 if (!tree_int_cst_compare (DR_INIT (data_ref),
2101 DR_INIT (STMT_VINFO_DATA_REF (
2102 vinfo_for_stmt (next)))))
2103 {
2104 if (DR_IS_WRITE (data_ref))
2105 {
2106 if (dump_enabled_p ())
2107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2108 "Two store stmts share the same dr.");
2109 return false;
2110 }
2111
2112 /* Check that there are no load-store dependences for these loads,
2113 to prevent a load-store-load sequence to the same location. */
2114 if (GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (next))
2115 || GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (prev)))
2116 {
2117 if (dump_enabled_p ())
2118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2119 "READ_WRITE dependence in interleaving.");
2120 return false;
2121 }
2122
2123 /* For load use the same data-ref load. */
2124 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2125
2126 prev = next;
2127 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2128 continue;
2129 }
2130
2131 prev = next;
2132
2133 /* Check that all the accesses have the same STEP. */
2134 next_step = DR_STEP (STMT_VINFO_DATA_REF (vinfo_for_stmt (next)));
2135 if (tree_int_cst_compare (step, next_step))
2136 {
2137 if (dump_enabled_p ())
2138 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2139 "not consecutive access in interleaving");
2140 return false;
2141 }
2142
2143 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2144 /* Check that the distance between two accesses is equal to the type
2145 size. Otherwise, we have gaps. */
2146 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2147 - TREE_INT_CST_LOW (prev_init)) / type_size;
2148 if (diff != 1)
2149 {
2150 /* FORNOW: SLP of accesses with gaps is not supported. */
2151 slp_impossible = true;
2152 if (DR_IS_WRITE (data_ref))
2153 {
2154 if (dump_enabled_p ())
2155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2156 "interleaved store with gaps");
2157 return false;
2158 }
2159
2160 gaps += diff - 1;
2161 }
2162
2163 last_accessed_element += diff;
2164
2165 /* Store the gap from the previous member of the group. If there is no
2166 gap in the access, GROUP_GAP is always 1. */
2167 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2168
2169 prev_init = DR_INIT (data_ref);
2170 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2171 /* Count the number of data-refs in the chain. */
2172 count++;
2173 }
2174
2175 /* COUNT is the number of accesses found; multiply it by the size of
2176 the type to get COUNT_IN_BYTES. */
2177 count_in_bytes = type_size * count;
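/* Worked example (values assumed): three 4-byte loads at offsets 0, 4
   and 12 within a step of 16 give count_in_bytes == 12 and gaps == 1,
   so count_in_bytes + gaps * type_size == 16 == dr_step and the check
   below passes.  */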
2178
2179 /* Check that the size of the interleaving (including gaps) is not
2180 greater than STEP. */
2181 if (dr_step && dr_step < count_in_bytes + gaps * type_size)
2182 {
2183 if (dump_enabled_p ())
2184 {
2185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2186 "interleaving size is greater than step for ");
2187 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dr));
2188 }
2189 return false;
2190 }
2191
2192 /* Check that the size of the interleaving is equal to STEP for stores,
2193 i.e., that there are no gaps. */
2194 if (dr_step && dr_step != count_in_bytes)
2195 {
2196 if (DR_IS_READ (dr))
2197 {
2198 slp_impossible = true;
2199 /* There is a gap after the last load in the group. This gap is the
2200 difference between the groupsize and the number of elements.
2201 When there is no gap, this difference should be 0. */
2202 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2203 }
2204 else
2205 {
2206 if (dump_enabled_p ())
2207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2208 "interleaved store with gaps");
2209 return false;
2210 }
2211 }
2212
2213 /* Check that STEP is a multiple of type size. */
2214 if (dr_step && (dr_step % type_size) != 0)
2215 {
2216 if (dump_enabled_p ())
2217 {
2218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2219 "step is not a multiple of type size: step ");
2220 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2221 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2222 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2223 TYPE_SIZE_UNIT (scalar_type));
2224 }
2225 return false;
2226 }
2227
2228 if (groupsize == 0)
2229 groupsize = count;
2230
2231 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2232 if (dump_enabled_p ())
2233 dump_printf_loc (MSG_NOTE, vect_location,
2234 "Detected interleaving of size %d", (int)groupsize);
2235
2236 /* SLP: create an SLP data structure for every interleaving group of
2237 stores for further analysis in vect_analyse_slp. */
2238 if (DR_IS_WRITE (dr) && !slp_impossible)
2239 {
2240 if (loop_vinfo)
2241 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2242 if (bb_vinfo)
2243 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2244 }
2245
2246 /* There is a gap at the end of the group. */
2247 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2248 {
2249 if (dump_enabled_p ())
2250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2251 "Data access with gaps requires scalar "
2252 "epilogue loop");
2253 if (loop->inner)
2254 {
2255 if (dump_enabled_p ())
2256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2257 "Peeling for outer loop is not supported");
2258 return false;
2259 }
2260
2261 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2262 }
2263 }
2264
2265 return true;
2266 }
2267
2268
2269 /* Analyze the access pattern of the data-reference DR.
2270 In case of non-consecutive accesses call vect_analyze_group_access() to
2271 analyze groups of accesses. */
2272
2273 static bool
2274 vect_analyze_data_ref_access (struct data_reference *dr)
2275 {
2276 tree step = DR_STEP (dr);
2277 tree scalar_type = TREE_TYPE (DR_REF (dr));
2278 gimple stmt = DR_STMT (dr);
2279 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2280 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2281 struct loop *loop = NULL;
2282
2283 if (loop_vinfo)
2284 loop = LOOP_VINFO_LOOP (loop_vinfo);
2285
2286 if (loop_vinfo && !step)
2287 {
2288 if (dump_enabled_p ())
2289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2290 "bad data-ref access in loop");
2291 return false;
2292 }
2293
2294 /* Allow invariant loads in loops. */
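/* E.g. (illustrative): for (i = 0; i < n; i++) a[i] = *p;
   where *p is not modified in the loop: DR_STEP of the load is zero,
   so it is accepted here as an invariant load, whereas an invariant
   store would be rejected.  */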
2295 if (loop_vinfo && integer_zerop (step))
2296 {
2297 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2298 return DR_IS_READ (dr);
2299 }
2300
2301 if (loop && nested_in_vect_loop_p (loop, stmt))
2302 {
2303 /* Interleaved accesses are not yet supported within outer-loop
2304 vectorization for references in the inner-loop. */
2305 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2306
2307 /* For the rest of the analysis we use the outer-loop step. */
2308 step = STMT_VINFO_DR_STEP (stmt_info);
2309 if (integer_zerop (step))
2310 {
2311 if (dump_enabled_p ())
2312 dump_printf_loc (MSG_NOTE, vect_location,
2313 "zero step in outer loop.");
2314 if (DR_IS_READ (dr))
2315 return true;
2316 else
2317 return false;
2318 }
2319 }
2320
2321 /* Consecutive? */
2322 if (TREE_CODE (step) == INTEGER_CST)
2323 {
2324 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2325 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2326 || (dr_step < 0
2327 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2328 {
2329 /* Mark that it is not interleaving. */
2330 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2331 return true;
2332 }
2333 }
2334
2335 if (loop && nested_in_vect_loop_p (loop, stmt))
2336 {
2337 if (dump_enabled_p ())
2338 dump_printf_loc (MSG_NOTE, vect_location,
2339 "grouped access in outer loop.");
2340 return false;
2341 }
2342
2343 /* Assume this is a DR handled by the non-constant strided load case. */
2344 if (TREE_CODE (step) != INTEGER_CST)
2345 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2346
2347 /* Not consecutive access - check if it's a part of interleaving group. */
2348 return vect_analyze_group_access (dr);
2349 }
2350
2351 /* Compare two data-references DRA and DRB so that data-refs that may
2352 form an interleaving group sort next to each other. */
2353
2354 static int
2355 dr_group_sort_cmp (const void *dra_, const void *drb_)
2356 {
2357 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2358 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2359 hashval_t h1, h2;
2360 int cmp;
2361
2362 /* Stabilize sort. */
2363 if (dra == drb)
2364 return 0;
2365
2366 /* Ordering of DRs according to base. */
2367 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2368 {
2369 h1 = iterative_hash_expr (DR_BASE_ADDRESS (dra), 0);
2370 h2 = iterative_hash_expr (DR_BASE_ADDRESS (drb), 0);
2371 if (h1 != h2)
2372 return h1 < h2 ? -1 : 1;
2373 }
2374
2375 /* And according to DR_OFFSET. */
2376 if (!dr_equal_offsets_p (dra, drb))
2377 {
2378 h1 = iterative_hash_expr (DR_OFFSET (dra), 0);
2379 h2 = iterative_hash_expr (DR_OFFSET (drb), 0);
2380 if (h1 != h2)
2381 return h1 < h2 ? -1 : 1;
2382 }
2383
2384 /* Put reads before writes. */
2385 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2386 return DR_IS_READ (dra) ? -1 : 1;
2387
2388 /* Then sort by access size. */
2389 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2390 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2391 {
2392 h1 = iterative_hash_expr (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 0);
2393 h2 = iterative_hash_expr (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0);
2394 if (h1 != h2)
2395 return h1 < h2 ? -1 : 1;
2396 }
2397
2398 /* And by step. */
2399 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2400 {
2401 h1 = iterative_hash_expr (DR_STEP (dra), 0);
2402 h2 = iterative_hash_expr (DR_STEP (drb), 0);
2403 if (h1 != h2)
2404 return h1 < h2 ? -1 : 1;
2405 }
2406
2407 /* Then sort by DR_INIT. In case of identical DRs sort by stmt UID. */
2408 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2409 if (cmp == 0)
2410 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2411 return cmp;
2412 }
2413
2414 /* Function vect_analyze_data_ref_accesses.
2415
2416 Analyze the access pattern of all the data references in the loop.
2417
2418 FORNOW: the only access pattern that is considered vectorizable is a
2419 simple step 1 (consecutive) access.
2420
2421 FORNOW: handle only arrays and pointer accesses. */
2422
2423 bool
2424 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2425 {
2426 unsigned int i;
2427 vec<data_reference_p> datarefs;
2428 struct data_reference *dr;
2429
2430 if (dump_enabled_p ())
2431 dump_printf_loc (MSG_NOTE, vect_location,
2432 "=== vect_analyze_data_ref_accesses ===");
2433
2434 if (loop_vinfo)
2435 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2436 else
2437 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2438
2439 if (datarefs.is_empty ())
2440 return true;
2441
2442 /* Sort the array of datarefs to make building the interleaving chains
2443 linear. */
2444 qsort (datarefs.address(), datarefs.length (),
2445 sizeof (data_reference_p), dr_group_sort_cmp);
2446
2447 /* Build the interleaving chains. */
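/* Illustrative example (not from the sources): for

     for (i = 0; i < n; i++)
       {
         p[2 * i] = x;
         p[2 * i + 1] = y;
       }

   the two stores share DR_BASE_ADDRESS, DR_OFFSET and DR_STEP and
   differ only in DR_INIT, so after sorting they end up adjacent and
   are linked below into one group with GROUP_FIRST_ELEMENT pointing
   at the store to p[2 * i].  */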
2448 for (i = 0; i < datarefs.length () - 1;)
2449 {
2450 data_reference_p dra = datarefs[i];
2451 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2452 stmt_vec_info lastinfo = NULL;
2453 for (i = i + 1; i < datarefs.length (); ++i)
2454 {
2455 data_reference_p drb = datarefs[i];
2456 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2457
2458 /* ??? Imperfect sorting (non-compatible types, non-modulo
2459 accesses, same accesses) can lead to a group being artificially
2460 split here, as we don't just skip over those. If it really
2461 matters we can push those to a worklist and re-iterate
2462 over them. Then we can just skip ahead to the next DR here. */
2463
2464 /* Check that the data-refs have same first location (except init)
2465 and they are both either store or load (not load and store). */
2466 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2467 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2468 DR_BASE_ADDRESS (drb), 0)
2469 || !dr_equal_offsets_p (dra, drb))
2470 break;
2471
2472 /* Check that the data-refs have the same constant size and step. */
2473 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2474 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2475 if (!host_integerp (sza, 1)
2476 || !host_integerp (szb, 1)
2477 || !tree_int_cst_equal (sza, szb)
2478 || !host_integerp (DR_STEP (dra), 0)
2479 || !host_integerp (DR_STEP (drb), 0)
2480 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2481 break;
2482
2483 /* Do not place the same access in the interleaving chain twice. */
2484 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2485 break;
2486
2487 /* Check the types are compatible.
2488 ??? We don't distinguish this during sorting. */
2489 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2490 TREE_TYPE (DR_REF (drb))))
2491 break;
2492
2493 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2494 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2495 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2496 gcc_assert (init_a < init_b);
2497
2498 /* If init_b == init_a + the size of the type * k, we have an
2499 interleaving, and DRA is accessed before DRB. */
2500 HOST_WIDE_INT type_size_a = TREE_INT_CST_LOW (sza);
2501 if ((init_b - init_a) % type_size_a != 0)
2502 break;
2503
2504 /* Require the step (if not zero) to be greater than the difference between
2505 the data-refs' inits; this splits groups into suitable sizes. */
2506 HOST_WIDE_INT step = TREE_INT_CST_LOW (DR_STEP (dra));
2507 if (step != 0 && step <= (init_b - init_a))
2508 break;
2509
2510 if (dump_enabled_p ())
2511 {
2512 dump_printf_loc (MSG_NOTE, vect_location,
2513 "Detected interleaving ");
2514 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2515 dump_printf (MSG_NOTE, " and ");
2516 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2517 }
2518
2519 /* Link the found element into the group list. */
2520 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2521 {
2522 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2523 lastinfo = stmtinfo_a;
2524 }
2525 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2526 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2527 lastinfo = stmtinfo_b;
2528 }
2529 }
2530
2531 FOR_EACH_VEC_ELT (datarefs, i, dr)
2532 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2533 && !vect_analyze_data_ref_access (dr))
2534 {
2535 if (dump_enabled_p ())
2536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2537 "not vectorized: complicated access pattern.");
2538
2539 if (bb_vinfo)
2540 {
2541 /* Mark the statement as not vectorizable. */
2542 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2543 continue;
2544 }
2545 else
2546 return false;
2547 }
2548
2549 return true;
2550 }
2551
2552 /* Function vect_prune_runtime_alias_test_list.
2553
2554 Prune a list of ddrs to be tested at run-time by versioning for alias.
2555 Return FALSE if the resulting list of ddrs is longer than allowed by
2556 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2557
2558 bool
2559 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2560 {
2561 vec<ddr_p> ddrs =
2562 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2563 unsigned i, j;
2564
2565 if (dump_enabled_p ())
2566 dump_printf_loc (MSG_NOTE, vect_location,
2567 "=== vect_prune_runtime_alias_test_list ===");
2568
2569 for (i = 0; i < ddrs.length (); )
2570 {
2571 bool found;
2572 ddr_p ddr_i;
2573
2574 ddr_i = ddrs[i];
2575 found = false;
2576
2577 for (j = 0; j < i; j++)
2578 {
2579 ddr_p ddr_j = ddrs[j];
2580
2581 if (vect_vfa_range_equal (ddr_i, ddr_j))
2582 {
2583 if (dump_enabled_p ())
2584 {
2585 dump_printf_loc (MSG_NOTE, vect_location,
2586 "found equal ranges ");
2587 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_i)));
2588 dump_printf (MSG_NOTE, ", ");
2589 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_i)));
2590 dump_printf (MSG_NOTE, " and ");
2591 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_j)));
2592 dump_printf (MSG_NOTE, ", ");
2593 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_j)));
2594 }
2595 found = true;
2596 break;
2597 }
2598 }
2599
2600 if (found)
2601 {
2602 ddrs.ordered_remove (i);
2603 continue;
2604 }
2605 i++;
2606 }
2607
2608 if (ddrs.length () >
2609 (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2610 {
2611 if (dump_enabled_p ())
2612 {
2613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2614 "disable versioning for alias - max number of "
2615 "generated checks exceeded.");
2616 }
2617
2618 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
2619
2620 return false;
2621 }
2622
2623 return true;
2624 }
2625
2626 /* Check whether a non-affine read in stmt is suitable for gather load
2627 and if so, return a builtin decl for that operation. */
2628
2629 tree
2630 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2631 tree *offp, int *scalep)
2632 {
2633 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2634 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2635 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2636 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2637 tree offtype = NULL_TREE;
2638 tree decl, base, off;
2639 enum machine_mode pmode;
2640 int punsignedp, pvolatilep;
2641
2642 /* The gather builtins need an address of the form
2643 loop_invariant + vector * {1, 2, 4, 8}
2644 or
2645 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2646 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2647 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2648 multiplications and additions in it. To get a vector, we need
2649 a single SSA_NAME that will be defined in the loop and will
2650 contain everything that is not loop invariant and that can be
2651 vectorized. The following code attempts to find such a preexisting
2652 SSA_NAME OFF and put the loop invariants into a tree BASE
2653 that can be gimplified before the loop. */
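/* Illustrative example (names assumed): for a read such as

     for (i = 0; i < n; i++)
       sum += data[idx[i]];

   the intent is to end up with BASE being the (loop-invariant) address
   of data, OFF being the SSA_NAME holding the (possibly sign-extended)
   value of idx[i], and SCALE == sizeof (data[0]), which matches the
   loop_invariant + vector * scale form above.  */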
2654 base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
2655 &pmode, &punsignedp, &pvolatilep, false);
2656 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2657
2658 if (TREE_CODE (base) == MEM_REF)
2659 {
2660 if (!integer_zerop (TREE_OPERAND (base, 1)))
2661 {
2662 if (off == NULL_TREE)
2663 {
2664 double_int moff = mem_ref_offset (base);
2665 off = double_int_to_tree (sizetype, moff);
2666 }
2667 else
2668 off = size_binop (PLUS_EXPR, off,
2669 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2670 }
2671 base = TREE_OPERAND (base, 0);
2672 }
2673 else
2674 base = build_fold_addr_expr (base);
2675
2676 if (off == NULL_TREE)
2677 off = size_zero_node;
2678
2679 /* If base is not loop invariant, then either OFF is 0, in which case we
2680 start with just the constant offset in the loop-invariant BASE and
2681 continue with base as OFF, or else we give up.
2682 We could handle that case by gimplifying the addition of base + off
2683 into some SSA_NAME and use that as off, but for now punt. */
2684 if (!expr_invariant_in_loop_p (loop, base))
2685 {
2686 if (!integer_zerop (off))
2687 return NULL_TREE;
2688 off = base;
2689 base = size_int (pbitpos / BITS_PER_UNIT);
2690 }
2691 /* Otherwise put base + constant offset into the loop invariant BASE
2692 and continue with OFF. */
2693 else
2694 {
2695 base = fold_convert (sizetype, base);
2696 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
2697 }
2698
2699 /* OFF at this point may be either a SSA_NAME or some tree expression
2700 from get_inner_reference. Try to peel off loop invariants from it
2701 into BASE as long as possible. */
2702 STRIP_NOPS (off);
2703 while (offtype == NULL_TREE)
2704 {
2705 enum tree_code code;
2706 tree op0, op1, add = NULL_TREE;
2707
2708 if (TREE_CODE (off) == SSA_NAME)
2709 {
2710 gimple def_stmt = SSA_NAME_DEF_STMT (off);
2711
2712 if (expr_invariant_in_loop_p (loop, off))
2713 return NULL_TREE;
2714
2715 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
2716 break;
2717
2718 op0 = gimple_assign_rhs1 (def_stmt);
2719 code = gimple_assign_rhs_code (def_stmt);
2720 op1 = gimple_assign_rhs2 (def_stmt);
2721 }
2722 else
2723 {
2724 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
2725 return NULL_TREE;
2726 code = TREE_CODE (off);
2727 extract_ops_from_tree (off, &code, &op0, &op1);
2728 }
2729 switch (code)
2730 {
2731 case POINTER_PLUS_EXPR:
2732 case PLUS_EXPR:
2733 if (expr_invariant_in_loop_p (loop, op0))
2734 {
2735 add = op0;
2736 off = op1;
2737 do_add:
2738 add = fold_convert (sizetype, add);
2739 if (scale != 1)
2740 add = size_binop (MULT_EXPR, add, size_int (scale));
2741 base = size_binop (PLUS_EXPR, base, add);
2742 continue;
2743 }
2744 if (expr_invariant_in_loop_p (loop, op1))
2745 {
2746 add = op1;
2747 off = op0;
2748 goto do_add;
2749 }
2750 break;
2751 case MINUS_EXPR:
2752 if (expr_invariant_in_loop_p (loop, op1))
2753 {
2754 add = fold_convert (sizetype, op1);
2755 add = size_binop (MINUS_EXPR, size_zero_node, add);
2756 off = op0;
2757 goto do_add;
2758 }
2759 break;
2760 case MULT_EXPR:
2761 if (scale == 1 && host_integerp (op1, 0))
2762 {
2763 scale = tree_low_cst (op1, 0);
2764 off = op0;
2765 continue;
2766 }
2767 break;
2768 case SSA_NAME:
2769 off = op0;
2770 continue;
2771 CASE_CONVERT:
2772 if (!POINTER_TYPE_P (TREE_TYPE (op0))
2773 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2774 break;
2775 if (TYPE_PRECISION (TREE_TYPE (op0))
2776 == TYPE_PRECISION (TREE_TYPE (off)))
2777 {
2778 off = op0;
2779 continue;
2780 }
2781 if (TYPE_PRECISION (TREE_TYPE (op0))
2782 < TYPE_PRECISION (TREE_TYPE (off)))
2783 {
2784 off = op0;
2785 offtype = TREE_TYPE (off);
2786 STRIP_NOPS (off);
2787 continue;
2788 }
2789 break;
2790 default:
2791 break;
2792 }
2793 break;
2794 }
2795
2796 /* If at the end OFF still isn't a SSA_NAME or isn't
2797 defined in the loop, punt. */
2798 if (TREE_CODE (off) != SSA_NAME
2799 || expr_invariant_in_loop_p (loop, off))
2800 return NULL_TREE;
2801
2802 if (offtype == NULL_TREE)
2803 offtype = TREE_TYPE (off);
2804
2805 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
2806 offtype, scale);
2807 if (decl == NULL_TREE)
2808 return NULL_TREE;
2809
2810 if (basep)
2811 *basep = base;
2812 if (offp)
2813 *offp = off;
2814 if (scalep)
2815 *scalep = scale;
2816 return decl;
2817 }
2818
2819 /* Function vect_analyze_data_refs.
2820
2821 Find all the data references in the loop or basic block.
2822
2823 The general structure of the analysis of data refs in the vectorizer is as
2824 follows:
2825 1- vect_analyze_data_refs(loop/bb): call
2826 compute_data_dependences_for_loop/bb to find and analyze all data-refs
2827 in the loop/bb and their dependences.
2828 2- vect_analyze_dependences(): apply dependence testing using ddrs.
2829 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
2830 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
2831
2832 */
2833
2834 bool
2835 vect_analyze_data_refs (loop_vec_info loop_vinfo,
2836 bb_vec_info bb_vinfo,
2837 int *min_vf)
2838 {
2839 struct loop *loop = NULL;
2840 basic_block bb = NULL;
2841 unsigned int i;
2842 vec<data_reference_p> datarefs;
2843 struct data_reference *dr;
2844 tree scalar_type;
2845
2846 if (dump_enabled_p ())
2847 dump_printf_loc (MSG_NOTE, vect_location,
2848 "=== vect_analyze_data_refs ===\n");
2849
2850 if (loop_vinfo)
2851 {
2852 loop = LOOP_VINFO_LOOP (loop_vinfo);
2853 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo))
2854 || find_data_references_in_loop
2855 (loop, &LOOP_VINFO_DATAREFS (loop_vinfo)))
2856 {
2857 if (dump_enabled_p ())
2858 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2859 "not vectorized: loop contains function calls"
2860 " or data references that cannot be analyzed");
2861 return false;
2862 }
2863
2864 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2865 }
2866 else
2867 {
2868 gimple_stmt_iterator gsi;
2869
2870 bb = BB_VINFO_BB (bb_vinfo);
2871 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2872 {
2873 gimple stmt = gsi_stmt (gsi);
2874 if (!find_data_references_in_stmt (NULL, stmt,
2875 &BB_VINFO_DATAREFS (bb_vinfo)))
2876 {
2877 /* Mark the rest of the basic-block as unvectorizable. */
2878 for (; !gsi_end_p (gsi); gsi_next (&gsi))
2879 {
2880 stmt = gsi_stmt (gsi);
2881 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
2882 }
2883 break;
2884 }
2885 }
2886
2887 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2888 }
2889
2890 /* Go through the data-refs, check that the analysis succeeded. Update
2891 pointer from stmt_vec_info struct to DR and vectype. */
2892
2893 FOR_EACH_VEC_ELT (datarefs, i, dr)
2894 {
2895 gimple stmt;
2896 stmt_vec_info stmt_info;
2897 tree base, offset, init;
2898 bool gather = false;
2899 int vf;
2900
2901 if (!dr || !DR_REF (dr))
2902 {
2903 if (dump_enabled_p ())
2904 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2905 "not vectorized: unhandled data-ref ");
2906 return false;
2907 }
2908
2909 stmt = DR_STMT (dr);
2910 stmt_info = vinfo_for_stmt (stmt);
2911
2912 /* Check that analysis of the data-ref succeeded. */
2913 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
2914 || !DR_STEP (dr))
2915 {
2916 /* If the target supports vector gather loads, see if they can
2917 be used. */
2918 if (loop_vinfo
2919 && DR_IS_READ (dr)
2920 && !TREE_THIS_VOLATILE (DR_REF (dr))
2921 && targetm.vectorize.builtin_gather != NULL
2922 && !nested_in_vect_loop_p (loop, stmt))
2923 {
2924 struct data_reference *newdr
2925 = create_data_ref (NULL, loop_containing_stmt (stmt),
2926 DR_REF (dr), stmt, true);
2927 gcc_assert (newdr != NULL && DR_REF (newdr));
2928 if (DR_BASE_ADDRESS (newdr)
2929 && DR_OFFSET (newdr)
2930 && DR_INIT (newdr)
2931 && DR_STEP (newdr)
2932 && integer_zerop (DR_STEP (newdr)))
2933 {
2934 dr = newdr;
2935 gather = true;
2936 }
2937 else
2938 free_data_ref (newdr);
2939 }
2940
2941 if (!gather)
2942 {
2943 if (dump_enabled_p ())
2944 {
2945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2946 "not vectorized: data ref analysis "
2947 "failed ");
2948 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2949 }
2950
2951 if (bb_vinfo)
2952 break;
2953
2954 return false;
2955 }
2956 }
2957
2958 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
2959 {
2960 if (dump_enabled_p ())
2961 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2962 "not vectorized: base addr of dr is a "
2963 "constant");
2964
2965 if (bb_vinfo)
2966 break;
2967
2968 if (gather)
2969 free_data_ref (dr);
2970 return false;
2971 }
2972
2973 if (TREE_THIS_VOLATILE (DR_REF (dr)))
2974 {
2975 if (dump_enabled_p ())
2976 {
2977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2978 "not vectorized: volatile type ");
2979 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2980 }
2981
2982 if (bb_vinfo)
2983 break;
2984
2985 return false;
2986 }
2987
2988 if (stmt_can_throw_internal (stmt))
2989 {
2990 if (dump_enabled_p ())
2991 {
2992 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2993 "not vectorized: statement can throw an "
2994 "exception ");
2995 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2996 }
2997
2998 if (bb_vinfo)
2999 break;
3000
3001 if (gather)
3002 free_data_ref (dr);
3003 return false;
3004 }
3005
3006 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3007 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3008 {
3009 if (dump_enabled_p ())
3010 {
3011 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3012 "not vectorized: statement is bitfield "
3013 "access ");
3014 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3015 }
3016
3017 if (bb_vinfo)
3018 break;
3019
3020 if (gather)
3021 free_data_ref (dr);
3022 return false;
3023 }
3024
3025 base = unshare_expr (DR_BASE_ADDRESS (dr));
3026 offset = unshare_expr (DR_OFFSET (dr));
3027 init = unshare_expr (DR_INIT (dr));
3028
3029 if (is_gimple_call (stmt))
3030 {
3031 if (dump_enabled_p ())
3032 {
3033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3034 "not vectorized: dr in a call ");
3035 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3036 }
3037
3038 if (bb_vinfo)
3039 break;
3040
3041 if (gather)
3042 free_data_ref (dr);
3043 return false;
3044 }
3045
3046 /* Update DR field in stmt_vec_info struct. */
3047
3048 /* If the dataref is in an inner-loop of the loop that is considered
3049 for vectorization, we also want to analyze the access relative to
3050 the outer-loop (DR contains information only relative to the
3051 inner-most enclosing loop). We do that by building a reference to the
3052 first location accessed by the inner-loop, and analyze it relative to
3053 the outer-loop. */
3054 if (loop && nested_in_vect_loop_p (loop, stmt))
3055 {
3056 tree outer_step, outer_base, outer_init;
3057 HOST_WIDE_INT pbitsize, pbitpos;
3058 tree poffset;
3059 enum machine_mode pmode;
3060 int punsignedp, pvolatilep;
3061 affine_iv base_iv, offset_iv;
3062 tree dinit;
3063
3064 /* Build a reference to the first location accessed by the
3065 inner-loop: *(BASE+INIT). (The first location is actually
3066 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3067 tree inner_base = build_fold_indirect_ref
3068 (fold_build_pointer_plus (base, init));
3069
3070 if (dump_enabled_p ())
3071 {
3072 dump_printf_loc (MSG_NOTE, vect_location,
3073 "analyze in outer-loop: ");
3074 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3075 }
3076
3077 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3078 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3079 gcc_assert (outer_base != NULL_TREE);
3080
3081 if (pbitpos % BITS_PER_UNIT != 0)
3082 {
3083 if (dump_enabled_p ())
3084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3085 "failed: bit offset alignment.\n");
3086 return false;
3087 }
3088
3089 outer_base = build_fold_addr_expr (outer_base);
3090 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3091 &base_iv, false))
3092 {
3093 if (dump_enabled_p ())
3094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3095 "failed: evolution of base is not affine.\n");
3096 return false;
3097 }
3098
3099 if (offset)
3100 {
3101 if (poffset)
3102 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3103 poffset);
3104 else
3105 poffset = offset;
3106 }
3107
3108 if (!poffset)
3109 {
3110 offset_iv.base = ssize_int (0);
3111 offset_iv.step = ssize_int (0);
3112 }
3113 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3114 &offset_iv, false))
3115 {
3116 if (dump_enabled_p ())
3117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3118 "evolution of offset is not affine.\n");
3119 return false;
3120 }
3121
3122 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3123 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3124 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3125 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3126 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3127
3128 outer_step = size_binop (PLUS_EXPR,
3129 fold_convert (ssizetype, base_iv.step),
3130 fold_convert (ssizetype, offset_iv.step));
3131
3132 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3133 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3134 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3135 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3136 STMT_VINFO_DR_OFFSET (stmt_info) =
3137 fold_convert (ssizetype, offset_iv.base);
3138 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3139 size_int (highest_pow2_factor (offset_iv.base));
3140
3141 if (dump_enabled_p ())
3142 {
3143 dump_printf_loc (MSG_NOTE, vect_location,
3144 "\touter base_address: ");
3145 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3146 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3147 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3148 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3149 STMT_VINFO_DR_OFFSET (stmt_info));
3150 dump_printf (MSG_NOTE,
3151 "\n\touter constant offset from base address: ");
3152 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3153 STMT_VINFO_DR_INIT (stmt_info));
3154 dump_printf (MSG_NOTE, "\n\touter step: ");
3155 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3156 STMT_VINFO_DR_STEP (stmt_info));
3157 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3158 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3159 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3160 }
3161 }
3162
3163 if (STMT_VINFO_DATA_REF (stmt_info))
3164 {
3165 if (dump_enabled_p ())
3166 {
3167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3168 "not vectorized: more than one data ref "
3169 "in stmt: ");
3170 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3171 }
3172
3173 if (bb_vinfo)
3174 break;
3175
3176 if (gather)
3177 free_data_ref (dr);
3178 return false;
3179 }
3180
3181 STMT_VINFO_DATA_REF (stmt_info) = dr;
3182
3183 /* Set vectype for STMT. */
3184 scalar_type = TREE_TYPE (DR_REF (dr));
3185 STMT_VINFO_VECTYPE (stmt_info) =
3186 get_vectype_for_scalar_type (scalar_type);
3187 if (!STMT_VINFO_VECTYPE (stmt_info))
3188 {
3189 if (dump_enabled_p ())
3190 {
3191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3192 "not vectorized: no vectype for stmt: ");
3193 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3194 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3195 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3196 scalar_type);
3197 }
3198
3199 if (bb_vinfo)
3200 break;
3201
3202 if (gather)
3203 {
3204 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3205 free_data_ref (dr);
3206 }
3207 return false;
3208 }
3209 else
3210 {
3211 if (dump_enabled_p ())
3212 {
3213 dump_printf_loc (MSG_NOTE, vect_location,
3214 "got vectype for stmt: ");
3215 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
3216 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3217 STMT_VINFO_VECTYPE (stmt_info));
3218 }
3219 }
3220
3221 /* Adjust the minimal vectorization factor according to the
3222 vector type. */
3223 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3224 if (vf > *min_vf)
3225 *min_vf = vf;
3226
3227 if (gather)
3228 {
3229 tree off;
3230
3231 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3232 if (gather
3233 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3234 gather = false;
3235 if (!gather)
3236 {
3237 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3238 free_data_ref (dr);
3239 if (dump_enabled_p ())
3240 {
3241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3242 "not vectorized: not suitable for gather "
3243 "load ");
3244 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3245 }
3246 return false;
3247 }
3248
3249 datarefs[i] = dr;
3250 STMT_VINFO_GATHER_P (stmt_info) = true;
3251 }
3252 else if (loop_vinfo
3253 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3254 {
3255 if (nested_in_vect_loop_p (loop, stmt)
3256 || !DR_IS_READ (dr))
3257 {
3258 if (dump_enabled_p ())
3259 {
3260 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3261 "not vectorized: not suitable for strided "
3262 "load ");
3263 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3264 }
3265 return false;
3266 }
3267 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3268 }
3269 }
3270
3271 /* If we stopped analysis at the first dataref we could not analyze
3272 when trying to vectorize a basic-block, mark the rest of the datarefs
3273 as not vectorizable and truncate the vector of datarefs. That
3274 avoids spending useless time analyzing their dependences. */
3275 if (i != datarefs.length ())
3276 {
3277 gcc_assert (bb_vinfo != NULL);
3278 for (unsigned j = i; j < datarefs.length (); ++j)
3279 {
3280 data_reference_p dr = datarefs[j];
3281 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3282 free_data_ref (dr);
3283 }
3284 datarefs.truncate (i);
3285 }
3286
3287 return true;
3288 }
3289
3290
3291 /* Function vect_get_new_vect_var.
3292
3293 Returns a name for a new variable. The current naming scheme prefixes
3294 vectorizer generated variables with "vect", "stmp" or "vectp" (depending
3295 on the value of VAR_KIND), followed by "_" and NAME when NAME is
3296 provided. */
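/* For example (illustrative): vect_get_new_vect_var (ptr_type,
   vect_pointer_var, "a") yields a new temporary named along the lines
   of "vectp_a", while passing NULL as NAME just uses the bare
   prefix.  */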
3297
3298 tree
3299 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3300 {
3301 const char *prefix;
3302 tree new_vect_var;
3303
3304 switch (var_kind)
3305 {
3306 case vect_simple_var:
3307 prefix = "vect";
3308 break;
3309 case vect_scalar_var:
3310 prefix = "stmp";
3311 break;
3312 case vect_pointer_var:
3313 prefix = "vectp";
3314 break;
3315 default:
3316 gcc_unreachable ();
3317 }
3318
3319 if (name)
3320 {
3321 char* tmp = concat (prefix, "_", name, NULL);
3322 new_vect_var = create_tmp_reg (type, tmp);
3323 free (tmp);
3324 }
3325 else
3326 new_vect_var = create_tmp_reg (type, prefix);
3327
3328 return new_vect_var;
3329 }
3330
3331
3332 /* Function vect_create_addr_base_for_vector_ref.
3333
3334 Create an expression that computes the address of the first memory location
3335 that will be accessed for a data reference.
3336
3337 Input:
3338 STMT: The statement containing the data reference.
3339 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3340 OFFSET: Optional. If supplied, it is added to the initial address.
3341 LOOP: Specify the loop-nest relative to which the address should be computed.
3342 For example, when the dataref is in an inner-loop nested in an
3343 outer-loop that is now being vectorized, LOOP can be either the
3344 outer-loop, or the inner-loop. The first memory location accessed
3345 by the following dataref ('in' points to short):
3346
3347 for (i=0; i<N; i++)
3348 for (j=0; j<M; j++)
3349 s += in[i+j]
3350
3351 is as follows:
3352 if LOOP=i_loop: &in (relative to i_loop)
3353 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3354
3355 Output:
3356 1. Return an SSA_NAME whose value is the address of the memory location of
3357 the first vector of the data reference.
3358 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3359 these statement(s) which define the returned SSA_NAME.
3360
3361 FORNOW: We are only handling array accesses with step 1. */
3362
3363 tree
3364 vect_create_addr_base_for_vector_ref (gimple stmt,
3365 gimple_seq *new_stmt_list,
3366 tree offset,
3367 struct loop *loop)
3368 {
3369 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3370 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3371 tree data_ref_base;
3372 const char *base_name;
3373 tree addr_base;
3374 tree dest;
3375 gimple_seq seq = NULL;
3376 tree base_offset;
3377 tree init;
3378 tree vect_ptr_type;
3379 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3380 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3381
3382 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3383 {
3384 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3385
3386 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3387
3388 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3389 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3390 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3391 }
3392 else
3393 {
3394 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3395 base_offset = unshare_expr (DR_OFFSET (dr));
3396 init = unshare_expr (DR_INIT (dr));
3397 }
3398
3399 if (loop_vinfo)
3400 base_name = get_name (data_ref_base);
3401 else
3402 {
3403 base_offset = ssize_int (0);
3404 init = ssize_int (0);
3405 base_name = get_name (DR_REF (dr));
3406 }
3407
3408 /* Create base_offset */
3409 base_offset = size_binop (PLUS_EXPR,
3410 fold_convert (sizetype, base_offset),
3411 fold_convert (sizetype, init));
3412
3413 if (offset)
3414 {
3415 offset = fold_build2 (MULT_EXPR, sizetype,
3416 fold_convert (sizetype, offset), step);
3417 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3418 base_offset, offset);
3419 }
3420
3421 /* base + base_offset */
3422 if (loop_vinfo)
3423 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3424 else
3425 {
3426 addr_base = build1 (ADDR_EXPR,
3427 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3428 unshare_expr (DR_REF (dr)));
3429 }
3430
3431 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3432 addr_base = fold_convert (vect_ptr_type, addr_base);
3433 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3434 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3435 gimple_seq_add_seq (new_stmt_list, seq);
3436
3437 if (DR_PTR_INFO (dr)
3438 && TREE_CODE (addr_base) == SSA_NAME)
3439 {
3440 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3441 if (offset)
3442 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3443 }
3444
3445 if (dump_enabled_p ())
3446 {
3447 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3448 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3449 }
3450
3451 return addr_base;
3452 }
3453
3454
3455 /* Function vect_create_data_ref_ptr.
3456
3457 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3458 location accessed in the loop by STMT, along with the def-use update
3459 chain to appropriately advance the pointer through the loop iterations.
3460 Also set aliasing information for the pointer. This pointer is used by
3461 the callers to this function to create a memory reference expression for
3462 vector load/store access.
3463
3464 Input:
3465 1. STMT: a stmt that references memory. Expected to be of the form
3466 GIMPLE_ASSIGN <name, data-ref> or
3467 GIMPLE_ASSIGN <data-ref, name>.
3468 2. AGGR_TYPE: the type of the reference, which should be either a vector
3469 or an array.
3470 3. AT_LOOP: the loop where the vector memref is to be created.
3471 4. OFFSET (optional): an offset to be added to the initial address accessed
3472 by the data-ref in STMT.
3473 5. BSI: location where the new stmts are to be placed if there is no loop
3474 6. ONLY_INIT: indicate if ap is to be updated in the loop, or remain
3475 pointing to the initial address.
3476
3477 Output:
3478 1. Declare a new ptr to vector_type, and have it point to the base of the
3479 data reference (the initial address accessed by the data reference).
3480 For example, for vector of type V8HI, the following code is generated:
3481
3482 v8hi *ap;
3483 ap = (v8hi *)initial_address;
3484
3485 if OFFSET is not supplied:
3486 initial_address = &a[init];
3487 if OFFSET is supplied:
3488 initial_address = &a[init + OFFSET];
3489
3490 Return the initial_address in INITIAL_ADDRESS.
3491
3492 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3493 update the pointer in each iteration of the loop.
3494
3495 Return the increment stmt that updates the pointer in PTR_INCR.
3496
3497 3. Set INV_P to true if the access pattern of the data reference in the
3498 vectorized loop is invariant. Set it to false otherwise.
3499
3500 4. Return the pointer. */
3501
3502 tree
3503 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3504 tree offset, tree *initial_address,
3505 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3506 bool only_init, bool *inv_p)
3507 {
3508 const char *base_name;
3509 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3510 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3511 struct loop *loop = NULL;
3512 bool nested_in_vect_loop = false;
3513 struct loop *containing_loop = NULL;
3514 tree aggr_ptr_type;
3515 tree aggr_ptr;
3516 tree new_temp;
3517 gimple vec_stmt;
3518 gimple_seq new_stmt_list = NULL;
3519 edge pe = NULL;
3520 basic_block new_bb;
3521 tree aggr_ptr_init;
3522 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3523 tree aptr;
3524 gimple_stmt_iterator incr_gsi;
3525 bool insert_after;
3526 bool negative;
3527 tree indx_before_incr, indx_after_incr;
3528 gimple incr;
3529 tree step;
3530 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3531
3532 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3533 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3534
3535 if (loop_vinfo)
3536 {
3537 loop = LOOP_VINFO_LOOP (loop_vinfo);
3538 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3539 containing_loop = (gimple_bb (stmt))->loop_father;
3540 pe = loop_preheader_edge (loop);
3541 }
3542 else
3543 {
3544 gcc_assert (bb_vinfo);
3545 only_init = true;
3546 *ptr_incr = NULL;
3547 }
3548
3549 /* Check the step (evolution) of the load in LOOP, and record
3550 whether it's invariant. */
3551 if (nested_in_vect_loop)
3552 step = STMT_VINFO_DR_STEP (stmt_info);
3553 else
3554 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
3555
3556 if (tree_int_cst_compare (step, size_zero_node) == 0)
3557 *inv_p = true;
3558 else
3559 *inv_p = false;
3560 negative = tree_int_cst_compare (step, size_zero_node) < 0;
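/* Note: a negative step means the access walks through memory in
   decreasing address order; the pointer IV created in step (3) below
   is then given a negated step.  */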
3561
3562 /* Create an expression for the first address accessed by this load
3563 in LOOP. */
3564 base_name = get_name (DR_BASE_ADDRESS (dr));
3565
3566 if (dump_enabled_p ())
3567 {
3568 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
3569 dump_printf_loc (MSG_NOTE, vect_location,
3570 "create %s-pointer variable to type: ",
3571 tree_code_name[(int) TREE_CODE (aggr_type)]);
3572 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
3573 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
3574 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
3575 else if (TREE_CODE (dr_base_type) == VECTOR_TYPE)
3576 dump_printf (MSG_NOTE, " vectorizing a vector ref: ");
3577 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
3578 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
3579 else
3580 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
3581 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
3582 }
3583
3584 /* (1) Create the new aggregate-pointer variable.
3585 Vector and array types inherit the alias set of their component
3586 type by default, so we need to use a ref-all pointer if the data
3587 reference does not conflict with the created aggregate data
3588 reference because it is not addressable. */
3589 bool need_ref_all = false;
3590 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3591 get_alias_set (DR_REF (dr))))
3592 need_ref_all = true;
3593 /* Likewise for any of the data references in the stmt group. */
3594 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
3595 {
3596 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
3597 do
3598 {
3599 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
3600 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
3601 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3602 get_alias_set (DR_REF (sdr))))
3603 {
3604 need_ref_all = true;
3605 break;
3606 }
3607 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
3608 }
3609 while (orig_stmt);
3610 }
3611 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
3612 need_ref_all);
3613 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
3614
3615
3616 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
3617 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
3618 def-use update cycles for the pointer: one relative to the outer-loop
3619 (LOOP), which is what step (3) below does. The other is relative
3620 to the inner-loop (which is the inner-most loop containing the dataref),
3621 and this is done by step (4) below.
3622
3623 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
3624 inner-most loop, and so step (3) works the same, and step (4) is
3625 redundant. Step (3) creates the following:
3626
3627 vp0 = &base_addr;
3628 LOOP: vp1 = phi(vp0,vp2)
3629 ...
3630 ...
3631 vp2 = vp1 + step
3632 goto LOOP
3633
3634 If there is an inner-loop nested in LOOP, then step (4) will also be
3635 applied, and an additional update in the inner-loop will be created:
3636
3637 vp0 = &base_addr;
3638 LOOP: vp1 = phi(vp0,vp2)
3639 ...
3640 inner: vp3 = phi(vp1,vp4)
3641 vp4 = vp3 + inner_step
3642 if () goto inner
3643 ...
3644 vp2 = vp1 + step
3645 if () goto LOOP */
3646
3647 /* (2) Calculate the initial address of the aggregate-pointer, and set
3648 the aggregate-pointer to point to it before the loop. */
3649
3650 /* Create: &(base[init_val+offset]) in the loop preheader. */
3651
3652 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
3653 offset, loop);
3654 if (new_stmt_list)
3655 {
3656 if (pe)
3657 {
3658 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
3659 gcc_assert (!new_bb);
3660 }
3661 else
3662 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
3663 }
3664
3665 *initial_address = new_temp;
3666
3667 /* Create: p = (aggr_type *) initial_base */
3668 if (TREE_CODE (new_temp) != SSA_NAME
3669 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
3670 {
3671 vec_stmt = gimple_build_assign (aggr_ptr,
3672 fold_convert (aggr_ptr_type, new_temp));
3673 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
3674 /* Copy the points-to information if it exists. */
3675 if (DR_PTR_INFO (dr))
3676 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
3677 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
3678 if (pe)
3679 {
3680 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
3681 gcc_assert (!new_bb);
3682 }
3683 else
3684 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
3685 }
3686 else
3687 aggr_ptr_init = new_temp;
3688
3689 /* (3) Handle the updating of the aggregate-pointer inside the loop.
3690 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
3691 inner-loop nested in LOOP (during outer-loop vectorization). */
3692
3693 /* No update in loop is required. */
3694 if (only_init && (!loop_vinfo || at_loop == loop))
3695 aptr = aggr_ptr_init;
3696 else
3697 {
3698 /* The step of the aggregate pointer is the type size. */
3699 tree step = TYPE_SIZE_UNIT (aggr_type);
3700 /* One exception to the above is when the scalar step of the load in
3701 LOOP is zero. In this case the step here is also zero. */
3702 if (*inv_p)
3703 step = size_zero_node;
3704 else if (negative)
3705 step = fold_build1 (NEGATE_EXPR, TREE_TYPE (step), step);
3706
3707 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
3708
3709 create_iv (aggr_ptr_init,
3710 fold_convert (aggr_ptr_type, step),
3711 aggr_ptr, loop, &incr_gsi, insert_after,
3712 &indx_before_incr, &indx_after_incr);
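/* create_iv has now emitted the loop-header PHI (INDX_BEFORE_INCR) and
   the increment statement defining INDX_AFTER_INCR, corresponding to
   vp1 and vp2 in the scheme sketched above.  */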
3713 incr = gsi_stmt (incr_gsi);
3714 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3715
3716 /* Copy the points-to information if it exists. */
3717 if (DR_PTR_INFO (dr))
3718 {
3719 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3720 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3721 }
3722 if (ptr_incr)
3723 *ptr_incr = incr;
3724
3725 aptr = indx_before_incr;
3726 }
3727
3728 if (!nested_in_vect_loop || only_init)
3729 return aptr;
3730
3731
3732 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
3733 nested in LOOP, if it exists. */
3734
3735 gcc_assert (nested_in_vect_loop);
3736 if (!only_init)
3737 {
3738 standard_iv_increment_position (containing_loop, &incr_gsi,
3739 &insert_after);
3740 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
3741 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
3742 &indx_after_incr);
3743 incr = gsi_stmt (incr_gsi);
3744 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3745
3746 /* Copy the points-to information if it exists. */
3747 if (DR_PTR_INFO (dr))
3748 {
3749 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3750 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3751 }
3752 if (ptr_incr)
3753 *ptr_incr = incr;
3754
3755 return indx_before_incr;
3756 }
3757 else
3758 gcc_unreachable ();
3759 }
3760
3761
3762 /* Function bump_vector_ptr
3763
3764 Increment a pointer (to a vector type) by vector-size. If requested,
3765 i.e. if PTR_INCR is given, then also connect the new increment stmt
3766 to the existing def-use update-chain of the pointer, by modifying
3767 the PTR_INCR as illustrated below:
3768
3769 The pointer def-use update-chain before this function:
3770 DATAREF_PTR = phi (p_0, p_2)
3771 ....
3772 PTR_INCR: p_2 = DATAREF_PTR + step
3773
3774 The pointer def-use update-chain after this function:
3775 DATAREF_PTR = phi (p_0, p_2)
3776 ....
3777 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
3778 ....
3779 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
3780
3781 Input:
3782 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
3783 in the loop.
3784 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
3785 the loop. The increment amount across iterations is expected
3786 to be vector_size.
3787 BSI - location where the new update stmt is to be placed.
3788 STMT - the original scalar memory-access stmt that is being vectorized.
3789 BUMP - optional. The offset by which to bump the pointer. If not given,
3790 the offset is assumed to be vector_size.
3791
3792 Output: Return NEW_DATAREF_PTR as illustrated above.
3793
3794 */
3795
3796 tree
3797 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
3798 gimple stmt, tree bump)
3799 {
3800 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3801 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3802 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3803 tree update = TYPE_SIZE_UNIT (vectype);
3804 gimple incr_stmt;
3805 ssa_op_iter iter;
3806 use_operand_p use_p;
3807 tree new_dataref_ptr;
3808
3809 if (bump)
3810 update = bump;
3811
3812 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
3813 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
3814 dataref_ptr, update);
3815 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
3816
3817 /* Copy the points-to information if it exists. */
3818 if (DR_PTR_INFO (dr))
3819 {
3820 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
3821 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
3822 }
3823
3824 if (!ptr_incr)
3825 return new_dataref_ptr;
3826
3827 /* Update the vector-pointer's cross-iteration increment. */
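/* Each use operand of PTR_INCR is either DATAREF_PTR, which gets
   redirected to NEW_DATAREF_PTR, or the increment amount, which is
   expected to equal UPDATE (hence the assert below).  */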
3828 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
3829 {
3830 tree use = USE_FROM_PTR (use_p);
3831
3832 if (use == dataref_ptr)
3833 SET_USE (use_p, new_dataref_ptr);
3834 else
3835 gcc_assert (tree_int_cst_compare (use, update) == 0);
3836 }
3837
3838 return new_dataref_ptr;
3839 }
3840
3841
3842 /* Function vect_create_destination_var.
3843
3844 Create a new temporary of type VECTYPE. */
3845
3846 tree
3847 vect_create_destination_var (tree scalar_dest, tree vectype)
3848 {
3849 tree vec_dest;
3850 const char *name;
3851 char *new_name;
3852 tree type;
3853 enum vect_var_kind kind;
3854
3855 kind = vectype ? vect_simple_var : vect_scalar_var;
3856 type = vectype ? vectype : TREE_TYPE (scalar_dest);
3857
3858 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
3859
3860 name = get_name (scalar_dest);
3861 if (name)
3862 asprintf (&new_name, "%s_%u", name, SSA_NAME_VERSION (scalar_dest));
3863 else
3864 asprintf (&new_name, "_%u", SSA_NAME_VERSION (scalar_dest));
3865 vec_dest = vect_get_new_vect_var (type, kind, new_name);
3866 free (new_name);
3867
3868 return vec_dest;
3869 }
3870
3871 /* Function vect_grouped_store_supported.
3872
3873 Returns TRUE if interleave high and interleave low permutations
3874 are supported, and FALSE otherwise. */
3875
3876 bool
3877 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
3878 {
3879 enum machine_mode mode = TYPE_MODE (vectype);
3880
3881 /* vect_permute_store_chain requires the group size to be a power of two. */
3882 if (exact_log2 (count) == -1)
3883 {
3884 if (dump_enabled_p ())
3885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3886 "the size of the group of accesses"
3887 " is not a power of 2");
3888 return false;
3889 }
3890
3891 /* Check that the permutation is supported. */
3892 if (VECTOR_MODE_P (mode))
3893 {
3894 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3895 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3896 for (i = 0; i < nelt / 2; i++)
3897 {
3898 sel[i * 2] = i;
3899 sel[i * 2 + 1] = i + nelt;
3900 }
3901 if (can_vec_perm_p (mode, false, sel))
3902 {
3903 for (i = 0; i < nelt; i++)
3904 sel[i] += nelt / 2;
3905 if (can_vec_perm_p (mode, false, sel))
3906 return true;
3907 }
3908 }
3909
3910 if (dump_enabled_p ())
3911 dump_printf (MSG_MISSED_OPTIMIZATION,
3912 "interleave op not supported by target.");
3913 return false;
3914 }
3915
3916
3917 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
3918 type VECTYPE. */
3919
3920 bool
3921 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
3922 {
3923 return vect_lanes_optab_supported_p ("vec_store_lanes",
3924 vec_store_lanes_optab,
3925 vectype, count);
3926 }
3927
3928
3929 /* Function vect_permute_store_chain.
3930
3931 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
3932 a power of 2, generate interleave_high/low stmts to reorder the data
3933 correctly for the stores. Return the final references for stores in
3934 RESULT_CHAIN.
3935
3936 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3937 The input is 4 vectors each containing 8 elements. We assign a number to
3938 each element, the input sequence is:
3939
3940 1st vec: 0 1 2 3 4 5 6 7
3941 2nd vec: 8 9 10 11 12 13 14 15
3942 3rd vec: 16 17 18 19 20 21 22 23
3943 4th vec: 24 25 26 27 28 29 30 31
3944
3945 The output sequence should be:
3946
3947 1st vec: 0 8 16 24 1 9 17 25
3948 2nd vec: 2 10 18 26 3 11 19 27
3949 3rd vec: 4 12 20 28 5 13 21 29
3950 4th vec: 6 14 22 30 7 15 23 31
3951
3952 i.e., we interleave the contents of the four vectors in their order.
3953
3954 We use interleave_high/low instructions to create such output. The input of
3955 each interleave_high/low operation is two vectors:
3956 1st vec 2nd vec
3957 0 1 2 3 4 5 6 7
3958 the even elements of the result vector are obtained left-to-right from the
3959 high/low elements of the first vector. The odd elements of the result are
3960 obtained left-to-right from the high/low elements of the second vector.
3961 The output of interleave_high will be: 0 4 1 5
3962 and of interleave_low: 2 6 3 7
3963
3964
3965 The permutation is done in log LENGTH stages. In each stage interleave_high
3966 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
3967 where the first argument is taken from the first half of DR_CHAIN and the
3968 second argument from its second half.
3969 In our example,
3970
3971 I1: interleave_high (1st vec, 3rd vec)
3972 I2: interleave_low (1st vec, 3rd vec)
3973 I3: interleave_high (2nd vec, 4th vec)
3974 I4: interleave_low (2nd vec, 4th vec)
3975
3976 The output for the first stage is:
3977
3978 I1: 0 16 1 17 2 18 3 19
3979 I2: 4 20 5 21 6 22 7 23
3980 I3: 8 24 9 25 10 26 11 27
3981 I4: 12 28 13 29 14 30 15 31
3982
3983 The output of the second stage, i.e. the final result is:
3984
3985 I1: 0 8 16 24 1 9 17 25
3986 I2: 2 10 18 26 3 11 19 27
3987 I3: 4 12 20 28 5 13 21 29
3988 I4: 6 14 22 30 7 15 23 31. */
3989
3990 void
3991 vect_permute_store_chain (vec<tree> dr_chain,
3992 unsigned int length,
3993 gimple stmt,
3994 gimple_stmt_iterator *gsi,
3995 vec<tree> *result_chain)
3996 {
3997 tree vect1, vect2, high, low;
3998 gimple perm_stmt;
3999 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4000 tree perm_mask_low, perm_mask_high;
4001 unsigned int i, n;
4002 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4003 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4004
4005 result_chain->quick_grow (length);
4006 memcpy (result_chain->address (), dr_chain.address (),
4007 length * sizeof (tree));
4008
4009 for (i = 0, n = nelt / 2; i < n; i++)
4010 {
4011 sel[i * 2] = i;
4012 sel[i * 2 + 1] = i + nelt;
4013 }
4014 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4015 gcc_assert (perm_mask_high != NULL);
4016
4017 for (i = 0; i < nelt; i++)
4018 sel[i] += nelt / 2;
4019 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4020 gcc_assert (perm_mask_low != NULL);
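/* For example, assuming nelt == 8: the "high" selector built above is
   { 0, 8, 1, 9, 2, 10, 3, 11 } and the "low" selector is
   { 4, 12, 5, 13, 6, 14, 7, 15 }, i.e. they interleave the first and
   second halves of the two input vectors, respectively.  */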
4021
4022 for (i = 0, n = exact_log2 (length); i < n; i++)
4023 {
4024 for (j = 0; j < length/2; j++)
4025 {
4026 vect1 = dr_chain[j];
4027 vect2 = dr_chain[j+length/2];
4028
4029 /* Create interleaving stmt:
4030 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4031 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4032 perm_stmt
4033 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4034 vect1, vect2, perm_mask_high);
4035 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4036 (*result_chain)[2*j] = high;
4037
4038 /* Create interleaving stmt:
4039 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4040 nelt*3/2+1, ...}> */
4041 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4042 perm_stmt
4043 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4044 vect1, vect2, perm_mask_low);
4045 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4046 (*result_chain)[2*j+1] = low;
4047 }
4048 memcpy (dr_chain.address (), result_chain->address (),
4049 length * sizeof (tree));
4050 }
4051 }
4052
4053 /* Function vect_setup_realignment
4054
4055 This function is called when vectorizing an unaligned load using
4056 the dr_explicit_realign[_optimized] scheme.
4057 This function generates the following code at the loop prolog:
4058
4059 p = initial_addr;
4060 x msq_init = *(floor(p)); # prolog load
4061 realignment_token = call target_builtin;
4062 loop:
4063 x msq = phi (msq_init, ---)
4064
4065 The stmts marked with x are generated only for the case of
4066 dr_explicit_realign_optimized.
4067
4068 The code above sets up a new (vector) pointer, pointing to the first
4069 location accessed by STMT, and a "floor-aligned" load using that pointer.
4070 It also generates code to compute the "realignment-token" (if the relevant
4071 target hook was defined), and creates a phi-node at the loop-header bb
4072 whose arguments are the result of the prolog-load (created by this
4073 function) and the result of a load that takes place in the loop (to be
4074 created by the caller to this function).
4075
4076 For the case of dr_explicit_realign_optimized:
4077 The caller to this function uses the phi-result (msq) to create the
4078 realignment code inside the loop, and sets up the missing phi argument,
4079 as follows:
4080 loop:
4081 msq = phi (msq_init, lsq)
4082 lsq = *(floor(p')); # load in loop
4083 result = realign_load (msq, lsq, realignment_token);
4084
4085 For the case of dr_explicit_realign:
4086 loop:
4087 msq = *(floor(p)); # load in loop
4088 p' = p + (VS-1);
4089 lsq = *(floor(p')); # load in loop
4090 result = realign_load (msq, lsq, realignment_token);
4091
4092 Input:
4093 STMT - (scalar) load stmt to be vectorized. This load accesses
4094 a memory location that may be unaligned.
4095 BSI - place where new code is to be inserted.
4096 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4097 is used.
4098
4099 Output:
4100 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4101 target hook, if defined.
4102 Return value - the result of the loop-header phi node. */
4103
4104 tree
4105 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4106 tree *realignment_token,
4107 enum dr_alignment_support alignment_support_scheme,
4108 tree init_addr,
4109 struct loop **at_loop)
4110 {
4111 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4112 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4113 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4114 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4115 struct loop *loop = NULL;
4116 edge pe = NULL;
4117 tree scalar_dest = gimple_assign_lhs (stmt);
4118 tree vec_dest;
4119 gimple inc;
4120 tree ptr;
4121 tree data_ref;
4122 gimple new_stmt;
4123 basic_block new_bb;
4124 tree msq_init = NULL_TREE;
4125 tree new_temp;
4126 gimple phi_stmt;
4127 tree msq = NULL_TREE;
4128 gimple_seq stmts = NULL;
4129 bool inv_p;
4130 bool compute_in_loop = false;
4131 bool nested_in_vect_loop = false;
4132 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4133 struct loop *loop_for_initial_load = NULL;
4134
4135 if (loop_vinfo)
4136 {
4137 loop = LOOP_VINFO_LOOP (loop_vinfo);
4138 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4139 }
4140
4141 gcc_assert (alignment_support_scheme == dr_explicit_realign
4142 || alignment_support_scheme == dr_explicit_realign_optimized);
4143
4144 /* We need to generate three things:
4145 1. the misalignment computation
4146 2. the extra vector load (for the optimized realignment scheme).
4147 3. the phi node for the two vectors from which the realignment is
4148 done (for the optimized realignment scheme). */
4149
4150 /* 1. Determine where to generate the misalignment computation.
4151
4152 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4153 calculation will be generated by this function, outside the loop (in the
4154 preheader). Otherwise, INIT_ADDR has already been computed for us by the
4155 caller, inside the loop.
4156
4157 Background: If the misalignment remains fixed throughout the iterations of
4158 the loop, then both realignment schemes are applicable, and also the
4159 misalignment computation can be done outside LOOP. This is because we are
4160 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4161 are a multiple of VS (the Vector Size), and therefore the misalignment in
4162 different vectorized LOOP iterations is always the same.
4163 The problem arises only if the memory access is in an inner-loop nested
4164 inside LOOP, which is now being vectorized using outer-loop vectorization.
4165 This is the only case when the misalignment of the memory access may not
4166 remain fixed throughout the iterations of the inner-loop (as explained in
4167 detail in vect_supportable_dr_alignment). In this case, not only is the
4168 optimized realignment scheme not applicable, but also the misalignment
4169 computation (and generation of the realignment token that is passed to
4170 REALIGN_LOAD) has to be done inside the loop.
4171
4172 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4173 or not, which in turn determines if the misalignment is computed inside
4174 the inner-loop, or outside LOOP. */
4175
4176 if (init_addr != NULL_TREE || !loop_vinfo)
4177 {
4178 compute_in_loop = true;
4179 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4180 }
4181
4182
4183 /* 2. Determine where to generate the extra vector load.
4184
4185 For the optimized realignment scheme, instead of generating two vector
4186 loads in each iteration, we generate a single extra vector load in the
4187 preheader of the loop, and in each iteration reuse the result of the
4188 vector load from the previous iteration. In case the memory access is in
4189 an inner-loop nested inside LOOP, which is now being vectorized using
4190 outer-loop vectorization, we need to determine whether this initial vector
4191 load should be generated at the preheader of the inner-loop, or can be
4192 generated at the preheader of LOOP. If the memory access has no evolution
4193 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4194 to be generated inside LOOP (in the preheader of the inner-loop). */
4195
4196 if (nested_in_vect_loop)
4197 {
4198 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4199 bool invariant_in_outerloop =
4200 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4201 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4202 }
4203 else
4204 loop_for_initial_load = loop;
4205 if (at_loop)
4206 *at_loop = loop_for_initial_load;
4207
4208 if (loop_for_initial_load)
4209 pe = loop_preheader_edge (loop_for_initial_load);
4210
4211 /* 3. For the case of the optimized realignment, create the first vector
4212 load at the loop preheader. */
4213
4214 if (alignment_support_scheme == dr_explicit_realign_optimized)
4215 {
4216 /* Create msq_init = *(floor(p1)) in the loop preheader */
4217
4218 gcc_assert (!compute_in_loop);
4219 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4220 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4221 NULL_TREE, &init_addr, NULL, &inc,
4222 true, &inv_p);
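/* Compute floor(p1): mask off the low-order address bits, i.e.
   new_temp = ptr & -TYPE_ALIGN_UNIT (vectype).  */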
4223 new_temp = copy_ssa_name (ptr, NULL);
4224 new_stmt = gimple_build_assign_with_ops
4225 (BIT_AND_EXPR, new_temp, ptr,
4226 build_int_cst (TREE_TYPE (ptr),
4227 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4228 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4229 gcc_assert (!new_bb);
4230 data_ref
4231 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4232 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4233 new_stmt = gimple_build_assign (vec_dest, data_ref);
4234 new_temp = make_ssa_name (vec_dest, new_stmt);
4235 gimple_assign_set_lhs (new_stmt, new_temp);
4236 if (pe)
4237 {
4238 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4239 gcc_assert (!new_bb);
4240 }
4241 else
4242 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4243
4244 msq_init = gimple_assign_lhs (new_stmt);
4245 }
4246
4247 /* 4. Create realignment token using a target builtin, if available.
4248 It is done either inside the containing loop, or before LOOP (as
4249 determined above). */
4250
4251 if (targetm.vectorize.builtin_mask_for_load)
4252 {
4253 tree builtin_decl;
4254
4255 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4256 if (!init_addr)
4257 {
4258 /* Generate the INIT_ADDR computation outside LOOP. */
4259 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4260 NULL_TREE, loop);
4261 if (loop)
4262 {
4263 pe = loop_preheader_edge (loop);
4264 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4265 gcc_assert (!new_bb);
4266 }
4267 else
4268 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4269 }
4270
4271 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4272 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4273 vec_dest =
4274 vect_create_destination_var (scalar_dest,
4275 gimple_call_return_type (new_stmt));
4276 new_temp = make_ssa_name (vec_dest, new_stmt);
4277 gimple_call_set_lhs (new_stmt, new_temp);
4278
4279 if (compute_in_loop)
4280 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4281 else
4282 {
4283 /* Generate the misalignment computation outside LOOP. */
4284 pe = loop_preheader_edge (loop);
4285 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4286 gcc_assert (!new_bb);
4287 }
4288
4289 *realignment_token = gimple_call_lhs (new_stmt);
4290
4291 /* The result of the CALL_EXPR to this builtin is determined from
4292 the value of the parameter and no global variables are touched
4293 which makes the builtin a "const" function. Requiring the
4294 builtin to have the "const" attribute makes it unnecessary
4295 to call mark_call_clobbered. */
4296 gcc_assert (TREE_READONLY (builtin_decl));
4297 }
4298
4299 if (alignment_support_scheme == dr_explicit_realign)
4300 return msq;
4301
4302 gcc_assert (!compute_in_loop);
4303 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4304
4305
4306 /* 5. Create msq = phi <msq_init, lsq> in loop */
4307
4308 pe = loop_preheader_edge (containing_loop);
4309 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4310 msq = make_ssa_name (vec_dest, NULL);
4311 phi_stmt = create_phi_node (msq, containing_loop->header);
4312 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4313
4314 return msq;
4315 }
4316
4317
4318 /* Function vect_grouped_load_supported.
4319
4320 Returns TRUE if even and odd permutations are supported,
4321 and FALSE otherwise. */
4322
4323 bool
4324 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4325 {
4326 enum machine_mode mode = TYPE_MODE (vectype);
4327
4328 /* vect_permute_load_chain requires the group size to be a power of two. */
4329 if (exact_log2 (count) == -1)
4330 {
4331 if (dump_enabled_p ())
4332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4333 "the size of the group of accesses"
4334 " is not a power of 2");
4335 return false;
4336 }
4337
4338 /* Check that the permutation is supported. */
4339 if (VECTOR_MODE_P (mode))
4340 {
4341 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4342 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4343
4344 for (i = 0; i < nelt; i++)
4345 sel[i] = i * 2;
4346 if (can_vec_perm_p (mode, false, sel))
4347 {
4348 for (i = 0; i < nelt; i++)
4349 sel[i] = i * 2 + 1;
4350 if (can_vec_perm_p (mode, false, sel))
4351 return true;
4352 }
4353 }
4354
4355 if (dump_enabled_p ())
4356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4357 "extract even/odd not supported by target");
4358 return false;
4359 }
4360
4361 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4362 type VECTYPE. */
4363
4364 bool
4365 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4366 {
4367 return vect_lanes_optab_supported_p ("vec_load_lanes",
4368 vec_load_lanes_optab,
4369 vectype, count);
4370 }
4371
4372 /* Function vect_permute_load_chain.
4373
4374 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4375 a power of 2, generate extract_even/odd stmts to reorder the input data
4376 correctly. Return the final references for loads in RESULT_CHAIN.
4377
4378 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4379 The input is 4 vectors each containing 8 elements. We assign a number to each
4380 element; the input sequence is:
4381
4382 1st vec: 0 1 2 3 4 5 6 7
4383 2nd vec: 8 9 10 11 12 13 14 15
4384 3rd vec: 16 17 18 19 20 21 22 23
4385 4th vec: 24 25 26 27 28 29 30 31
4386
4387 The output sequence should be:
4388
4389 1st vec: 0 4 8 12 16 20 24 28
4390 2nd vec: 1 5 9 13 17 21 25 29
4391 3rd vec: 2 6 10 14 18 22 26 30
4392 4th vec: 3 7 11 15 19 23 27 31
4393
4394 i.e., the first output vector should contain the first elements of each
4395 interleaving group, etc.
4396
4397 We use extract_even/odd instructions to create such output. The input of
4398 each extract_even/odd operation is two vectors
4399 1st vec 2nd vec
4400 0 1 2 3 4 5 6 7
4401
4402 and the output is the vector of extracted even/odd elements. The output of
4403 extract_even will be: 0 2 4 6
4404 and of extract_odd: 1 3 5 7
4405
4406
4407 The permutation is done in log LENGTH stages. In each stage extract_even
4408 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4409 their order. In our example,
4410
4411 E1: extract_even (1st vec, 2nd vec)
4412 E2: extract_odd (1st vec, 2nd vec)
4413 E3: extract_even (3rd vec, 4th vec)
4414 E4: extract_odd (3rd vec, 4th vec)
4415
4416 The output for the first stage will be:
4417
4418 E1: 0 2 4 6 8 10 12 14
4419 E2: 1 3 5 7 9 11 13 15
4420 E3: 16 18 20 22 24 26 28 30
4421 E4: 17 19 21 23 25 27 29 31
4422
4423 In order to proceed and create the correct sequence for the next stage (or
4424 for the correct output, if the second stage is the last one, as in our
4425 example), we first put the output of extract_even operation and then the
4426 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4427 The input for the second stage is:
4428
4429 1st vec (E1): 0 2 4 6 8 10 12 14
4430 2nd vec (E3): 16 18 20 22 24 26 28 30
4431 3rd vec (E2): 1 3 5 7 9 11 13 15
4432 4th vec (E4): 17 19 21 23 25 27 29 31
4433
4434 The output of the second stage:
4435
4436 E1: 0 4 8 12 16 20 24 28
4437 E2: 2 6 10 14 18 22 26 30
4438 E3: 1 5 9 13 17 21 25 29
4439 E4: 3 7 11 15 19 23 27 31
4440
4441 And RESULT_CHAIN after reordering:
4442
4443 1st vec (E1): 0 4 8 12 16 20 24 28
4444 2nd vec (E3): 1 5 9 13 17 21 25 29
4445 3rd vec (E2): 2 6 10 14 18 22 26 30
4446 4th vec (E4): 3 7 11 15 19 23 27 31. */
4447
4448 static void
4449 vect_permute_load_chain (vec<tree> dr_chain,
4450 unsigned int length,
4451 gimple stmt,
4452 gimple_stmt_iterator *gsi,
4453 vec<tree> *result_chain)
4454 {
4455 tree data_ref, first_vect, second_vect;
4456 tree perm_mask_even, perm_mask_odd;
4457 gimple perm_stmt;
4458 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4459 unsigned int i, j, log_length = exact_log2 (length);
4460 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4461 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4462
4463 result_chain->quick_grow (length);
4464 memcpy (result_chain->address (), dr_chain.address (),
4465 length * sizeof (tree));
4466
4467 for (i = 0; i < nelt; ++i)
4468 sel[i] = i * 2;
4469 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4470 gcc_assert (perm_mask_even != NULL);
4471
4472 for (i = 0; i < nelt; ++i)
4473 sel[i] = i * 2 + 1;
4474 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4475 gcc_assert (perm_mask_odd != NULL);
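/* For example, assuming nelt == 8: the even selector built above is
   { 0, 2, 4, 6, 8, 10, 12, 14 } and the odd selector is
   { 1, 3, 5, 7, 9, 11, 13, 15 }, selecting the even- and odd-indexed
   elements of the concatenation of the two input vectors.  */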
4476
4477 for (i = 0; i < log_length; i++)
4478 {
4479 for (j = 0; j < length; j += 2)
4480 {
4481 first_vect = dr_chain[j];
4482 second_vect = dr_chain[j+1];
4483
4484 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4485 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4486 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4487 first_vect, second_vect,
4488 perm_mask_even);
4489 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4490 (*result_chain)[j/2] = data_ref;
4491
4492 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4493 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4494 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4495 first_vect, second_vect,
4496 perm_mask_odd);
4497 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4498 (*result_chain)[j/2+length/2] = data_ref;
4499 }
4500 memcpy (dr_chain.address (), result_chain->address (),
4501 length * sizeof (tree));
4502 }
4503 }
4504
4505
4506 /* Function vect_transform_grouped_load.
4507
4508 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4509 to perform their permutation and ascribe the result vectorized statements to
4510 the scalar statements.
4511 */
4512
4513 void
4514 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4515 gimple_stmt_iterator *gsi)
4516 {
4517 vec<tree> result_chain = vNULL;
4518
4519 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4520 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4521 vectors, that are ready for vector computation. */
4522 result_chain.create (size);
4523 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4524 vect_record_grouped_load_vectors (stmt, result_chain);
4525 result_chain.release ();
4526 }
4527
4528 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4529 generated as part of the vectorization of STMT. Assign the statement
4530 for each vector to the associated scalar statement. */
4531
4532 void
4533 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
4534 {
4535 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4536 gimple next_stmt, new_stmt;
4537 unsigned int i, gap_count;
4538 tree tmp_data_ref;
4539
4540 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4541 Since we scan the chain starting from its first node, their order
4542 corresponds to the order of data-refs in RESULT_CHAIN. */
4543 next_stmt = first_stmt;
4544 gap_count = 1;
4545 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
4546 {
4547 if (!next_stmt)
4548 break;
4549
4550 /* Skip the gaps. Loads created for the gaps will be removed by dead
4551 code elimination pass later. No need to check for the first stmt in
4552 the group, since it always exists.
4553 GROUP_GAP is the number of steps in elements from the previous
4554 access (if there is no gap GROUP_GAP is 1). We skip loads that
4555 correspond to the gaps. */
4556 if (next_stmt != first_stmt
4557 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
4558 {
4559 gap_count++;
4560 continue;
4561 }
4562
4563 while (next_stmt)
4564 {
4565 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4566 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4567 copies, and we put the new vector statement in the first available
4568 RELATED_STMT. */
4569 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4570 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4571 else
4572 {
4573 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4574 {
4575 gimple prev_stmt =
4576 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
4577 gimple rel_stmt =
4578 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
4579 while (rel_stmt)
4580 {
4581 prev_stmt = rel_stmt;
4582 rel_stmt =
4583 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
4584 }
4585
4586 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
4587 new_stmt;
4588 }
4589 }
4590
4591 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
4592 gap_count = 1;
4593 /* If NEXT_STMT accesses the same DR as the previous statement,
4594 put the same TMP_DATA_REF as its vectorized statement; otherwise
4595 get the next data-ref from RESULT_CHAIN. */
4596 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4597 break;
4598 }
4599 }
4600 }
4601
4602 /* Function vect_can_force_dr_alignment_p.
4603
4604 Returns whether the alignment of a DECL can be forced to be aligned
4605 on ALIGNMENT bit boundary. */
4606
4607 bool
4608 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
4609 {
4610 if (TREE_CODE (decl) != VAR_DECL)
4611 return false;
4612
4613 /* We cannot change alignment of common or external symbols as another
4614 translation unit may contain a definition with lower alignment.
4615 The rules of common symbol linking mean that the definition
4616 will override the common symbol. The same is true for constant
4617 pool entries which may be shared and are not properly merged
4618 by LTO. */
4619 if (DECL_EXTERNAL (decl)
4620 || DECL_COMMON (decl)
4621 || DECL_IN_CONSTANT_POOL (decl))
4622 return false;
4623
4624 if (TREE_ASM_WRITTEN (decl))
4625 return false;
4626
4627 /* Do not override the alignment as specified by the ABI when the used
4628 attribute is set. */
4629 if (DECL_PRESERVE_P (decl))
4630 return false;
4631
4632 /* Do not override explicit alignment set by the user when an explicit
4633 section name is also used. This is a common idiom used by many
4634 software projects. */
4635 if (DECL_SECTION_NAME (decl) != NULL_TREE
4636 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
4637 return false;
4638
4639 if (TREE_STATIC (decl))
4640 return (alignment <= MAX_OFILE_ALIGNMENT);
4641 else
4642 return (alignment <= MAX_STACK_ALIGNMENT);
4643 }
4644
4645
4646 /* Return whether the data reference DR is supported with respect to its
4647 alignment.
4648 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
4649 if it is aligned, i.e., check if it is possible to vectorize it with different
4650 alignment. */
4651
4652 enum dr_alignment_support
4653 vect_supportable_dr_alignment (struct data_reference *dr,
4654 bool check_aligned_accesses)
4655 {
4656 gimple stmt = DR_STMT (dr);
4657 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4658 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4659 enum machine_mode mode = TYPE_MODE (vectype);
4660 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4661 struct loop *vect_loop = NULL;
4662 bool nested_in_vect_loop = false;
4663
4664 if (aligned_access_p (dr) && !check_aligned_accesses)
4665 return dr_aligned;
4666
4667 if (loop_vinfo)
4668 {
4669 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
4670 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
4671 }
4672
4673 /* Possibly unaligned access. */
4674
4675 /* We can choose between using the implicit realignment scheme (generating
4676 a misaligned_move stmt) and the explicit realignment scheme (generating
4677 aligned loads with a REALIGN_LOAD). There are two variants to the
4678 explicit realignment scheme: optimized, and unoptimized.
4679 We can optimize the realignment only if the step between consecutive
4680 vector loads is equal to the vector size. Since the vector memory
4681 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
4682 is guaranteed that the misalignment amount remains the same throughout the
4683 execution of the vectorized loop. Therefore, we can create the
4684 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
4685 at the loop preheader.
4686
4687 However, in the case of outer-loop vectorization, when vectorizing a
4688 memory access in the inner-loop nested within the LOOP that is now being
4689 vectorized, while it is guaranteed that the misalignment of the
4690 vectorized memory access will remain the same in different outer-loop
4691 iterations, it is *not* guaranteed that it will remain the same throughout
4692 the execution of the inner-loop. This is because the inner-loop advances
4693 with the original scalar step (and not in steps of VS). If the inner-loop
4694 step happens to be a multiple of VS, then the misalignment remains fixed
4695 and we can use the optimized realignment scheme. For example:
4696
4697 for (i=0; i<N; i++)
4698 for (j=0; j<M; j++)
4699 s += a[i+j];
4700
4701 When vectorizing the i-loop in the above example, the step between
4702 consecutive vector loads is 1, and so the misalignment does not remain
4703 fixed across the execution of the inner-loop, and the realignment cannot
4704 be optimized (as illustrated in the following pseudo vectorized loop):
4705
4706 for (i=0; i<N; i+=4)
4707 for (j=0; j<M; j++){
4708 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
4709 // when j is {0,1,2,3,4,5,6,7,...} respectively.
4710 // (assuming that we start from an aligned address).
4711 }
4712
4713 We therefore have to use the unoptimized realignment scheme:
4714
4715 for (i=0; i<N; i+=4)
4716 for (j=k; j<M; j+=4)
4717 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
4718 // that the misalignment of the initial address is
4719 // 0).
4720
4721 The loop can then be vectorized as follows:
4722
4723 for (k=0; k<4; k++){
4724 rt = get_realignment_token (&vp[k]);
4725 for (i=0; i<N; i+=4){
4726 v1 = vp[i+k];
4727 for (j=k; j<M; j+=4){
4728 v2 = vp[i+j+VS-1];
4729 va = REALIGN_LOAD <v1,v2,rt>;
4730 vs += va;
4731 v1 = v2;
4732 }
4733 }
4734 } */
4735
4736 if (DR_IS_READ (dr))
4737 {
4738 bool is_packed = false;
4739 tree type = (TREE_TYPE (DR_REF (dr)));
4740
4741 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
4742 && (!targetm.vectorize.builtin_mask_for_load
4743 || targetm.vectorize.builtin_mask_for_load ()))
4744 {
4745 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4746 if ((nested_in_vect_loop
4747 && (TREE_INT_CST_LOW (DR_STEP (dr))
4748 != GET_MODE_SIZE (TYPE_MODE (vectype))))
4749 || !loop_vinfo)
4750 return dr_explicit_realign;
4751 else
4752 return dr_explicit_realign_optimized;
4753 }
4754 if (!known_alignment_for_access_p (dr))
4755 is_packed = not_size_aligned (DR_REF (dr));
4756
4757 if (targetm.vectorize.
4758 support_vector_misalignment (mode, type,
4759 DR_MISALIGNMENT (dr), is_packed))
4760 /* Can't software pipeline the loads, but can at least do them. */
4761 return dr_unaligned_supported;
4762 }
4763 else
4764 {
4765 bool is_packed = false;
4766 tree type = (TREE_TYPE (DR_REF (dr)));
4767
4768 if (!known_alignment_for_access_p (dr))
4769 is_packed = not_size_aligned (DR_REF (dr));
4770
4771 if (targetm.vectorize.
4772 support_vector_misalignment (mode, type,
4773 DR_MISALIGNMENT (dr), is_packed))
4774 return dr_unaligned_supported;
4775 }
4776
4777 /* Unsupported. */
4778 return dr_unaligned_unsupported;
4779 }