1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003-2013 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "ggc.h"
28 #include "tree.h"
29 #include "tm_p.h"
30 #include "target.h"
31 #include "basic-block.h"
32 #include "gimple-pretty-print.h"
33 #include "tree-flow.h"
34 #include "dumpfile.h"
35 #include "cfgloop.h"
36 #include "tree-chrec.h"
37 #include "tree-scalar-evolution.h"
38 #include "tree-vectorizer.h"
39 #include "diagnostic-core.h"
40
41 /* Need to include rtl.h, expr.h, etc. for optabs. */
42 #include "expr.h"
43 #include "optabs.h"
44
45 /* Return true if load- or store-lanes optab OPTAB is implemented for
46 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
47
48 static bool
49 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
50 tree vectype, unsigned HOST_WIDE_INT count)
51 {
52 enum machine_mode mode, array_mode;
53 bool limit_p;
54
55 mode = TYPE_MODE (vectype);
56 limit_p = !targetm.array_mode_supported_p (mode, count);
57 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
58 MODE_INT, limit_p);
59
60 if (array_mode == BLKmode)
61 {
62 if (dump_enabled_p ())
63 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
64 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]",
65 GET_MODE_NAME (mode), count);
66 return false;
67 }
68
69 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
70 {
71 if (dump_enabled_p ())
72 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
73 "cannot use %s<%s><%s>", name,
74 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
75 return false;
76 }
77
78 if (dump_enabled_p ())
79 dump_printf_loc (MSG_NOTE, vect_location,
80 "can use %s<%s><%s>", name, GET_MODE_NAME (array_mode),
81 GET_MODE_NAME (mode));
82
83 return true;
84 }
85
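/* Illustrative query (hypothetical numbers, no particular back end
   implied): for a VECTYPE of mode V4SImode (128 bits) and COUNT == 2 the
   code above looks for a 256-bit integer mode to hold the whole group of
   vectors.  If no such array mode exists, mode_for_size returns BLKmode
   and the lanes idiom is rejected; otherwise we additionally require that
   a vec_load_lanes / vec_store_lanes pattern is provided for the
   <array_mode, V4SImode> pair.  */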
86
87 /* Return the smallest scalar part of STMT.
88 This is used to determine the vectype of the stmt. We generally set the
89 vectype according to the type of the result (lhs). For stmts whose
90 result-type is different than the type of the arguments (e.g., demotion,
91 promotion), vectype will be reset appropriately (later). Note that we have
92 to visit the smallest datatype in this function, because that determines the
93 VF. If the smallest datatype in the loop is present only as the rhs of a
94 promotion operation, we'd miss it.
95 Such a case, where a variable of this datatype does not appear in the lhs
96 anywhere in the loop, can only occur if it's an invariant: e.g.:
97 'int_x = (int) short_inv', which we'd expect to have been optimized away by
98 invariant motion. However, we cannot rely on invariant motion to always
99 take invariants out of the loop, and so in the case of promotion we also
100 have to check the rhs.
101 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
102 types. */
103
104 tree
105 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
106 HOST_WIDE_INT *rhs_size_unit)
107 {
108 tree scalar_type = gimple_expr_type (stmt);
109 HOST_WIDE_INT lhs, rhs;
110
111 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
112
113 if (is_gimple_assign (stmt)
114 && (gimple_assign_cast_p (stmt)
115 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
116 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
117 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
118 {
119 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
120
121 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
122 if (rhs < lhs)
123 scalar_type = rhs_type;
124 }
125
126 *lhs_size_unit = lhs;
127 *rhs_size_unit = rhs;
128 return scalar_type;
129 }
130
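/* Worked example (element sizes assume a typical target with
   sizeof (short) == 2 and sizeof (int) == 4):

     short_x = ...;
     int_y = (int) short_x;    <-- gimple_assign_cast_p

   The lhs type is 4 bytes and the rhs type is 2 bytes, so the rhs type is
   returned and *LHS_SIZE_UNIT/*RHS_SIZE_UNIT are set to 4 and 2; the
   vectorization factor is then derived from the smaller, 2-byte element.  */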
131
132 /* Find the place of the data-ref in STMT in the interleaving chain that starts
133 from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */
134
135 int
136 vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
137 {
138 gimple next_stmt = first_stmt;
139 int result = 0;
140
141 if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
142 return -1;
143
144 while (next_stmt && next_stmt != stmt)
145 {
146 result++;
147 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
148 }
149
150 if (next_stmt)
151 return result;
152 else
153 return -1;
154 }
155
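/* For example, given an interleaved group of four stores

     a[4*i]   = w;    <-- GROUP_FIRST_ELEMENT
     a[4*i+1] = x;
     a[4*i+2] = y;
     a[4*i+3] = z;

   calling this function for the store to a[4*i+2] with the first store as
   FIRST_STMT returns 2, while a statement that belongs to a different
   chain (or to no chain) returns -1.  */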
156
157 /* Check if the data references pointed to by DR_I and DR_J are the same or
158 belong to the same interleaving group.  Return FALSE if the drs are
159 different, otherwise return TRUE.  */
160
161 static bool
162 vect_same_range_drs (data_reference_p dr_i, data_reference_p dr_j)
163 {
164 gimple stmt_i = DR_STMT (dr_i);
165 gimple stmt_j = DR_STMT (dr_j);
166
167 if (operand_equal_p (DR_REF (dr_i), DR_REF (dr_j), 0)
168 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
169 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j))
170 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
171 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)))))
172 return true;
173 else
174 return false;
175 }
176
177 /* If address ranges represented by DDR_I and DDR_J are equal,
178 return TRUE, otherwise return FALSE. */
179
180 static bool
181 vect_vfa_range_equal (ddr_p ddr_i, ddr_p ddr_j)
182 {
183 if ((vect_same_range_drs (DDR_A (ddr_i), DDR_A (ddr_j))
184 && vect_same_range_drs (DDR_B (ddr_i), DDR_B (ddr_j)))
185 || (vect_same_range_drs (DDR_A (ddr_i), DDR_B (ddr_j))
186 && vect_same_range_drs (DDR_B (ddr_i), DDR_A (ddr_j))))
187 return true;
188 else
189 return false;
190 }
191
192 /* Insert DDR into LOOP_VINFO list of ddrs that may alias and need to be
193 tested at run-time. Return TRUE if DDR was successfully inserted.
194 Return FALSE if versioning is not supported.  */
195
196 static bool
197 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
198 {
199 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
200
201 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
202 return false;
203
204 if (dump_enabled_p ())
205 {
206 dump_printf_loc (MSG_NOTE, vect_location,
207 "mark for run-time aliasing test between ");
208 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
209 dump_printf (MSG_NOTE, " and ");
210 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
211 }
212
213 if (optimize_loop_nest_for_size_p (loop))
214 {
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
217 "versioning not supported when optimizing for size.");
218 return false;
219 }
220
221 /* FORNOW: We don't support versioning with outer-loop vectorization. */
222 if (loop->inner)
223 {
224 if (dump_enabled_p ())
225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
226 "versioning not yet supported for outer-loops.");
227 return false;
228 }
229
230 /* FORNOW: We don't support creating runtime alias tests for non-constant
231 step. */
232 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
233 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
234 {
235 if (dump_enabled_p ())
236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
237 "versioning not yet supported for non-constant "
238 "step");
239 return false;
240 }
241
242 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).safe_push (ddr);
243 return true;
244 }
245
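/* Conceptually, each DDR recorded here results in one run-time overlap
   test in the versioning condition that is emitted later, roughly of the
   form

     if (addr_a + segment_len_a <= addr_b
         || addr_b + segment_len_b <= addr_a)
       ... vectorized loop ...
     else
       ... scalar loop ...

   (a sketch only; the actual condition is built by the loop-versioning
   code from the DR base addresses and segment lengths).  */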
246
247 /* Function vect_analyze_data_ref_dependence.
248
249 Return TRUE if there is (or might be) a dependence between the memory
250 references DRA and DRB.  When versioning for alias can check the
251 dependence at run-time, return FALSE instead.  Adjust *MAX_VF according
252 to the data dependence.  */
253
254 static bool
255 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
256 loop_vec_info loop_vinfo, int *max_vf)
257 {
258 unsigned int i;
259 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
260 struct data_reference *dra = DDR_A (ddr);
261 struct data_reference *drb = DDR_B (ddr);
262 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
263 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
264 lambda_vector dist_v;
265 unsigned int loop_depth;
266
267 /* In loop analysis all data references should be vectorizable. */
268 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
269 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
270 gcc_unreachable ();
271
272 /* Independent data accesses. */
273 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
274 return false;
275
276 if (dra == drb
277 || (DR_IS_READ (dra) && DR_IS_READ (drb)))
278 return false;
279
280 /* Unknown data dependence. */
281 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
282 {
283 if (dump_enabled_p ())
284 {
285 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
286 "versioning for alias required: "
287 "can't determine dependence between ");
288 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
289 DR_REF (dra));
290 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
291 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
292 DR_REF (drb));
293 }
294
295 /* Add to list of ddrs that need to be tested at run-time. */
296 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
297 }
298
299 /* Known data dependence. */
300 if (DDR_NUM_DIST_VECTS (ddr) == 0)
301 {
302 if (dump_enabled_p ())
303 {
304 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
305 "versioning for alias required: "
306 "bad dist vector for ");
307 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
308 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
309 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
310 }
311 /* Add to list of ddrs that need to be tested at run-time. */
312 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
313 }
314
315 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
316 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
317 {
318 int dist = dist_v[loop_depth];
319
320 if (dump_enabled_p ())
321 dump_printf_loc (MSG_NOTE, vect_location,
322 "dependence distance = %d.", dist);
323
324 if (dist == 0)
325 {
326 if (dump_enabled_p ())
327 {
328 dump_printf_loc (MSG_NOTE, vect_location,
329 "dependence distance == 0 between ");
330 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
331 dump_printf (MSG_NOTE, " and ");
332 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
333 }
334
335 /* For interleaving, mark that there is a read-write dependency if
336 necessary.  We checked earlier that one of the data-refs is a store.  */
337 if (DR_IS_READ (dra))
338 GROUP_READ_WRITE_DEPENDENCE (stmtinfo_a) = true;
339 else
340 {
341 if (DR_IS_READ (drb))
342 GROUP_READ_WRITE_DEPENDENCE (stmtinfo_b) = true;
343 }
344
345 continue;
346 }
347
348 if (dist > 0 && DDR_REVERSED_P (ddr))
349 {
350 /* If DDR_REVERSED_P is set, the order of the data-refs in DDR was
351 reversed (to make distance vector positive), and the actual
352 distance is negative. */
353 if (dump_enabled_p ())
354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
355 "dependence distance negative.");
356 continue;
357 }
358
359 if (abs (dist) >= 2
360 && abs (dist) < *max_vf)
361 {
362 /* The dependence distance requires reduction of the maximal
363 vectorization factor. */
364 *max_vf = abs (dist);
365 if (dump_enabled_p ())
366 dump_printf_loc (MSG_NOTE, vect_location,
367 "adjusting maximal vectorization factor to %i",
368 *max_vf);
369 }
370
371 if (abs (dist) >= *max_vf)
372 {
373 /* In this case the dependence distance does not create a dependence,
374 as far as vectorization is concerned.  */
375 if (dump_enabled_p ())
376 dump_printf_loc (MSG_NOTE, vect_location,
377 "dependence distance >= VF.");
378 continue;
379 }
380
381 if (dump_enabled_p ())
382 {
383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
384 "not vectorized, possible dependence "
385 "between data-refs ");
386 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
387 dump_printf (MSG_NOTE, " and ");
388 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
389 }
390
391 return true;
392 }
393
394 return false;
395 }
396
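/* Worked example of the distance handling above (hypothetical loop):

     for (i = 0; i < n; i++)
       a[i + 3] = a[i] + 1;

   The dependence distance is 3.  With an incoming *MAX_VF of 8 we have
   2 <= 3 < 8, so *MAX_VF is lowered to 3; the distance is then no smaller
   than the new *MAX_VF, so the dependence does not block vectorization and
   the function returns false.  A distance of 1, in contrast, would be
   reported as a possible dependence and the function would return true.  */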
397 /* Function vect_analyze_data_ref_dependences.
398
399 Examine all the data references in the loop, and make sure there do not
400 exist any data dependences between them. Set *MAX_VF according to
401 the maximum vectorization factor the data dependences allow. */
402
403 bool
404 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo, int *max_vf)
405 {
406 unsigned int i;
407 struct data_dependence_relation *ddr;
408
409 if (dump_enabled_p ())
410 dump_printf_loc (MSG_NOTE, vect_location,
411 "=== vect_analyze_data_ref_dependences ===");
412
413 if (!compute_all_dependences (LOOP_VINFO_DATAREFS (loop_vinfo),
414 &LOOP_VINFO_DDRS (loop_vinfo),
415 LOOP_VINFO_LOOP_NEST (loop_vinfo), true))
416 return false;
417
418 FOR_EACH_VEC_ELT (LOOP_VINFO_DDRS (loop_vinfo), i, ddr)
419 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
420 return false;
421
422 return true;
423 }
424
425
426 /* Function vect_slp_analyze_data_ref_dependence.
427
428 Return TRUE if there is (or might be) a dependence between the memory
429 references DRA and DRB of DDR that would prevent basic-block (SLP)
430 vectorization; return FALSE if the accesses are independent or the
431 dependence does not prevent vectorization.  */
432
433 static bool
434 vect_slp_analyze_data_ref_dependence (struct data_dependence_relation *ddr)
435 {
436 struct data_reference *dra = DDR_A (ddr);
437 struct data_reference *drb = DDR_B (ddr);
438
439 /* We need to check dependences of statements marked as unvectorizable
440 as well, they still can prohibit vectorization. */
441
442 /* Independent data accesses. */
443 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
444 return false;
445
446 if (dra == drb)
447 return false;
448
449 /* Read-read is OK. */
450 if (DR_IS_READ (dra) && DR_IS_READ (drb))
451 return false;
452
453 /* Unknown data dependence. */
454 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
455 {
456 gimple earlier_stmt;
457
458 if (dump_enabled_p ())
459 {
460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
461 "can't determine dependence between ");
462 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
463 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
464 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
465 }
466
467 /* We do not vectorize basic blocks with write-write dependencies. */
468 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
469 return true;
470
471 /* Check that it's not a load-after-store dependence. */
472 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
473 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
474 return true;
475
476 return false;
477 }
478
479 if (dump_enabled_p ())
480 {
481 dump_printf_loc (MSG_NOTE, vect_location,
482 "determined dependence between ");
483 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
484 dump_printf (MSG_NOTE, " and ");
485 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
486 }
487
488 /* Do not vectorize basic blocks with write-write dependences. */
489 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
490 return true;
491
492 /* Check dependence between DRA and DRB for basic block vectorization.
493 If the accesses share the same bases and offsets, we can compare their initial
494 constant offsets to decide whether they differ or not. In case of a read-
495 write dependence we check that the load is before the store to ensure that
496 vectorization will not change the order of the accesses. */
497
498 HOST_WIDE_INT type_size_a, type_size_b, init_a, init_b;
499 gimple earlier_stmt;
500
501 /* Check that the data-refs have same bases and offsets. If not, we can't
502 determine if they are dependent. */
503 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
504 || !dr_equal_offsets_p (dra, drb))
505 return true;
506
507 /* Check the types. */
508 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
509 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
510
511 if (type_size_a != type_size_b
512 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
513 TREE_TYPE (DR_REF (drb))))
514 return true;
515
516 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
517 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
518
519 /* Two different locations - no dependence. */
520 if (init_a != init_b)
521 return false;
522
523 /* We have a read-write dependence. Check that the load is before the store.
524 When we vectorize basic blocks, a vector load can only appear before the
525 corresponding scalar load, and a vector store can only appear after its
526 corresponding scalar store.  So the order of the accesses is preserved if
527 the load is before the store.  */
528 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
529 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
530 return false;
531
532 return true;
533 }
534
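/* Example of the basic-block checks above, assuming the two references
   have the same base, equal offsets and compatible 4-byte types:

     a[0] = x;       <-- store, DR_INIT == 0
     y    = a[0];    <-- load,  DR_INIT == 0

   The initial offsets are equal and the earlier statement is the store,
   so this is a load-after-store dependence and the function returns true.
   If the load preceded the store, or if the references had different
   DR_INITs (e.g. a[0] and a[1]), the function would return false.  */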
535
536 /* Function vect_slp_analyze_data_ref_dependences.
537
538 Examine all the data references in the basic-block, and make sure there
539 do not exist any data dependences between them that would prevent
540 vectorization.  */
541
542 bool
543 vect_slp_analyze_data_ref_dependences (bb_vec_info bb_vinfo)
544 {
545 struct data_dependence_relation *ddr;
546 unsigned int i;
547
548 if (dump_enabled_p ())
549 dump_printf_loc (MSG_NOTE, vect_location,
550 "=== vect_slp_analyze_data_ref_dependences ===");
551
552 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
553 &BB_VINFO_DDRS (bb_vinfo),
554 vNULL, true))
555 return false;
556
557 FOR_EACH_VEC_ELT (BB_VINFO_DDRS (bb_vinfo), i, ddr)
558 if (vect_slp_analyze_data_ref_dependence (ddr))
559 return false;
560
561 return true;
562 }
563
564
565 /* Function vect_compute_data_ref_alignment
566
567 Compute the misalignment of the data reference DR.
568
569 Output:
570 1. If during the misalignment computation it is found that the data reference
571 cannot be vectorized then false is returned.
572 2. DR_MISALIGNMENT (DR) is defined.
573
574 FOR NOW: No analysis is actually performed. Misalignment is calculated
575 only for trivial cases. TODO. */
576
577 static bool
578 vect_compute_data_ref_alignment (struct data_reference *dr)
579 {
580 gimple stmt = DR_STMT (dr);
581 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
582 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
583 struct loop *loop = NULL;
584 tree ref = DR_REF (dr);
585 tree vectype;
586 tree base, base_addr;
587 bool base_aligned;
588 tree misalign;
589 tree aligned_to, alignment;
590
591 if (dump_enabled_p ())
592 dump_printf_loc (MSG_NOTE, vect_location,
593 "vect_compute_data_ref_alignment:");
594
595 if (loop_vinfo)
596 loop = LOOP_VINFO_LOOP (loop_vinfo);
597
598 /* Initialize misalignment to unknown. */
599 SET_DR_MISALIGNMENT (dr, -1);
600
601 /* Strided loads perform only component accesses, misalignment information
602 is irrelevant for them. */
603 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
604 return true;
605
606 misalign = DR_INIT (dr);
607 aligned_to = DR_ALIGNED_TO (dr);
608 base_addr = DR_BASE_ADDRESS (dr);
609 vectype = STMT_VINFO_VECTYPE (stmt_info);
610
611 /* In case the dataref is in an inner-loop of the loop that is being
612 vectorized (LOOP), we use the base and misalignment information
613 relative to the outer-loop (LOOP). This is ok only if the misalignment
614 stays the same throughout the execution of the inner-loop, which is why
615 we have to check that the stride of the dataref in the inner-loop is a
616 multiple of the vector size.  */
617 if (loop && nested_in_vect_loop_p (loop, stmt))
618 {
619 tree step = DR_STEP (dr);
620 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
621
622 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
623 {
624 if (dump_enabled_p ())
625 dump_printf_loc (MSG_NOTE, vect_location,
626 "inner step divides the vector-size.");
627 misalign = STMT_VINFO_DR_INIT (stmt_info);
628 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
629 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
630 }
631 else
632 {
633 if (dump_enabled_p ())
634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
635 "inner step doesn't divide the vector-size.");
636 misalign = NULL_TREE;
637 }
638 }
639
640 /* Similarly, if we're doing basic-block vectorization, we can only use
641 base and misalignment information relative to an innermost loop if the
642 misalignment stays the same throughout the execution of the loop.
643 As above, this is the case if the stride of the dataref is a multiple
644 of the vector size.  */
645 if (!loop)
646 {
647 tree step = DR_STEP (dr);
648 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
649
650 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
651 {
652 if (dump_enabled_p ())
653 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
654 "SLP: step doesn't divide the vector-size.");
655 misalign = NULL_TREE;
656 }
657 }
658
659 base = build_fold_indirect_ref (base_addr);
660 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
661
662 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
663 || !misalign)
664 {
665 if (dump_enabled_p ())
666 {
667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
668 "Unknown alignment for access: ");
669 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
670 }
671 return true;
672 }
673
674 if ((DECL_P (base)
675 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
676 alignment) >= 0)
677 || (TREE_CODE (base_addr) == SSA_NAME
678 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
679 TREE_TYPE (base_addr)))),
680 alignment) >= 0)
681 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
682 base_aligned = true;
683 else
684 base_aligned = false;
685
686 if (!base_aligned)
687 {
688 /* Do not change the alignment of global variables here if
689 flag_section_anchors is enabled as we already generated
690 RTL for other functions. Most global variables should
691 have been aligned during the IPA increase_alignment pass. */
692 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
693 || (TREE_STATIC (base) && flag_section_anchors))
694 {
695 if (dump_enabled_p ())
696 {
697 dump_printf_loc (MSG_NOTE, vect_location,
698 "can't force alignment of ref: ");
699 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
700 }
701 return true;
702 }
703
704 /* Force the alignment of the decl.
705 NOTE: This is the only change to the code we make during
706 the analysis phase, before deciding to vectorize the loop. */
707 if (dump_enabled_p ())
708 {
709 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
710 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
711 }
712
713 DECL_ALIGN (base) = TYPE_ALIGN (vectype);
714 DECL_USER_ALIGN (base) = 1;
715 }
716
717 /* At this point we assume that the base is aligned. */
718 gcc_assert (base_aligned
719 || (TREE_CODE (base) == VAR_DECL
720 && DECL_ALIGN (base) >= TYPE_ALIGN (vectype)));
721
722 /* If this is a backward running DR then the first access in the larger
723 vectype actually is N-1 elements before the address in the DR.
724 Adjust misalign accordingly. */
725 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
726 {
727 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
728 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
729 otherwise we wouldn't be here. */
730 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
731 /* PLUS because DR_STEP was negative. */
732 misalign = size_binop (PLUS_EXPR, misalign, offset);
733 }
734
735 /* Modulo alignment. */
736 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
737
738 if (!host_integerp (misalign, 1))
739 {
740 /* Negative or overflowed misalignment value. */
741 if (dump_enabled_p ())
742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
743 "unexpected misalign value");
744 return false;
745 }
746
747 SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));
748
749 if (dump_enabled_p ())
750 {
751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
752 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
753 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
754 }
755
756 return true;
757 }
758
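/* Worked example (hypothetical: 16-byte V4SI vectors and 'int a[N]' with
   'a' known to be 16-byte aligned):

     ... = a[i + 1];

   Here DR_BASE_ADDRESS is &a, DR_INIT is 4 and the required alignment is
   16 bytes, so the recorded misalignment is 4 % 16 == 4.  If instead the
   base were not known to be sufficiently aligned, DR_MISALIGNMENT would
   stay at -1 (unknown) unless the alignment of the declaration could be
   forced as above.  */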
759
760 /* Function vect_compute_data_refs_alignment
761
762 Compute the misalignment of data references in the loop.
763 Return FALSE if a data reference is found that cannot be vectorized. */
764
765 static bool
766 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
767 bb_vec_info bb_vinfo)
768 {
769 vec<data_reference_p> datarefs;
770 struct data_reference *dr;
771 unsigned int i;
772
773 if (loop_vinfo)
774 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
775 else
776 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
777
778 FOR_EACH_VEC_ELT (datarefs, i, dr)
779 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
780 && !vect_compute_data_ref_alignment (dr))
781 {
782 if (bb_vinfo)
783 {
784 /* Mark unsupported statement as unvectorizable. */
785 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
786 continue;
787 }
788 else
789 return false;
790 }
791
792 return true;
793 }
794
795
796 /* Function vect_update_misalignment_for_peel
797
798 DR - the data reference whose misalignment is to be adjusted.
799 DR_PEEL - the data reference whose misalignment is being made
800 zero in the vector loop by the peel.
801 NPEEL - the number of iterations in the peel loop if the misalignment
802 of DR_PEEL is known at compile time. */
803
804 static void
805 vect_update_misalignment_for_peel (struct data_reference *dr,
806 struct data_reference *dr_peel, int npeel)
807 {
808 unsigned int i;
809 vec<dr_p> same_align_drs;
810 struct data_reference *current_dr;
811 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
812 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
813 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
814 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
815
816 /* For interleaved data accesses the step in the loop must be multiplied by
817 the size of the interleaving group. */
818 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
819 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
820 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
821 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
822
823 /* It can be assumed that the data refs with the same alignment as dr_peel
824 are aligned in the vector loop. */
825 same_align_drs
826 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
827 FOR_EACH_VEC_ELT (same_align_drs, i, current_dr)
828 {
829 if (current_dr != dr)
830 continue;
831 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
832 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
833 SET_DR_MISALIGNMENT (dr, 0);
834 return;
835 }
836
837 if (known_alignment_for_access_p (dr)
838 && known_alignment_for_access_p (dr_peel))
839 {
840 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
841 int misal = DR_MISALIGNMENT (dr);
842 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
843 misal += negative ? -npeel * dr_size : npeel * dr_size;
844 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
845 SET_DR_MISALIGNMENT (dr, misal);
846 return;
847 }
848
849 if (dump_enabled_p ())
850 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.");
851 SET_DR_MISALIGNMENT (dr, -1);
852 }
853
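/* Numeric example of the known-alignment case (hypothetical values:
   4-byte elements, 16-byte vector alignment, positive step):

     DR_MISALIGNMENT (dr) = 8,  NPEEL = 2

   misal = 8 + 2 * 4 = 16, and masking with (16 - 1) gives 0, i.e. DR
   becomes aligned by the peel.  With NPEEL = 1 the result would be 12,
   i.e. DR would remain misaligned.  */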
854
855 /* Function vect_verify_datarefs_alignment
856
857 Return TRUE if all data references in the loop can be
858 handled with respect to alignment. */
859
860 bool
861 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
862 {
863 vec<data_reference_p> datarefs;
864 struct data_reference *dr;
865 enum dr_alignment_support supportable_dr_alignment;
866 unsigned int i;
867
868 if (loop_vinfo)
869 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
870 else
871 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
872
873 FOR_EACH_VEC_ELT (datarefs, i, dr)
874 {
875 gimple stmt = DR_STMT (dr);
876 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
877
878 if (!STMT_VINFO_RELEVANT_P (stmt_info))
879 continue;
880
881 /* For interleaving, only the alignment of the first access matters.
882 Skip statements marked as not vectorizable. */
883 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
884 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
885 || !STMT_VINFO_VECTORIZABLE (stmt_info))
886 continue;
887
888 /* Strided loads perform only component accesses, alignment is
889 irrelevant for them. */
890 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
891 continue;
892
893 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
894 if (!supportable_dr_alignment)
895 {
896 if (dump_enabled_p ())
897 {
898 if (DR_IS_READ (dr))
899 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
900 "not vectorized: unsupported unaligned load.");
901 else
902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
903 "not vectorized: unsupported unaligned "
904 "store.");
905
906 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
907 DR_REF (dr));
908 }
909 return false;
910 }
911 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
912 dump_printf_loc (MSG_NOTE, vect_location,
913 "Vectorizing an unaligned access.");
914 }
915 return true;
916 }
917
918 /* Given a memory reference EXP, return whether its alignment is less
919 than its size. */
920
921 static bool
922 not_size_aligned (tree exp)
923 {
924 if (!host_integerp (TYPE_SIZE (TREE_TYPE (exp)), 1))
925 return true;
926
927 return (TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (exp)))
928 > get_object_alignment (exp));
929 }
930
931 /* Function vector_alignment_reachable_p
932
933 Return true if vector alignment for DR is reachable by peeling
934 a few loop iterations. Return false otherwise. */
935
936 static bool
937 vector_alignment_reachable_p (struct data_reference *dr)
938 {
939 gimple stmt = DR_STMT (dr);
940 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
941 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
942
943 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
944 {
945 /* For interleaved accesses we peel only if the number of iterations in
946 the prolog loop ({VF - misalignment}) is a multiple of the
947 number of interleaved accesses.  */
948 int elem_size, mis_in_elements;
949 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
950
951 /* FORNOW: handle only known alignment. */
952 if (!known_alignment_for_access_p (dr))
953 return false;
954
955 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
956 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
957
958 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
959 return false;
960 }
961
962 /* If the misalignment is known at compile time then allow peeling
963 only if natural alignment is reachable through peeling. */
964 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
965 {
966 HOST_WIDE_INT elmsize =
967 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
968 if (dump_enabled_p ())
969 {
970 dump_printf_loc (MSG_NOTE, vect_location,
971 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
972 dump_printf (MSG_NOTE,
973 ". misalignment = %d. ", DR_MISALIGNMENT (dr));
974 }
975 if (DR_MISALIGNMENT (dr) % elmsize)
976 {
977 if (dump_enabled_p ())
978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
979 "data size does not divide the misalignment.\n");
980 return false;
981 }
982 }
983
984 if (!known_alignment_for_access_p (dr))
985 {
986 tree type = TREE_TYPE (DR_REF (dr));
987 bool is_packed = not_size_aligned (DR_REF (dr));
988 if (dump_enabled_p ())
989 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
990 "Unknown misalignment, is_packed = %d",is_packed);
991 if (targetm.vectorize.vector_alignment_reachable (type, is_packed))
992 return true;
993 else
994 return false;
995 }
996
997 return true;
998 }
999
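/* For instance (hypothetical values): with 4-byte elements and a known
   misalignment of 6 bytes, 6 % 4 != 0, so no number of peeled scalar
   iterations can make the access naturally aligned and the function
   returns false.  A known misalignment of 8 bytes with 16-byte vectors,
   in contrast, can be removed by peeling two scalar iterations.  */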
1000
1001 /* Calculate the cost of the memory access represented by DR. */
1002
1003 static void
1004 vect_get_data_access_cost (struct data_reference *dr,
1005 unsigned int *inside_cost,
1006 unsigned int *outside_cost,
1007 stmt_vector_for_cost *body_cost_vec)
1008 {
1009 gimple stmt = DR_STMT (dr);
1010 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1011 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1012 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1013 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1014 int ncopies = vf / nunits;
1015
1016 if (DR_IS_READ (dr))
1017 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1018 NULL, body_cost_vec, false);
1019 else
1020 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1021
1022 if (dump_enabled_p ())
1023 dump_printf_loc (MSG_NOTE, vect_location,
1024 "vect_get_data_access_cost: inside_cost = %d, "
1025 "outside_cost = %d.", *inside_cost, *outside_cost);
1026 }
1027
1028
1029 static hashval_t
1030 vect_peeling_hash (const void *elem)
1031 {
1032 const struct _vect_peel_info *peel_info;
1033
1034 peel_info = (const struct _vect_peel_info *) elem;
1035 return (hashval_t) peel_info->npeel;
1036 }
1037
1038
1039 static int
1040 vect_peeling_hash_eq (const void *elem1, const void *elem2)
1041 {
1042 const struct _vect_peel_info *a, *b;
1043
1044 a = (const struct _vect_peel_info *) elem1;
1045 b = (const struct _vect_peel_info *) elem2;
1046 return (a->npeel == b->npeel);
1047 }
1048
1049
1050 /* Insert DR into peeling hash table with NPEEL as key. */
1051
1052 static void
1053 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1054 int npeel)
1055 {
1056 struct _vect_peel_info elem, *slot;
1057 void **new_slot;
1058 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1059
1060 elem.npeel = npeel;
1061 slot = (vect_peel_info) htab_find (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1062 &elem);
1063 if (slot)
1064 slot->count++;
1065 else
1066 {
1067 slot = XNEW (struct _vect_peel_info);
1068 slot->npeel = npeel;
1069 slot->dr = dr;
1070 slot->count = 1;
1071 new_slot = htab_find_slot (LOOP_VINFO_PEELING_HTAB (loop_vinfo), slot,
1072 INSERT);
1073 *new_slot = slot;
1074 }
1075
1076 if (!supportable_dr_alignment && !flag_vect_cost_model)
1077 slot->count += VECT_MAX_COST;
1078 }
1079
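/* The table is keyed by the peeling amount.  For example, if two
   data-refs would become aligned by peeling 3 scalar iterations and one
   by peeling 5, the table ends up holding {npeel = 3, count = 2} and
   {npeel = 5, count = 1}.  Misaligned refs whose alignment is not
   supportable get their count boosted by VECT_MAX_COST, so that aligning
   them is strongly preferred when the cost model is disabled.  */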
1080
1081 /* Traverse peeling hash table to find peeling option that aligns maximum
1082 number of data accesses. */
1083
1084 static int
1085 vect_peeling_hash_get_most_frequent (void **slot, void *data)
1086 {
1087 vect_peel_info elem = (vect_peel_info) *slot;
1088 vect_peel_extended_info max = (vect_peel_extended_info) data;
1089
1090 if (elem->count > max->peel_info.count
1091 || (elem->count == max->peel_info.count
1092 && max->peel_info.npeel > elem->npeel))
1093 {
1094 max->peel_info.npeel = elem->npeel;
1095 max->peel_info.count = elem->count;
1096 max->peel_info.dr = elem->dr;
1097 }
1098
1099 return 1;
1100 }
1101
1102
1103 /* Traverse peeling hash table and calculate cost for each peeling option.
1104 Find the one with the lowest cost. */
1105
1106 static int
1107 vect_peeling_hash_get_lowest_cost (void **slot, void *data)
1108 {
1109 vect_peel_info elem = (vect_peel_info) *slot;
1110 vect_peel_extended_info min = (vect_peel_extended_info) data;
1111 int save_misalignment, dummy;
1112 unsigned int inside_cost = 0, outside_cost = 0, i;
1113 gimple stmt = DR_STMT (elem->dr);
1114 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1115 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1116 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1117 struct data_reference *dr;
1118 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1119 int single_iter_cost;
1120
1121 prologue_cost_vec.create (2);
1122 body_cost_vec.create (2);
1123 epilogue_cost_vec.create (2);
1124
1125 FOR_EACH_VEC_ELT (datarefs, i, dr)
1126 {
1127 stmt = DR_STMT (dr);
1128 stmt_info = vinfo_for_stmt (stmt);
1129 /* For interleaving, only the alignment of the first access
1130 matters. */
1131 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1132 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1133 continue;
1134
1135 save_misalignment = DR_MISALIGNMENT (dr);
1136 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1137 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1138 &body_cost_vec);
1139 SET_DR_MISALIGNMENT (dr, save_misalignment);
1140 }
1141
1142 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1143 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1144 &dummy, single_iter_cost,
1145 &prologue_cost_vec,
1146 &epilogue_cost_vec);
1147
1148 /* Prologue and epilogue costs are added to the target model later.
1149 These costs depend only on the scalar iteration cost, the
1150 number of peeling iterations finally chosen, and the number of
1151 misaligned statements. So discard the information found here. */
1152 prologue_cost_vec.release ();
1153 epilogue_cost_vec.release ();
1154
1155 if (inside_cost < min->inside_cost
1156 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1157 {
1158 min->inside_cost = inside_cost;
1159 min->outside_cost = outside_cost;
1160 min->body_cost_vec.release ();
1161 min->body_cost_vec = body_cost_vec;
1162 min->peel_info.dr = elem->dr;
1163 min->peel_info.npeel = elem->npeel;
1164 }
1165 else
1166 body_cost_vec.release ();
1167
1168 return 1;
1169 }
1170
1171
1172 /* Choose best peeling option by traversing peeling hash table and either
1173 choosing an option with the lowest cost (if cost model is enabled) or the
1174 option that aligns as many accesses as possible. */
1175
1176 static struct data_reference *
1177 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1178 unsigned int *npeel,
1179 stmt_vector_for_cost *body_cost_vec)
1180 {
1181 struct _vect_peel_extended_info res;
1182
1183 res.peel_info.dr = NULL;
1184 res.body_cost_vec = stmt_vector_for_cost();
1185
1186 if (flag_vect_cost_model)
1187 {
1188 res.inside_cost = INT_MAX;
1189 res.outside_cost = INT_MAX;
1190 htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1191 vect_peeling_hash_get_lowest_cost, &res);
1192 }
1193 else
1194 {
1195 res.peel_info.count = 0;
1196 htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1197 vect_peeling_hash_get_most_frequent, &res);
1198 }
1199
1200 *npeel = res.peel_info.npeel;
1201 *body_cost_vec = res.body_cost_vec;
1202 return res.peel_info.dr;
1203 }
1204
1205
1206 /* Function vect_enhance_data_refs_alignment
1207
1208 This pass will use loop versioning and loop peeling in order to enhance
1209 the alignment of data references in the loop.
1210
1211 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1212 original loop is to be vectorized. Any other loops that are created by
1213 the transformations performed in this pass are not supposed to be
1214 vectorized. This restriction will be relaxed.
1215
1216 This pass will require a cost model to guide it in deciding whether to apply
1217 peeling or versioning or a combination of the two.  For example, the scheme
1218 that Intel uses when given a loop with several memory accesses is as follows:
1219 choose one memory access ('p') whose alignment you want to force by doing
1220 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1221 other accesses are not necessarily aligned, or (2) use loop versioning to
1222 generate one loop in which all accesses are aligned, and another loop in
1223 which only 'p' is necessarily aligned.
1224
1225 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1226 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Ximmin Tian, International
1227 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1228
1229 Devising a cost model is the most critical aspect of this work. It will
1230 guide us on which access to peel for, whether to use loop versioning, how
1231 many versions to create, etc. The cost model will probably consist of
1232 generic considerations as well as target specific considerations (on
1233 powerpc for example, misaligned stores are more painful than misaligned
1234 loads).
1235
1236 Here are the general steps involved in alignment enhancements:
1237
1238 -- original loop, before alignment analysis:
1239 for (i=0; i<N; i++){
1240 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1241 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1242 }
1243
1244 -- After vect_compute_data_refs_alignment:
1245 for (i=0; i<N; i++){
1246 x = q[i]; # DR_MISALIGNMENT(q) = 3
1247 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1248 }
1249
1250 -- Possibility 1: we do loop versioning:
1251 if (p is aligned) {
1252 for (i=0; i<N; i++){ # loop 1A
1253 x = q[i]; # DR_MISALIGNMENT(q) = 3
1254 p[i] = y; # DR_MISALIGNMENT(p) = 0
1255 }
1256 }
1257 else {
1258 for (i=0; i<N; i++){ # loop 1B
1259 x = q[i]; # DR_MISALIGNMENT(q) = 3
1260 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1261 }
1262 }
1263
1264 -- Possibility 2: we do loop peeling:
1265 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1266 x = q[i];
1267 p[i] = y;
1268 }
1269 for (i = 3; i < N; i++){ # loop 2A
1270 x = q[i]; # DR_MISALIGNMENT(q) = 0
1271 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1272 }
1273
1274 -- Possibility 3: combination of loop peeling and versioning:
1275 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1276 x = q[i];
1277 p[i] = y;
1278 }
1279 if (p is aligned) {
1280 for (i = 3; i<N; i++){ # loop 3A
1281 x = q[i]; # DR_MISALIGNMENT(q) = 0
1282 p[i] = y; # DR_MISALIGNMENT(p) = 0
1283 }
1284 }
1285 else {
1286 for (i = 3; i<N; i++){ # loop 3B
1287 x = q[i]; # DR_MISALIGNMENT(q) = 0
1288 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1289 }
1290 }
1291
1292 These loops are later passed to loop_transform to be vectorized. The
1293 vectorizer will use the alignment information to guide the transformation
1294 (whether to generate regular loads/stores, or with special handling for
1295 misalignment). */
1296
1297 bool
1298 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1299 {
1300 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1301 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1302 enum dr_alignment_support supportable_dr_alignment;
1303 struct data_reference *dr0 = NULL, *first_store = NULL;
1304 struct data_reference *dr;
1305 unsigned int i, j;
1306 bool do_peeling = false;
1307 bool do_versioning = false;
1308 bool stat;
1309 gimple stmt;
1310 stmt_vec_info stmt_info;
1311 int vect_versioning_for_alias_required;
1312 unsigned int npeel = 0;
1313 bool all_misalignments_unknown = true;
1314 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1315 unsigned possible_npeel_number = 1;
1316 tree vectype;
1317 unsigned int nelements, mis, same_align_drs_max = 0;
1318 stmt_vector_for_cost body_cost_vec = stmt_vector_for_cost();
1319
1320 if (dump_enabled_p ())
1321 dump_printf_loc (MSG_NOTE, vect_location,
1322 "=== vect_enhance_data_refs_alignment ===");
1323
1324 /* While cost model enhancements are expected in the future, the high level
1325 view of the code at this time is as follows:
1326
1327 A) If there is a misaligned access then see if peeling to align
1328 this access can make all data references satisfy
1329 vect_supportable_dr_alignment. If so, update data structures
1330 as needed and return true.
1331
1332 B) If peeling wasn't possible and there is a data reference with an
1333 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1334 then see if loop versioning checks can be used to make all data
1335 references satisfy vect_supportable_dr_alignment. If so, update
1336 data structures as needed and return true.
1337
1338 C) If neither peeling nor versioning were successful then return false if
1339 any data reference does not satisfy vect_supportable_dr_alignment.
1340
1341 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1342
1343 Note, Possibility 3 above (which is peeling and versioning together) is not
1344 being done at this time. */
1345
1346 /* (1) Peeling to force alignment. */
1347
1348 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1349 Considerations:
1350 + How many accesses will become aligned due to the peeling
1351 - How many accesses will become unaligned due to the peeling,
1352 and the cost of misaligned accesses.
1353 - The cost of peeling (the extra runtime checks, the increase
1354 in code size). */
1355
1356 FOR_EACH_VEC_ELT (datarefs, i, dr)
1357 {
1358 stmt = DR_STMT (dr);
1359 stmt_info = vinfo_for_stmt (stmt);
1360
1361 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1362 continue;
1363
1364 /* For interleaving, only the alignment of the first access
1365 matters. */
1366 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1367 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1368 continue;
1369
1370 /* For invariant accesses there is nothing to enhance. */
1371 if (integer_zerop (DR_STEP (dr)))
1372 continue;
1373
1374 /* Strided loads perform only component accesses, alignment is
1375 irrelevant for them. */
1376 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1377 continue;
1378
1379 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1380 do_peeling = vector_alignment_reachable_p (dr);
1381 if (do_peeling)
1382 {
1383 if (known_alignment_for_access_p (dr))
1384 {
1385 unsigned int npeel_tmp;
1386 bool negative = tree_int_cst_compare (DR_STEP (dr),
1387 size_zero_node) < 0;
1388
1389 /* Save info about DR in the hash table. */
1390 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1391 LOOP_VINFO_PEELING_HTAB (loop_vinfo) =
1392 htab_create (1, vect_peeling_hash,
1393 vect_peeling_hash_eq, free);
1394
1395 vectype = STMT_VINFO_VECTYPE (stmt_info);
1396 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1397 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1398 TREE_TYPE (DR_REF (dr))));
1399 npeel_tmp = (negative
1400 ? (mis - nelements) : (nelements - mis))
1401 & (nelements - 1);
1402
1403 /* For multiple types, it is possible that the bigger type access
1404 will have more than one peeling option.  E.g., a loop with two
1405 types: one of size (vector size / 4), and the other one of
1406 size (vector size / 8).  The vectorization factor will be 8.
1407 If both accesses are misaligned by 3, the first one needs one
1408 scalar iteration to be aligned, and the second one needs 5.
1409 But the first one will also be aligned by peeling 5 scalar
1410 iterations, and in that case both accesses will be aligned.
1411 Hence, in addition to the immediate peeling amount, we also want
1412 to try adding a full vector size, as long as we don't exceed the
1413 vectorization factor.
1414 We do this automatically for the cost model, since we calculate
1415 the cost for every peeling option.  */
1416 if (!flag_vect_cost_model)
1417 possible_npeel_number = vf / nelements;
1418
1419 /* Handle the aligned case. We may decide to align some other
1420 access, making DR unaligned. */
1421 if (DR_MISALIGNMENT (dr) == 0)
1422 {
1423 npeel_tmp = 0;
1424 if (!flag_vect_cost_model)
1425 possible_npeel_number++;
1426 }
1427
1428 for (j = 0; j < possible_npeel_number; j++)
1429 {
1430 gcc_assert (npeel_tmp <= vf);
1431 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1432 npeel_tmp += nelements;
1433 }
1434
1435 all_misalignments_unknown = false;
1436 /* Data-ref that was chosen for the case that all the
1437 misalignments are unknown is not relevant anymore, since we
1438 have a data-ref with known alignment. */
1439 dr0 = NULL;
1440 }
1441 else
1442 {
1443 /* If we don't know all the misalignment values, we prefer
1444 peeling for data-ref that has maximum number of data-refs
1445 with the same alignment, unless the target prefers to align
1446 stores over loads.  */
1447 if (all_misalignments_unknown)
1448 {
1449 if (same_align_drs_max
1450 < STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ()
1451 || !dr0)
1452 {
1453 same_align_drs_max
1454 = STMT_VINFO_SAME_ALIGN_REFS (stmt_info).length ();
1455 dr0 = dr;
1456 }
1457
1458 if (!first_store && DR_IS_WRITE (dr))
1459 first_store = dr;
1460 }
1461
1462 /* If there are both known and unknown misaligned accesses in the
1463 loop, we choose peeling amount according to the known
1464 accesses. */
1465
1466
1467 if (!supportable_dr_alignment)
1468 {
1469 dr0 = dr;
1470 if (!first_store && DR_IS_WRITE (dr))
1471 first_store = dr;
1472 }
1473 }
1474 }
1475 else
1476 {
1477 if (!aligned_access_p (dr))
1478 {
1479 if (dump_enabled_p ())
1480 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1481 "vector alignment may not be reachable");
1482 break;
1483 }
1484 }
1485 }
1486
1487 vect_versioning_for_alias_required
1488 = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
1489
1490 /* Temporarily, if versioning for alias is required, we disable peeling
1491 until we support peeling and versioning. Often peeling for alignment
1492 will require peeling for loop-bound, which in turn requires that we
1493 know how to adjust the loop ivs after the loop. */
1494 if (vect_versioning_for_alias_required
1495 || !vect_can_advance_ivs_p (loop_vinfo)
1496 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1497 do_peeling = false;
1498
1499 if (do_peeling && all_misalignments_unknown
1500 && vect_supportable_dr_alignment (dr0, false))
1501 {
1502
1503 /* Check if the target prefers stores over loads, i.e., if
1504 misaligned stores are more expensive than misaligned loads (taking
1505 drs with same alignment into account). */
1506 if (first_store && DR_IS_READ (dr0))
1507 {
1508 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1509 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1510 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1511 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1512 stmt_vector_for_cost dummy;
1513 dummy.create (2);
1514
1515 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1516 &dummy);
1517 vect_get_data_access_cost (first_store, &store_inside_cost,
1518 &store_outside_cost, &dummy);
1519
1520 dummy.release ();
1521
1522 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1523 aligning the load DR0). */
1524 load_inside_penalty = store_inside_cost;
1525 load_outside_penalty = store_outside_cost;
1526 for (i = 0;
1527 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1528 DR_STMT (first_store))).iterate (i, &dr);
1529 i++)
1530 if (DR_IS_READ (dr))
1531 {
1532 load_inside_penalty += load_inside_cost;
1533 load_outside_penalty += load_outside_cost;
1534 }
1535 else
1536 {
1537 load_inside_penalty += store_inside_cost;
1538 load_outside_penalty += store_outside_cost;
1539 }
1540
1541 /* Calculate the penalty for leaving DR0 unaligned (by
1542 aligning the FIRST_STORE). */
1543 store_inside_penalty = load_inside_cost;
1544 store_outside_penalty = load_outside_cost;
1545 for (i = 0;
1546 STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (
1547 DR_STMT (dr0))).iterate (i, &dr);
1548 i++)
1549 if (DR_IS_READ (dr))
1550 {
1551 store_inside_penalty += load_inside_cost;
1552 store_outside_penalty += load_outside_cost;
1553 }
1554 else
1555 {
1556 store_inside_penalty += store_inside_cost;
1557 store_outside_penalty += store_outside_cost;
1558 }
1559
1560 if (load_inside_penalty > store_inside_penalty
1561 || (load_inside_penalty == store_inside_penalty
1562 && load_outside_penalty > store_outside_penalty))
1563 dr0 = first_store;
1564 }
1565
1566 /* In case there are only loads with different unknown misalignments, use
1567 peeling only if it may help to align other accesses in the loop. */
1568 if (!first_store
1569 && !STMT_VINFO_SAME_ALIGN_REFS (
1570 vinfo_for_stmt (DR_STMT (dr0))).length ()
1571 && vect_supportable_dr_alignment (dr0, false)
1572 != dr_unaligned_supported)
1573 do_peeling = false;
1574 }
1575
1576 if (do_peeling && !dr0)
1577 {
1578 /* Peeling is possible, but no data access strictly requires it (all are
1579 supported even if unaligned).  So we try to choose the best peeling.  */
1580
1581 /* We should get here only if there are drs with known misalignment. */
1582 gcc_assert (!all_misalignments_unknown);
1583
1584 /* Choose the best peeling from the hash table. */
1585 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1586 &body_cost_vec);
1587 if (!dr0 || !npeel)
1588 do_peeling = false;
1589 }
1590
1591 if (do_peeling)
1592 {
1593 stmt = DR_STMT (dr0);
1594 stmt_info = vinfo_for_stmt (stmt);
1595 vectype = STMT_VINFO_VECTYPE (stmt_info);
1596 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1597
1598 if (known_alignment_for_access_p (dr0))
1599 {
1600 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1601 size_zero_node) < 0;
1602 if (!npeel)
1603 {
1604 /* Since it's known at compile time, compute the number of
1605 iterations in the peeled loop (the peeling factor) for use in
1606 updating DR_MISALIGNMENT values. The peeling factor is the
1607 vectorization factor minus the misalignment as an element
1608 count. */
1609 mis = DR_MISALIGNMENT (dr0);
1610 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1611 npeel = ((negative ? mis - nelements : nelements - mis)
1612 & (nelements - 1));
1613 }
1614
1615 /* For interleaved data access every iteration accesses all the
1616 members of the group, therefore we divide the number of iterations
1617 by the group size. */
1618 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1619 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1620 npeel /= GROUP_SIZE (stmt_info);
1621
1622 if (dump_enabled_p ())
1623 dump_printf_loc (MSG_NOTE, vect_location,
1624 "Try peeling by %d", npeel);
1625 }
1626
1627 /* Ensure that all data refs can be vectorized after the peel. */
1628 FOR_EACH_VEC_ELT (datarefs, i, dr)
1629 {
1630 int save_misalignment;
1631
1632 if (dr == dr0)
1633 continue;
1634
1635 stmt = DR_STMT (dr);
1636 stmt_info = vinfo_for_stmt (stmt);
1637 /* For interleaving, only the alignment of the first access
1638 matters. */
1639 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1640 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1641 continue;
1642
1643 /* Strided loads perform only component accesses, alignment is
1644 irrelevant for them. */
1645 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1646 continue;
1647
1648 save_misalignment = DR_MISALIGNMENT (dr);
1649 vect_update_misalignment_for_peel (dr, dr0, npeel);
1650 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1651 SET_DR_MISALIGNMENT (dr, save_misalignment);
1652
1653 if (!supportable_dr_alignment)
1654 {
1655 do_peeling = false;
1656 break;
1657 }
1658 }
1659
1660 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1661 {
1662 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1663 if (!stat)
1664 do_peeling = false;
1665 else
1666 {
1667 body_cost_vec.release ();
1668 return stat;
1669 }
1670 }
1671
1672 if (do_peeling)
1673 {
1674 stmt_info_for_cost *si;
1675 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1676
1677 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1678 If the misalignment of DR_i is identical to that of dr0 then set
1679 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1680 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1681 by the peeling factor times the element size of DR_i (MOD the
1682 vectorization factor times the size). Otherwise, the
1683 misalignment of DR_i must be set to unknown. */
1684 FOR_EACH_VEC_ELT (datarefs, i, dr)
1685 if (dr != dr0)
1686 vect_update_misalignment_for_peel (dr, dr0, npeel);
1687
1688 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1689 if (npeel)
1690 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1691 else
1692 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
1693 SET_DR_MISALIGNMENT (dr0, 0);
1694 if (dump_enabled_p ())
1695 {
1696 dump_printf_loc (MSG_NOTE, vect_location,
1697 "Alignment of access forced using peeling.");
1698 dump_printf_loc (MSG_NOTE, vect_location,
1699 "Peeling for alignment will be applied.");
1700 }
1701 /* We've delayed passing the inside-loop peeling costs to the
1702 target cost model until we were sure peeling would happen.
1703 Do so now. */
1704 if (body_cost_vec.exists ())
1705 {
1706 FOR_EACH_VEC_ELT (body_cost_vec, i, si)
1707 {
1708 struct _stmt_vec_info *stmt_info
1709 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1710 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1711 si->misalign, vect_body);
1712 }
1713 body_cost_vec.release ();
1714 }
1715
1716 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1717 gcc_assert (stat);
1718 return stat;
1719 }
1720 }
1721
1722 body_cost_vec.release ();
1723
1724 /* (2) Versioning to force alignment. */
1725
1726 /* Try versioning if:
1727 1) flag_tree_vect_loop_version is TRUE
1728 2) optimize loop for speed
1729 3) there is at least one unsupported misaligned data ref with an unknown
1730 misalignment, and
1731 4) all misaligned data refs with a known misalignment are supported, and
1732 5) the number of runtime alignment checks is within reason. */
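/* Illustrative sketch, not the literal code emitted here: versioning for
   alignment produces two copies of the loop guarded by a runtime test of
   the potentially misaligned addresses, roughly

     if (((addr1 | addr2 | ...) & mask) == 0)
       <vectorized loop, assuming the checked refs are aligned>
     else
       <original scalar loop>

   where MASK is the LOOP_VINFO_PTR_MASK computed below.  */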
1733
1734 do_versioning =
1735 flag_tree_vect_loop_version
1736 && optimize_loop_nest_for_speed_p (loop)
1737 && (!loop->inner); /* FORNOW */
1738
1739 if (do_versioning)
1740 {
1741 FOR_EACH_VEC_ELT (datarefs, i, dr)
1742 {
1743 stmt = DR_STMT (dr);
1744 stmt_info = vinfo_for_stmt (stmt);
1745
1746 /* For interleaving, only the alignment of the first access
1747 matters. */
1748 if (aligned_access_p (dr)
1749 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1750 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
1751 continue;
1752
1753 /* Strided loads perform only component accesses, alignment is
1754 irrelevant for them. */
1755 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1756 continue;
1757
1758 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1759
1760 if (!supportable_dr_alignment)
1761 {
1762 gimple stmt;
1763 int mask;
1764 tree vectype;
1765
1766 if (known_alignment_for_access_p (dr)
1767 || LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ()
1768 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
1769 {
1770 do_versioning = false;
1771 break;
1772 }
1773
1774 stmt = DR_STMT (dr);
1775 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
1776 gcc_assert (vectype);
1777
1778 /* The rightmost bits of an aligned address must be zeros.
1779 Construct the mask needed for this test. For example,
1780 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
1781 mask must be 15 = 0xf. */
1782 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
1783
1784 /* FORNOW: use the same mask to test all potentially unaligned
1785 references in the loop. The vectorizer currently supports
1786 a single vector size, see the reference to
1787 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
1788 vectorization factor is computed. */
1789 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
1790 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
1791 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
1792 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).safe_push (
1793 DR_STMT (dr));
1794 }
1795 }
1796
1797 /* Versioning requires at least one misaligned data reference. */
1798 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1799 do_versioning = false;
1800 else if (!do_versioning)
1801 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).truncate (0);
1802 }
1803
1804 if (do_versioning)
1805 {
1806 vec<gimple> may_misalign_stmts
1807 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
1808 gimple stmt;
1809
1810 /* It can now be assumed that the data references in the statements
1811 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
1812 of the loop being vectorized. */
1813 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt)
1814 {
1815 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1816 dr = STMT_VINFO_DATA_REF (stmt_info);
1817 SET_DR_MISALIGNMENT (dr, 0);
1818 if (dump_enabled_p ())
1819 dump_printf_loc (MSG_NOTE, vect_location,
1820 "Alignment of access forced using versioning.");
1821 }
1822
1823 if (dump_enabled_p ())
1824 dump_printf_loc (MSG_NOTE, vect_location,
1825 "Versioning for alignment will be applied.");
1826
1827 /* Peeling and versioning can't be done together at this time. */
1828 gcc_assert (! (do_peeling && do_versioning));
1829
1830 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1831 gcc_assert (stat);
1832 return stat;
1833 }
1834
1835 /* This point is reached if neither peeling nor versioning is being done. */
1836 gcc_assert (! (do_peeling || do_versioning));
1837
1838 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1839 return stat;
1840 }
1841
1842
1843 /* Function vect_find_same_alignment_drs.
1844
1845 Update group and alignment relations according to the chosen
1846 vectorization factor. */
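/* For instance (illustrative): with a vectorization factor of 4 and 4-byte
   elements, the loads a[i] and a[i+4] have dependence distance 4; since
   4 % 4 == 0 and the element sizes match, the two references are recorded
   as having the same alignment.  */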
1847
1848 static void
1849 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
1850 loop_vec_info loop_vinfo)
1851 {
1852 unsigned int i;
1853 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1854 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1855 struct data_reference *dra = DDR_A (ddr);
1856 struct data_reference *drb = DDR_B (ddr);
1857 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
1858 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
1859 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
1860 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
1861 lambda_vector dist_v;
1862 unsigned int loop_depth;
1863
1864 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
1865 return;
1866
1867 if (dra == drb)
1868 return;
1869
1870 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
1871 return;
1872
1873 /* Loop-based vectorization and known data dependence. */
1874 if (DDR_NUM_DIST_VECTS (ddr) == 0)
1875 return;
1876
1877 /* Data-dependence analysis reports a distance vector of zero
1878 for data-references that overlap only in the first iteration
1879 but whose steps have different signs (see PR45764).
1880 So as a sanity check require equal DR_STEP. */
1881 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
1882 return;
1883
1884 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
1885 FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
1886 {
1887 int dist = dist_v[loop_depth];
1888
1889 if (dump_enabled_p ())
1890 dump_printf_loc (MSG_NOTE, vect_location,
1891 "dependence distance = %d.", dist);
1892
1893 /* Same loop iteration. */
1894 if (dist == 0
1895 || (dist % vectorization_factor == 0 && dra_size == drb_size))
1896 {
1897 /* Two references with distance zero have the same alignment. */
1898 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a).safe_push (drb);
1899 STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b).safe_push (dra);
1900 if (dump_enabled_p ())
1901 {
1902 dump_printf_loc (MSG_NOTE, vect_location,
1903 "accesses have the same alignment.");
1904 dump_printf (MSG_NOTE,
1905 "dependence distance modulo vf == 0 between ");
1906 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
1907 dump_printf (MSG_NOTE, " and ");
1908 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
1909 }
1910 }
1911 }
1912 }
1913
1914
1915 /* Function vect_analyze_data_refs_alignment
1916
1917 Analyze the alignment of the data-references in the loop.
1918 Return FALSE if a data reference is found that cannot be vectorized. */
1919
1920 bool
1921 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
1922 bb_vec_info bb_vinfo)
1923 {
1924 if (dump_enabled_p ())
1925 dump_printf_loc (MSG_NOTE, vect_location,
1926 "=== vect_analyze_data_refs_alignment ===");
1927
1928 /* Mark groups of data references with same alignment using
1929 data dependence information. */
1930 if (loop_vinfo)
1931 {
1932 vec<ddr_p> ddrs = LOOP_VINFO_DDRS (loop_vinfo);
1933 struct data_dependence_relation *ddr;
1934 unsigned int i;
1935
1936 FOR_EACH_VEC_ELT (ddrs, i, ddr)
1937 vect_find_same_alignment_drs (ddr, loop_vinfo);
1938 }
1939
1940 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
1941 {
1942 if (dump_enabled_p ())
1943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1944 "not vectorized: can't calculate alignment "
1945 "for data ref.");
1946 return false;
1947 }
1948
1949 return true;
1950 }
1951
1952
1953 /* Analyze groups of accesses: check that DR belongs to a group of
1954 accesses of legal size, step, etc. Detect gaps, single element
1955 interleaving, and other special cases. Set grouped access info.
1956 Collect groups of strided stores for further use in SLP analysis. */
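/* For example (illustrative), in

     for (i = 0; i < n; i++)
       {
         s1 += a[2*i];
         s2 += a[2*i+1];
       }

   the two loads form one interleaving group with GROUP_SIZE == 2 and
   DR_STEP == 2 * sizeof (a[0]).  */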
1957
1958 static bool
1959 vect_analyze_group_access (struct data_reference *dr)
1960 {
1961 tree step = DR_STEP (dr);
1962 tree scalar_type = TREE_TYPE (DR_REF (dr));
1963 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
1964 gimple stmt = DR_STMT (dr);
1965 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1966 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1967 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
1968 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
1969 HOST_WIDE_INT groupsize, last_accessed_element = 1;
1970 bool slp_impossible = false;
1971 struct loop *loop = NULL;
1972
1973 if (loop_vinfo)
1974 loop = LOOP_VINFO_LOOP (loop_vinfo);
1975
1976 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
1977 size of the interleaving group (including gaps). */
1978 groupsize = dr_step / type_size;
1979
1980 /* A non-consecutive access is possible only if it is part of an interleaving group. */
1981 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
1982 {
1983 /* Check whether this DR is part of an interleaving group, and is the only
1984 element of the group that is accessed in the loop. */
1985
1986 /* Gaps are supported only for loads. STEP must be a multiple of the type
1987 size. The size of the group must be a power of 2. */
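/* E.g. (illustrative): a loop whose only access to the array is the load
   a[4*i] has DR_STEP == 4 * sizeof (a[0]) and groupsize == 4; being a
   read with a power-of-two group size it is accepted as single element
   interleaving with gaps, which in a loop requires a scalar epilogue
   (LOOP_VINFO_PEELING_FOR_GAPS).  */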
1988 if (DR_IS_READ (dr)
1989 && (dr_step % type_size) == 0
1990 && groupsize > 0
1991 && exact_log2 (groupsize) != -1)
1992 {
1993 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
1994 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
1995 if (dump_enabled_p ())
1996 {
1997 dump_printf_loc (MSG_NOTE, vect_location,
1998 "Detected single element interleaving ");
1999 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2000 dump_printf (MSG_NOTE, " step ");
2001 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2002 }
2003
2004 if (loop_vinfo)
2005 {
2006 if (dump_enabled_p ())
2007 dump_printf_loc (MSG_NOTE, vect_location,
2008 "Data access with gaps requires scalar "
2009 "epilogue loop");
2010 if (loop->inner)
2011 {
2012 if (dump_enabled_p ())
2013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2014 "Peeling for outer loop is not"
2015 " supported");
2016 return false;
2017 }
2018
2019 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2020 }
2021
2022 return true;
2023 }
2024
2025 if (dump_enabled_p ())
2026 {
2027 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2028 "not consecutive access ");
2029 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2030 }
2031
2032 if (bb_vinfo)
2033 {
2034 /* Mark the statement as unvectorizable. */
2035 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2036 return true;
2037 }
2038
2039 return false;
2040 }
2041
2042 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2043 {
2044 /* First stmt in the interleaving chain. Check the chain. */
2045 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2046 struct data_reference *data_ref = dr;
2047 unsigned int count = 1;
2048 tree next_step;
2049 tree prev_init = DR_INIT (data_ref);
2050 gimple prev = stmt;
2051 HOST_WIDE_INT diff, count_in_bytes, gaps = 0;
2052
2053 while (next)
2054 {
2055 /* Skip identical data-refs. If two or more stmts share the same
2056 data-ref (supported only for loads), we vectorize only the first
2057 stmt, and the rest get their vectorized loads from the first
2058 one. */
2059 if (!tree_int_cst_compare (DR_INIT (data_ref),
2060 DR_INIT (STMT_VINFO_DATA_REF (
2061 vinfo_for_stmt (next)))))
2062 {
2063 if (DR_IS_WRITE (data_ref))
2064 {
2065 if (dump_enabled_p ())
2066 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2067 "Two store stmts share the same dr.");
2068 return false;
2069 }
2070
2071 /* Check that there are no load-store dependences for these loads,
2072 to prevent a load-store-load sequence to the same location. */
2073 if (GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (next))
2074 || GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (prev)))
2075 {
2076 if (dump_enabled_p ())
2077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2078 "READ_WRITE dependence in interleaving.");
2079 return false;
2080 }
2081
2082 /* For this load, reuse the vectorized load of the previous stmt with the same data-ref. */
2083 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2084
2085 prev = next;
2086 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2087 continue;
2088 }
2089
2090 prev = next;
2091
2092 /* Check that all the accesses have the same STEP. */
2093 next_step = DR_STEP (STMT_VINFO_DATA_REF (vinfo_for_stmt (next)));
2094 if (tree_int_cst_compare (step, next_step))
2095 {
2096 if (dump_enabled_p ())
2097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098 "not consecutive access in interleaving");
2099 return false;
2100 }
2101
2102 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2103 /* Check that the distance between two accesses is equal to the type
2104 size. Otherwise, we have gaps. */
2105 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2106 - TREE_INT_CST_LOW (prev_init)) / type_size;
2107 if (diff != 1)
2108 {
2109 /* FORNOW: SLP of accesses with gaps is not supported. */
2110 slp_impossible = true;
2111 if (DR_IS_WRITE (data_ref))
2112 {
2113 if (dump_enabled_p ())
2114 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2115 "interleaved store with gaps");
2116 return false;
2117 }
2118
2119 gaps += diff - 1;
2120 }
2121
2122 last_accessed_element += diff;
2123
2124 /* Store the gap from the previous member of the group. If there is no
2125 gap in the access, GROUP_GAP is always 1. */
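/* Illustrative example: if the group members are a[3*i] and a[3*i+2],
   DIFF for the second member is 2, so one gap element is recorded
   (gaps == 1), GROUP_GAP of that member is set to 2, and SLP is marked
   impossible for the group.  */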
2126 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2127
2128 prev_init = DR_INIT (data_ref);
2129 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2130 /* Count the number of data-refs in the chain. */
2131 count++;
2132 }
2133
2134 /* COUNT is the number of accesses found; multiply it by the size of
2135 the type to get COUNT_IN_BYTES. */
2136 count_in_bytes = type_size * count;
2137
2138 /* Check that the size of the interleaving (including gaps) is not
2139 greater than STEP. */
2140 if (dr_step && dr_step < count_in_bytes + gaps * type_size)
2141 {
2142 if (dump_enabled_p ())
2143 {
2144 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2145 "interleaving size is greater than step for ");
2146 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dr));
2147 }
2148 return false;
2149 }
2150
2151 /* Check that the size of the interleaving is equal to STEP for stores,
2152 i.e., that there are no gaps. */
2153 if (dr_step && dr_step != count_in_bytes)
2154 {
2155 if (DR_IS_READ (dr))
2156 {
2157 slp_impossible = true;
2158 /* There is a gap after the last load in the group. This gap is the
2159 difference between the groupsize and the number of elements.
2160 When there is no gap, this difference should be 0. */
2161 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2162 }
2163 else
2164 {
2165 if (dump_enabled_p ())
2166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2167 "interleaved store with gaps");
2168 return false;
2169 }
2170 }
2171
2172 /* Check that STEP is a multiple of type size. */
2173 if (dr_step && (dr_step % type_size) != 0)
2174 {
2175 if (dump_enabled_p ())
2176 {
2177 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2178 "step is not a multiple of type size: step ");
2179 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2180 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2181 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2182 TYPE_SIZE_UNIT (scalar_type));
2183 }
2184 return false;
2185 }
2186
2187 if (groupsize == 0)
2188 groupsize = count;
2189
2190 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2191 if (dump_enabled_p ())
2192 dump_printf_loc (MSG_NOTE, vect_location,
2193 "Detected interleaving of size %d", (int)groupsize);
2194
2195 /* SLP: create an SLP data structure for every interleaving group of
2196 stores for further analysis in vect_analyze_slp. */
2197 if (DR_IS_WRITE (dr) && !slp_impossible)
2198 {
2199 if (loop_vinfo)
2200 LOOP_VINFO_GROUPED_STORES (loop_vinfo).safe_push (stmt);
2201 if (bb_vinfo)
2202 BB_VINFO_GROUPED_STORES (bb_vinfo).safe_push (stmt);
2203 }
2204
2205 /* There is a gap at the end of the group. */
2206 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2207 {
2208 if (dump_enabled_p ())
2209 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2210 "Data access with gaps requires scalar "
2211 "epilogue loop");
2212 if (loop->inner)
2213 {
2214 if (dump_enabled_p ())
2215 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2216 "Peeling for outer loop is not supported");
2217 return false;
2218 }
2219
2220 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2221 }
2222 }
2223
2224 return true;
2225 }
2226
2227
2228 /* Analyze the access pattern of the data-reference DR.
2229 In case of non-consecutive accesses call vect_analyze_group_access() to
2230 analyze groups of accesses. */
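/* For example (illustrative): a[i] with DR_STEP == sizeof (a[0]) is a
   consecutive access and needs no group analysis, whereas a[2*i] has a
   larger constant step and is handed to vect_analyze_group_access.  */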
2231
2232 static bool
2233 vect_analyze_data_ref_access (struct data_reference *dr)
2234 {
2235 tree step = DR_STEP (dr);
2236 tree scalar_type = TREE_TYPE (DR_REF (dr));
2237 gimple stmt = DR_STMT (dr);
2238 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2239 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2240 struct loop *loop = NULL;
2241
2242 if (loop_vinfo)
2243 loop = LOOP_VINFO_LOOP (loop_vinfo);
2244
2245 if (loop_vinfo && !step)
2246 {
2247 if (dump_enabled_p ())
2248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2249 "bad data-ref access in loop");
2250 return false;
2251 }
2252
2253 /* Allow invariant loads in loops. */
2254 if (loop_vinfo && integer_zerop (step))
2255 {
2256 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2257 return DR_IS_READ (dr);
2258 }
2259
2260 if (loop && nested_in_vect_loop_p (loop, stmt))
2261 {
2262 /* Interleaved accesses are not yet supported within outer-loop
2263 vectorization for references in the inner-loop. */
2264 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2265
2266 /* For the rest of the analysis we use the outer-loop step. */
2267 step = STMT_VINFO_DR_STEP (stmt_info);
2268 if (integer_zerop (step))
2269 {
2270 if (dump_enabled_p ())
2271 dump_printf_loc (MSG_NOTE, vect_location,
2272 "zero step in outer loop.");
2273 if (DR_IS_READ (dr))
2274 return true;
2275 else
2276 return false;
2277 }
2278 }
2279
2280 /* Consecutive? */
2281 if (TREE_CODE (step) == INTEGER_CST)
2282 {
2283 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2284 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2285 || (dr_step < 0
2286 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2287 {
2288 /* Mark that it is not interleaving. */
2289 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2290 return true;
2291 }
2292 }
2293
2294 if (loop && nested_in_vect_loop_p (loop, stmt))
2295 {
2296 if (dump_enabled_p ())
2297 dump_printf_loc (MSG_NOTE, vect_location,
2298 "grouped access in outer loop.");
2299 return false;
2300 }
2301
2302 /* Assume this is a DR handled by the non-constant strided load case. */
2303 if (TREE_CODE (step) != INTEGER_CST)
2304 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2305
2306 /* Non-consecutive access - check whether it is part of an interleaving group. */
2307 return vect_analyze_group_access (dr);
2308 }
2309
2310 /* Compare two data-references DRA and DRB to sort them into chunks
2311 suitable for building interleaving groups. */
2312
2313 static int
2314 dr_group_sort_cmp (const void *dra_, const void *drb_)
2315 {
2316 data_reference_p dra = *(data_reference_p *)const_cast<void *>(dra_);
2317 data_reference_p drb = *(data_reference_p *)const_cast<void *>(drb_);
2318 hashval_t h1, h2;
2319 int cmp;
2320
2321 /* Stabilize sort. */
2322 if (dra == drb)
2323 return 0;
2324
2325 /* Ordering of DRs according to base. */
2326 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0))
2327 {
2328 h1 = iterative_hash_expr (DR_BASE_ADDRESS (dra), 0);
2329 h2 = iterative_hash_expr (DR_BASE_ADDRESS (drb), 0);
2330 if (h1 != h2)
2331 return h1 < h2 ? -1 : 1;
2332 }
2333
2334 /* And according to DR_OFFSET. */
2335 if (!dr_equal_offsets_p (dra, drb))
2336 {
2337 h1 = iterative_hash_expr (DR_OFFSET (dra), 0);
2338 h2 = iterative_hash_expr (DR_OFFSET (drb), 0);
2339 if (h1 != h2)
2340 return h1 < h2 ? -1 : 1;
2341 }
2342
2343 /* Put reads before writes. */
2344 if (DR_IS_READ (dra) != DR_IS_READ (drb))
2345 return DR_IS_READ (dra) ? -1 : 1;
2346
2347 /* Then sort by access size. */
2348 if (!operand_equal_p (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))),
2349 TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0))
2350 {
2351 h1 = iterative_hash_expr (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))), 0);
2352 h2 = iterative_hash_expr (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))), 0);
2353 if (h1 != h2)
2354 return h1 < h2 ? -1 : 1;
2355 }
2356
2357 /* And by step. */
2358 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2359 {
2360 h1 = iterative_hash_expr (DR_STEP (dra), 0);
2361 h2 = iterative_hash_expr (DR_STEP (drb), 0);
2362 if (h1 != h2)
2363 return h1 < h2 ? -1 : 1;
2364 }
2365
2366 /* Then sort by DR_INIT. For identical DRs, sort by stmt UID. */
2367 cmp = tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb));
2368 if (cmp == 0)
2369 return gimple_uid (DR_STMT (dra)) < gimple_uid (DR_STMT (drb)) ? -1 : 1;
2370 return cmp;
2371 }
2372
2373 /* Function vect_analyze_data_ref_accesses.
2374
2375 Analyze the access pattern of all the data references in the loop.
2376
2377 FORNOW: the only access pattern that is considered vectorizable is a
2378 simple step 1 (consecutive) access.
2379
2380 FORNOW: handle only arrays and pointer accesses. */
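/* Illustrative sketch of the chain building below: after sorting, the
   data-refs for a[2*i] and a[2*i+1] (same base, offset and step) end up
   adjacent; GROUP_FIRST_ELEMENT of both stmts is set to the a[2*i] stmt
   and GROUP_NEXT_ELEMENT links it to the a[2*i+1] stmt.  */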
2381
2382 bool
2383 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2384 {
2385 unsigned int i;
2386 vec<data_reference_p> datarefs;
2387 struct data_reference *dr;
2388
2389 if (dump_enabled_p ())
2390 dump_printf_loc (MSG_NOTE, vect_location,
2391 "=== vect_analyze_data_ref_accesses ===");
2392
2393 if (loop_vinfo)
2394 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2395 else
2396 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2397
2398 if (datarefs.is_empty ())
2399 return true;
2400
2401 /* Sort the array of datarefs to make building the interleaving chains
2402 linear. */
2403 qsort (datarefs.address(), datarefs.length (),
2404 sizeof (data_reference_p), dr_group_sort_cmp);
2405
2406 /* Build the interleaving chains. */
2407 for (i = 0; i < datarefs.length () - 1;)
2408 {
2409 data_reference_p dra = datarefs[i];
2410 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2411 stmt_vec_info lastinfo = NULL;
2412 for (i = i + 1; i < datarefs.length (); ++i)
2413 {
2414 data_reference_p drb = datarefs[i];
2415 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2416
2417 /* ??? Imperfect sorting (non-compatible types, non-modulo
2418 accesses, same accesses) can lead to a group being artificially
2419 split here as we don't just skip over those. If it really
2420 matters we can push those to a worklist and re-iterate
2421 over them. Then we can just skip ahead to the next DR here. */
2422
2423 /* Check that the data-refs have the same first location (except init)
2424 and are both either stores or loads (not one load and one store). */
2425 if (DR_IS_READ (dra) != DR_IS_READ (drb)
2426 || !operand_equal_p (DR_BASE_ADDRESS (dra),
2427 DR_BASE_ADDRESS (drb), 0)
2428 || !dr_equal_offsets_p (dra, drb))
2429 break;
2430
2431 /* Check that the data-refs have the same constant size and step. */
2432 tree sza = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra)));
2433 tree szb = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb)));
2434 if (!host_integerp (sza, 1)
2435 || !host_integerp (szb, 1)
2436 || !tree_int_cst_equal (sza, szb)
2437 || !host_integerp (DR_STEP (dra), 0)
2438 || !host_integerp (DR_STEP (drb), 0)
2439 || !tree_int_cst_equal (DR_STEP (dra), DR_STEP (drb)))
2440 break;
2441
2442 /* Do not place the same access in the interleaving chain twice. */
2443 if (tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb)) == 0)
2444 break;
2445
2446 /* Check that the types are compatible.
2447 ??? We don't distinguish this during sorting. */
2448 if (!types_compatible_p (TREE_TYPE (DR_REF (dra)),
2449 TREE_TYPE (DR_REF (drb))))
2450 break;
2451
2452 /* Sorting has ensured that DR_INIT (dra) <= DR_INIT (drb). */
2453 HOST_WIDE_INT init_a = TREE_INT_CST_LOW (DR_INIT (dra));
2454 HOST_WIDE_INT init_b = TREE_INT_CST_LOW (DR_INIT (drb));
2455 gcc_assert (init_a < init_b);
2456
2457 /* If init_b == init_a + the size of the type * k, we have an
2458 interleaving, and DRA is accessed before DRB. */
2459 HOST_WIDE_INT type_size_a = TREE_INT_CST_LOW (sza);
2460 if ((init_b - init_a) % type_size_a != 0)
2461 break;
2462
2463 /* The step (if not zero) must be greater than the difference between the
2464 data-refs' inits; this splits groups into suitable sizes. */
2465 HOST_WIDE_INT step = TREE_INT_CST_LOW (DR_STEP (dra));
2466 if (step != 0 && step <= (init_b - init_a))
2467 break;
2468
2469 if (dump_enabled_p ())
2470 {
2471 dump_printf_loc (MSG_NOTE, vect_location,
2472 "Detected interleaving ");
2473 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2474 dump_printf (MSG_NOTE, " and ");
2475 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2476 }
2477
2478 /* Link the found element into the group list. */
2479 if (!GROUP_FIRST_ELEMENT (stmtinfo_a))
2480 {
2481 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (dra);
2482 lastinfo = stmtinfo_a;
2483 }
2484 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (dra);
2485 GROUP_NEXT_ELEMENT (lastinfo) = DR_STMT (drb);
2486 lastinfo = stmtinfo_b;
2487 }
2488 }
2489
2490 FOR_EACH_VEC_ELT (datarefs, i, dr)
2491 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2492 && !vect_analyze_data_ref_access (dr))
2493 {
2494 if (dump_enabled_p ())
2495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2496 "not vectorized: complicated access pattern.");
2497
2498 if (bb_vinfo)
2499 {
2500 /* Mark the statement as not vectorizable. */
2501 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2502 continue;
2503 }
2504 else
2505 return false;
2506 }
2507
2508 return true;
2509 }
2510
2511 /* Function vect_prune_runtime_alias_test_list.
2512
2513 Prune a list of ddrs to be tested at run-time by versioning for alias.
2514 Return FALSE if the resulting list of ddrs is longer than allowed by
2515 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2516
2517 bool
2518 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2519 {
2520 vec<ddr_p> ddrs =
2521 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2522 unsigned i, j;
2523
2524 if (dump_enabled_p ())
2525 dump_printf_loc (MSG_NOTE, vect_location,
2526 "=== vect_prune_runtime_alias_test_list ===");
2527
2528 for (i = 0; i < ddrs.length (); )
2529 {
2530 bool found;
2531 ddr_p ddr_i;
2532
2533 ddr_i = ddrs[i];
2534 found = false;
2535
2536 for (j = 0; j < i; j++)
2537 {
2538 ddr_p ddr_j = ddrs[j];
2539
2540 if (vect_vfa_range_equal (ddr_i, ddr_j))
2541 {
2542 if (dump_enabled_p ())
2543 {
2544 dump_printf_loc (MSG_NOTE, vect_location,
2545 "found equal ranges ");
2546 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_i)));
2547 dump_printf (MSG_NOTE, ", ");
2548 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_i)));
2549 dump_printf (MSG_NOTE, " and ");
2550 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_j)));
2551 dump_printf (MSG_NOTE, ", ");
2552 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_j)));
2553 }
2554 found = true;
2555 break;
2556 }
2557 }
2558
2559 if (found)
2560 {
2561 ddrs.ordered_remove (i);
2562 continue;
2563 }
2564 i++;
2565 }
2566
2567 if (ddrs.length () >
2568 (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2569 {
2570 if (dump_enabled_p ())
2571 {
2572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2573 "disable versioning for alias - max number of "
2574 "generated checks exceeded.");
2575 }
2576
2577 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).truncate (0);
2578
2579 return false;
2580 }
2581
2582 return true;
2583 }
2584
2585 /* Check whether a non-affine read in stmt is suitable for gather load
2586 and if so, return a builtin decl for that operation. */
2587
2588 tree
2589 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2590 tree *offp, int *scalep)
2591 {
2592 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2593 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2594 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2595 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2596 tree offtype = NULL_TREE;
2597 tree decl, base, off;
2598 enum machine_mode pmode;
2599 int punsignedp, pvolatilep;
2600
2601 /* The gather builtins need an address of the form
2602 loop_invariant + vector * {1, 2, 4, 8}
2603 or
2604 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2605 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2606 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2607 multiplications and additions in it. To get a vector, we need
2608 a single SSA_NAME that will be defined in the loop and will
2609 contain everything that is not loop invariant and that can be
2610 vectorized. The following code attempts to find such a preexisting
2611 SSA_NAME OFF and put the loop invariants into a tree BASE
2612 that can be gimplified before the loop. */
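/* For instance (illustrative): for a read like p[idx[i]] with P loop
   invariant and idx[i] varying, the intent is to end up with BASE roughly
   equal to P (plus any constant offset), OFF the SSA_NAME holding the
   (possibly sign-extended) idx[i], and SCALE equal to sizeof (*p).  */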
2613 base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
2614 &pmode, &punsignedp, &pvolatilep, false);
2615 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2616
2617 if (TREE_CODE (base) == MEM_REF)
2618 {
2619 if (!integer_zerop (TREE_OPERAND (base, 1)))
2620 {
2621 if (off == NULL_TREE)
2622 {
2623 double_int moff = mem_ref_offset (base);
2624 off = double_int_to_tree (sizetype, moff);
2625 }
2626 else
2627 off = size_binop (PLUS_EXPR, off,
2628 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2629 }
2630 base = TREE_OPERAND (base, 0);
2631 }
2632 else
2633 base = build_fold_addr_expr (base);
2634
2635 if (off == NULL_TREE)
2636 off = size_zero_node;
2637
2638 /* If BASE is not loop invariant, then if OFF is 0 we start with just
2639 the constant offset in the loop-invariant BASE and continue with BASE
2640 as OFF; otherwise give up.
2641 We could handle that case by gimplifying the addition of BASE + OFF
2642 into some SSA_NAME and using that as OFF, but for now punt. */
2643 if (!expr_invariant_in_loop_p (loop, base))
2644 {
2645 if (!integer_zerop (off))
2646 return NULL_TREE;
2647 off = base;
2648 base = size_int (pbitpos / BITS_PER_UNIT);
2649 }
2650 /* Otherwise put base + constant offset into the loop invariant BASE
2651 and continue with OFF. */
2652 else
2653 {
2654 base = fold_convert (sizetype, base);
2655 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
2656 }
2657
2658 /* OFF at this point may be either a SSA_NAME or some tree expression
2659 from get_inner_reference. Try to peel off loop invariants from it
2660 into BASE as long as possible. */
2661 STRIP_NOPS (off);
2662 while (offtype == NULL_TREE)
2663 {
2664 enum tree_code code;
2665 tree op0, op1, add = NULL_TREE;
2666
2667 if (TREE_CODE (off) == SSA_NAME)
2668 {
2669 gimple def_stmt = SSA_NAME_DEF_STMT (off);
2670
2671 if (expr_invariant_in_loop_p (loop, off))
2672 return NULL_TREE;
2673
2674 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
2675 break;
2676
2677 op0 = gimple_assign_rhs1 (def_stmt);
2678 code = gimple_assign_rhs_code (def_stmt);
2679 op1 = gimple_assign_rhs2 (def_stmt);
2680 }
2681 else
2682 {
2683 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
2684 return NULL_TREE;
2685 code = TREE_CODE (off);
2686 extract_ops_from_tree (off, &code, &op0, &op1);
2687 }
2688 switch (code)
2689 {
2690 case POINTER_PLUS_EXPR:
2691 case PLUS_EXPR:
2692 if (expr_invariant_in_loop_p (loop, op0))
2693 {
2694 add = op0;
2695 off = op1;
2696 do_add:
2697 add = fold_convert (sizetype, add);
2698 if (scale != 1)
2699 add = size_binop (MULT_EXPR, add, size_int (scale));
2700 base = size_binop (PLUS_EXPR, base, add);
2701 continue;
2702 }
2703 if (expr_invariant_in_loop_p (loop, op1))
2704 {
2705 add = op1;
2706 off = op0;
2707 goto do_add;
2708 }
2709 break;
2710 case MINUS_EXPR:
2711 if (expr_invariant_in_loop_p (loop, op1))
2712 {
2713 add = fold_convert (sizetype, op1);
2714 add = size_binop (MINUS_EXPR, size_zero_node, add);
2715 off = op0;
2716 goto do_add;
2717 }
2718 break;
2719 case MULT_EXPR:
2720 if (scale == 1 && host_integerp (op1, 0))
2721 {
2722 scale = tree_low_cst (op1, 0);
2723 off = op0;
2724 continue;
2725 }
2726 break;
2727 case SSA_NAME:
2728 off = op0;
2729 continue;
2730 CASE_CONVERT:
2731 if (!POINTER_TYPE_P (TREE_TYPE (op0))
2732 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2733 break;
2734 if (TYPE_PRECISION (TREE_TYPE (op0))
2735 == TYPE_PRECISION (TREE_TYPE (off)))
2736 {
2737 off = op0;
2738 continue;
2739 }
2740 if (TYPE_PRECISION (TREE_TYPE (op0))
2741 < TYPE_PRECISION (TREE_TYPE (off)))
2742 {
2743 off = op0;
2744 offtype = TREE_TYPE (off);
2745 STRIP_NOPS (off);
2746 continue;
2747 }
2748 break;
2749 default:
2750 break;
2751 }
2752 break;
2753 }
2754
2755 /* If at the end OFF still isn't an SSA_NAME or isn't
2756 defined in the loop, punt. */
2757 if (TREE_CODE (off) != SSA_NAME
2758 || expr_invariant_in_loop_p (loop, off))
2759 return NULL_TREE;
2760
2761 if (offtype == NULL_TREE)
2762 offtype = TREE_TYPE (off);
2763
2764 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
2765 offtype, scale);
2766 if (decl == NULL_TREE)
2767 return NULL_TREE;
2768
2769 if (basep)
2770 *basep = base;
2771 if (offp)
2772 *offp = off;
2773 if (scalep)
2774 *scalep = scale;
2775 return decl;
2776 }
2777
2778 /* Check whether a non-affine load in STMT (being in the loop referred to
2779 in LOOP_VINFO) is suitable for handling as a strided load. That is the
2780 case if its address is based on a loop-invariant base and is indexed by
2781 a simple affine induction variable of the loop with a loop-invariant
2782 step.
2783
2784 This handles ARRAY_REFs (with variant index) and MEM_REFs (with variant
2785 base pointer) only. */
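/* For example (illustrative), in

     for (i = 0; i < n; i++)
       {
         s += a[k];
         k += stride;
       }

   the base A is loop invariant and the index K is a simple affine
   induction variable with loop-invariant step STRIDE, so the load
   qualifies as a strided load.  */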
2786
2787 static bool
2788 vect_check_strided_load (gimple stmt, loop_vec_info loop_vinfo)
2789 {
2790 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2791 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2792 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2793 tree base, off;
2794 affine_iv iv;
2795
2796 if (!DR_IS_READ (dr))
2797 return false;
2798
2799 base = DR_REF (dr);
2800
2801 if (TREE_CODE (base) == ARRAY_REF)
2802 {
2803 off = TREE_OPERAND (base, 1);
2804 base = TREE_OPERAND (base, 0);
2805 }
2806 else if (TREE_CODE (base) == MEM_REF)
2807 {
2808 off = TREE_OPERAND (base, 0);
2809 base = TREE_OPERAND (base, 1);
2810 }
2811 else
2812 return false;
2813
2814 if (TREE_CODE (off) != SSA_NAME)
2815 return false;
2816
2817 if (!expr_invariant_in_loop_p (loop, base)
2818 || !simple_iv (loop, loop_containing_stmt (stmt), off, &iv, true))
2819 return false;
2820
2821 return true;
2822 }
2823
2824 /* Function vect_analyze_data_refs.
2825
2826 Find all the data references in the loop or basic block.
2827
2828 The general structure of the analysis of data refs in the vectorizer is as
2829 follows:
2830 1- vect_analyze_data_refs(loop/bb): call
2831 compute_data_dependences_for_loop/bb to find and analyze all data-refs
2832 in the loop/bb and their dependences.
2833 2- vect_analyze_dependences(): apply dependence testing using ddrs.
2834 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
2835 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
2836
2837 */
2838
2839 bool
2840 vect_analyze_data_refs (loop_vec_info loop_vinfo,
2841 bb_vec_info bb_vinfo,
2842 int *min_vf)
2843 {
2844 struct loop *loop = NULL;
2845 basic_block bb = NULL;
2846 unsigned int i;
2847 vec<data_reference_p> datarefs;
2848 struct data_reference *dr;
2849 tree scalar_type;
2850
2851 if (dump_enabled_p ())
2852 dump_printf_loc (MSG_NOTE, vect_location,
2853 "=== vect_analyze_data_refs ===\n");
2854
2855 if (loop_vinfo)
2856 {
2857 loop = LOOP_VINFO_LOOP (loop_vinfo);
2858 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo))
2859 || find_data_references_in_loop
2860 (loop, &LOOP_VINFO_DATAREFS (loop_vinfo)))
2861 {
2862 if (dump_enabled_p ())
2863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2864 "not vectorized: loop contains function calls"
2865 " or data references that cannot be analyzed");
2866 return false;
2867 }
2868
2869 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2870 }
2871 else
2872 {
2873 gimple_stmt_iterator gsi;
2874
2875 bb = BB_VINFO_BB (bb_vinfo);
2876 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2877 {
2878 gimple stmt = gsi_stmt (gsi);
2879 if (!find_data_references_in_stmt (NULL, stmt,
2880 &BB_VINFO_DATAREFS (bb_vinfo)))
2881 {
2882 /* Mark the rest of the basic-block as unvectorizable. */
2883 for (; !gsi_end_p (gsi); gsi_next (&gsi))
2884 {
2885 stmt = gsi_stmt (gsi);
2886 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
2887 }
2888 break;
2889 }
2890 }
2891
2892 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2893 }
2894
2895 /* Go through the data-refs, checking that the analysis succeeded. Update
2896 the pointer from the stmt_vec_info struct to the DR and set the vectype. */
2897
2898 FOR_EACH_VEC_ELT (datarefs, i, dr)
2899 {
2900 gimple stmt;
2901 stmt_vec_info stmt_info;
2902 tree base, offset, init;
2903 bool gather = false;
2904 int vf;
2905
2906 if (!dr || !DR_REF (dr))
2907 {
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2910 "not vectorized: unhandled data-ref ");
2911 return false;
2912 }
2913
2914 stmt = DR_STMT (dr);
2915 stmt_info = vinfo_for_stmt (stmt);
2916
2917 /* Check that analysis of the data-ref succeeded. */
2918 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
2919 || !DR_STEP (dr))
2920 {
2921 /* If the target supports vector gather loads, see if they can
2922 be used. */
2923 if (loop_vinfo
2924 && DR_IS_READ (dr)
2925 && !TREE_THIS_VOLATILE (DR_REF (dr))
2926 && targetm.vectorize.builtin_gather != NULL
2927 && !nested_in_vect_loop_p (loop, stmt))
2928 {
2929 struct data_reference *newdr
2930 = create_data_ref (NULL, loop_containing_stmt (stmt),
2931 DR_REF (dr), stmt, true);
2932 gcc_assert (newdr != NULL && DR_REF (newdr));
2933 if (DR_BASE_ADDRESS (newdr)
2934 && DR_OFFSET (newdr)
2935 && DR_INIT (newdr)
2936 && DR_STEP (newdr)
2937 && integer_zerop (DR_STEP (newdr)))
2938 {
2939 dr = newdr;
2940 gather = true;
2941 }
2942 else
2943 free_data_ref (newdr);
2944 }
2945
2946 if (!gather)
2947 {
2948 if (dump_enabled_p ())
2949 {
2950 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2951 "not vectorized: data ref analysis "
2952 "failed ");
2953 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2954 }
2955
2956 if (bb_vinfo)
2957 break;
2958
2959 return false;
2960 }
2961 }
2962
2963 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
2964 {
2965 if (dump_enabled_p ())
2966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2967 "not vectorized: base addr of dr is a "
2968 "constant");
2969
2970 if (bb_vinfo)
2971 break;
2972
2973 if (gather)
2974 free_data_ref (dr);
2975 return false;
2976 }
2977
2978 if (TREE_THIS_VOLATILE (DR_REF (dr)))
2979 {
2980 if (dump_enabled_p ())
2981 {
2982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2983 "not vectorized: volatile type ");
2984 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2985 }
2986
2987 if (bb_vinfo)
2988 break;
2989
2990 return false;
2991 }
2992
2993 if (stmt_can_throw_internal (stmt))
2994 {
2995 if (dump_enabled_p ())
2996 {
2997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2998 "not vectorized: statement can throw an "
2999 "exception ");
3000 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3001 }
3002
3003 if (bb_vinfo)
3004 break;
3005
3006 if (gather)
3007 free_data_ref (dr);
3008 return false;
3009 }
3010
3011 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3012 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3013 {
3014 if (dump_enabled_p ())
3015 {
3016 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3017 "not vectorized: statement is bitfield "
3018 "access ");
3019 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3020 }
3021
3022 if (bb_vinfo)
3023 break;
3024
3025 if (gather)
3026 free_data_ref (dr);
3027 return false;
3028 }
3029
3030 base = unshare_expr (DR_BASE_ADDRESS (dr));
3031 offset = unshare_expr (DR_OFFSET (dr));
3032 init = unshare_expr (DR_INIT (dr));
3033
3034 if (is_gimple_call (stmt))
3035 {
3036 if (dump_enabled_p ())
3037 {
3038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3039 "not vectorized: dr in a call ");
3040 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3041 }
3042
3043 if (bb_vinfo)
3044 break;
3045
3046 if (gather)
3047 free_data_ref (dr);
3048 return false;
3049 }
3050
3051 /* Update DR field in stmt_vec_info struct. */
3052
3053 /* If the dataref is in an inner-loop of the loop that is considered
3054 for vectorization, we also want to analyze the access relative to
3055 the outer-loop (DR contains information only relative to the
3056 inner-most enclosing loop). We do that by building a reference to the
3057 first location accessed by the inner-loop, and analyze it relative to
3058 the outer-loop. */
3059 if (loop && nested_in_vect_loop_p (loop, stmt))
3060 {
3061 tree outer_step, outer_base, outer_init;
3062 HOST_WIDE_INT pbitsize, pbitpos;
3063 tree poffset;
3064 enum machine_mode pmode;
3065 int punsignedp, pvolatilep;
3066 affine_iv base_iv, offset_iv;
3067 tree dinit;
3068
3069 /* Build a reference to the first location accessed by the
3070 inner-loop: *(BASE+INIT). (The first location is actually
3071 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3072 tree inner_base = build_fold_indirect_ref
3073 (fold_build_pointer_plus (base, init));
3074
3075 if (dump_enabled_p ())
3076 {
3077 dump_printf_loc (MSG_NOTE, vect_location,
3078 "analyze in outer-loop: ");
3079 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3080 }
3081
3082 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3083 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3084 gcc_assert (outer_base != NULL_TREE);
3085
3086 if (pbitpos % BITS_PER_UNIT != 0)
3087 {
3088 if (dump_enabled_p ())
3089 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3090 "failed: bit offset alignment.\n");
3091 return false;
3092 }
3093
3094 outer_base = build_fold_addr_expr (outer_base);
3095 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3096 &base_iv, false))
3097 {
3098 if (dump_enabled_p ())
3099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3100 "failed: evolution of base is not affine.\n");
3101 return false;
3102 }
3103
3104 if (offset)
3105 {
3106 if (poffset)
3107 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3108 poffset);
3109 else
3110 poffset = offset;
3111 }
3112
3113 if (!poffset)
3114 {
3115 offset_iv.base = ssize_int (0);
3116 offset_iv.step = ssize_int (0);
3117 }
3118 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3119 &offset_iv, false))
3120 {
3121 if (dump_enabled_p ())
3122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3123 "evolution of offset is not affine.\n");
3124 return false;
3125 }
3126
3127 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3128 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3129 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3130 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3131 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3132
3133 outer_step = size_binop (PLUS_EXPR,
3134 fold_convert (ssizetype, base_iv.step),
3135 fold_convert (ssizetype, offset_iv.step));
3136
3137 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3138 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3139 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3140 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3141 STMT_VINFO_DR_OFFSET (stmt_info) =
3142 fold_convert (ssizetype, offset_iv.base);
3143 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3144 size_int (highest_pow2_factor (offset_iv.base));
3145
3146 if (dump_enabled_p ())
3147 {
3148 dump_printf_loc (MSG_NOTE, vect_location,
3149 "\touter base_address: ");
3150 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3151 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3152 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3153 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3154 STMT_VINFO_DR_OFFSET (stmt_info));
3155 dump_printf (MSG_NOTE,
3156 "\n\touter constant offset from base address: ");
3157 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3158 STMT_VINFO_DR_INIT (stmt_info));
3159 dump_printf (MSG_NOTE, "\n\touter step: ");
3160 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3161 STMT_VINFO_DR_STEP (stmt_info));
3162 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3163 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3164 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3165 }
3166 }
3167
3168 if (STMT_VINFO_DATA_REF (stmt_info))
3169 {
3170 if (dump_enabled_p ())
3171 {
3172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3173 "not vectorized: more than one data ref "
3174 "in stmt: ");
3175 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3176 }
3177
3178 if (bb_vinfo)
3179 break;
3180
3181 if (gather)
3182 free_data_ref (dr);
3183 return false;
3184 }
3185
3186 STMT_VINFO_DATA_REF (stmt_info) = dr;
3187
3188 /* Set vectype for STMT. */
3189 scalar_type = TREE_TYPE (DR_REF (dr));
3190 STMT_VINFO_VECTYPE (stmt_info) =
3191 get_vectype_for_scalar_type (scalar_type);
3192 if (!STMT_VINFO_VECTYPE (stmt_info))
3193 {
3194 if (dump_enabled_p ())
3195 {
3196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3197 "not vectorized: no vectype for stmt: ");
3198 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3199 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3200 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3201 scalar_type);
3202 }
3203
3204 if (bb_vinfo)
3205 break;
3206
3207 if (gather)
3208 {
3209 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3210 free_data_ref (dr);
3211 }
3212 return false;
3213 }
3214
3215 /* Adjust the minimal vectorization factor according to the
3216 vector type. */
3217 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3218 if (vf > *min_vf)
3219 *min_vf = vf;
3220
3221 if (gather)
3222 {
3223 tree off;
3224
3225 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3226 if (gather
3227 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3228 gather = false;
3229 if (!gather)
3230 {
3231 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3232 free_data_ref (dr);
3233 if (dump_enabled_p ())
3234 {
3235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3236 "not vectorized: not suitable for gather "
3237 "load ");
3238 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3239 }
3240 return false;
3241 }
3242
3243 datarefs[i] = dr;
3244 STMT_VINFO_GATHER_P (stmt_info) = true;
3245 }
3246 else if (loop_vinfo
3247 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3248 {
3249 bool strided_load = false;
3250 if (!nested_in_vect_loop_p (loop, stmt))
3251 strided_load = vect_check_strided_load (stmt, loop_vinfo);
3252 if (!strided_load)
3253 {
3254 if (dump_enabled_p ())
3255 {
3256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3257 "not vectorized: not suitable for strided "
3258 "load ");
3259 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3260 }
3261 return false;
3262 }
3263 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3264 }
3265 }
3266
3267 /* If, when trying to vectorize a basic-block, we stopped the analysis at
3268 the first dataref we could not analyze, mark the rest of the datarefs
3269 as not vectorizable and truncate the vector of datarefs. That
3270 avoids spending useless time analyzing their dependences. */
3271 if (i != datarefs.length ())
3272 {
3273 gcc_assert (bb_vinfo != NULL);
3274 for (unsigned j = i; j < datarefs.length (); ++j)
3275 {
3276 data_reference_p dr = datarefs[j];
3277 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
3278 free_data_ref (dr);
3279 }
3280 datarefs.truncate (i);
3281 }
3282
3283 return true;
3284 }
3285
3286
3287 /* Function vect_get_new_vect_var.
3288
3289 Returns a new temporary variable. The current naming scheme prepends the
3290 prefix "vect_", "vect_p", or "stmp_" (depending on the value of VAR_KIND)
3291 to NAME when NAME is provided; otherwise the bare prefix is used as the
3292 variable name. */
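/* E.g. (illustrative): vect_get_new_vect_var (ptr_type, vect_pointer_var,
   "a") creates a temporary named along the lines of "vect_pa" (plus the
   usual uniquifying suffix), while a NULL NAME yields just the bare
   prefix.  */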
3293
3294 tree
3295 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3296 {
3297 const char *prefix;
3298 tree new_vect_var;
3299
3300 switch (var_kind)
3301 {
3302 case vect_simple_var:
3303 prefix = "vect_";
3304 break;
3305 case vect_scalar_var:
3306 prefix = "stmp_";
3307 break;
3308 case vect_pointer_var:
3309 prefix = "vect_p";
3310 break;
3311 default:
3312 gcc_unreachable ();
3313 }
3314
3315 if (name)
3316 {
3317 char* tmp = concat (prefix, name, NULL);
3318 new_vect_var = create_tmp_reg (type, tmp);
3319 free (tmp);
3320 }
3321 else
3322 new_vect_var = create_tmp_reg (type, prefix);
3323
3324 return new_vect_var;
3325 }
3326
3327
3328 /* Function vect_create_addr_base_for_vector_ref.
3329
3330 Create an expression that computes the address of the first memory location
3331 that will be accessed for a data reference.
3332
3333 Input:
3334 STMT: The statement containing the data reference.
3335 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3336 OFFSET: Optional. If supplied, it is added to the initial address.
3337 LOOP: Specify relative to which loop-nest should the address be computed.
3338 For example, when the dataref is in an inner-loop nested in an
3339 outer-loop that is now being vectorized, LOOP can be either the
3340 outer-loop, or the inner-loop. The first memory location accessed
3341 by the following dataref ('in' points to short):
3342
3343 for (i=0; i<N; i++)
3344 for (j=0; j<M; j++)
3345 s += in[i+j]
3346
3347 is as follows:
3348 if LOOP=i_loop: &in (relative to i_loop)
3349 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3350
3351 Output:
3352 1. Return an SSA_NAME whose value is the address of the memory location of
3353 the first vector of the data reference.
3354 2. If new_stmt_list is not NULL_TREE after return then the caller must insert
3355 these statement(s) which define the returned SSA_NAME.
3356
3357 FORNOW: We are only handling array accesses with step 1. */
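/* Illustrative sketch of the result in the loop case: for a dataref on
   a[i] with DR_BASE_ADDRESS == &a, DR_OFFSET == 0, DR_INIT == 0 and
   OFFSET == 4, the returned SSA_NAME holds roughly
   (vectype *) (&a + 4 * sizeof (a[0])), with the computation emitted
   into NEW_STMT_LIST.  */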
3358
3359 tree
3360 vect_create_addr_base_for_vector_ref (gimple stmt,
3361 gimple_seq *new_stmt_list,
3362 tree offset,
3363 struct loop *loop)
3364 {
3365 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3366 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3367 tree data_ref_base;
3368 const char *base_name;
3369 tree addr_base;
3370 tree dest;
3371 gimple_seq seq = NULL;
3372 tree base_offset;
3373 tree init;
3374 tree vect_ptr_type;
3375 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3376 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3377
3378 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3379 {
3380 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3381
3382 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3383
3384 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3385 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3386 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3387 }
3388 else
3389 {
3390 data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3391 base_offset = unshare_expr (DR_OFFSET (dr));
3392 init = unshare_expr (DR_INIT (dr));
3393 }
3394
3395 if (loop_vinfo)
3396 base_name = get_name (data_ref_base);
3397 else
3398 {
3399 base_offset = ssize_int (0);
3400 init = ssize_int (0);
3401 base_name = get_name (DR_REF (dr));
3402 }
3403
3404 /* Create base_offset */
3405 base_offset = size_binop (PLUS_EXPR,
3406 fold_convert (sizetype, base_offset),
3407 fold_convert (sizetype, init));
3408
3409 if (offset)
3410 {
3411 offset = fold_build2 (MULT_EXPR, sizetype,
3412 fold_convert (sizetype, offset), step);
3413 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3414 base_offset, offset);
3415 }
3416
3417 /* base + base_offset */
3418 if (loop_vinfo)
3419 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3420 else
3421 {
3422 addr_base = build1 (ADDR_EXPR,
3423 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3424 unshare_expr (DR_REF (dr)));
3425 }
3426
3427 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3428 addr_base = fold_convert (vect_ptr_type, addr_base);
3429 dest = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, base_name);
3430 addr_base = force_gimple_operand (addr_base, &seq, false, dest);
3431 gimple_seq_add_seq (new_stmt_list, seq);
3432
3433 if (DR_PTR_INFO (dr)
3434 && TREE_CODE (addr_base) == SSA_NAME)
3435 {
3436 duplicate_ssa_name_ptr_info (addr_base, DR_PTR_INFO (dr));
3437 if (offset)
3438 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr_base));
3439 }
3440
3441 if (dump_enabled_p ())
3442 {
3443 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3444 dump_generic_expr (MSG_NOTE, TDF_SLIM, addr_base);
3445 }
3446
3447 return addr_base;
3448 }
3449
3450
3451 /* Function vect_create_data_ref_ptr.
3452
3453 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3454 location accessed in the loop by STMT, along with the def-use update
3455 chain to appropriately advance the pointer through the loop iterations.
3456 Also set aliasing information for the pointer. This pointer is used by
3457 the callers to this function to create a memory reference expression for
3458 vector load/store access.
3459
3460 Input:
3461 1. STMT: a stmt that references memory. Expected to be of the form
3462 GIMPLE_ASSIGN <name, data-ref> or
3463 GIMPLE_ASSIGN <data-ref, name>.
3464 2. AGGR_TYPE: the type of the reference, which should be either a vector
3465 or an array.
3466 3. AT_LOOP: the loop where the vector memref is to be created.
3467 4. OFFSET (optional): an offset to be added to the initial address accessed
3468 by the data-ref in STMT.
3469 5. GSI: location where the new stmts are to be placed if there is no loop.
3470 6. ONLY_INIT: indicate whether ap is to be updated in the loop, or remain
3471 pointing to the initial address.
3472
3473 Output:
3474 1. Declare a new ptr to vector_type, and have it point to the base of the
3475 data reference (the initial address accessed by the data reference).
3476 For example, for vector of type V8HI, the following code is generated:
3477
3478 v8hi *ap;
3479 ap = (v8hi *)initial_address;
3480
3481 if OFFSET is not supplied:
3482 initial_address = &a[init];
3483 if OFFSET is supplied:
3484 initial_address = &a[init + OFFSET];
3485
3486 Return the initial_address in INITIAL_ADDRESS.
3487
3488 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3489 update the pointer in each iteration of the loop.
3490
3491 Return the increment stmt that updates the pointer in PTR_INCR.
3492
3493 3. Set INV_P to true if the access pattern of the data reference in the
3494 vectorized loop is invariant. Set it to false otherwise.
3495
3496 4. Return the pointer. */
3497
3498 tree
3499 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3500 tree offset, tree *initial_address,
3501 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3502 bool only_init, bool *inv_p)
3503 {
3504 const char *base_name;
3505 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3506 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3507 struct loop *loop = NULL;
3508 bool nested_in_vect_loop = false;
3509 struct loop *containing_loop = NULL;
3510 tree aggr_ptr_type;
3511 tree aggr_ptr;
3512 tree new_temp;
3513 gimple vec_stmt;
3514 gimple_seq new_stmt_list = NULL;
3515 edge pe = NULL;
3516 basic_block new_bb;
3517 tree aggr_ptr_init;
3518 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3519 tree aptr;
3520 gimple_stmt_iterator incr_gsi;
3521 bool insert_after;
3522 bool negative;
3523 tree indx_before_incr, indx_after_incr;
3524 gimple incr;
3525 tree step;
3526 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3527
3528 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3529 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3530
3531 if (loop_vinfo)
3532 {
3533 loop = LOOP_VINFO_LOOP (loop_vinfo);
3534 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3535 containing_loop = (gimple_bb (stmt))->loop_father;
3536 pe = loop_preheader_edge (loop);
3537 }
3538 else
3539 {
3540 gcc_assert (bb_vinfo);
3541 only_init = true;
3542 *ptr_incr = NULL;
3543 }
3544
3545 /* Check the step (evolution) of the load in LOOP, and record
3546 whether it's invariant. */
3547 if (nested_in_vect_loop)
3548 step = STMT_VINFO_DR_STEP (stmt_info);
3549 else
3550 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
3551
3552 if (tree_int_cst_compare (step, size_zero_node) == 0)
3553 *inv_p = true;
3554 else
3555 *inv_p = false;
3556 negative = tree_int_cst_compare (step, size_zero_node) < 0;
3557
3558 /* Create an expression for the first address accessed by this load
3559 in LOOP. */
3560 base_name = get_name (DR_BASE_ADDRESS (dr));
3561
3562 if (dump_enabled_p ())
3563 {
3564 tree dr_base_type = TREE_TYPE (DR_BASE_OBJECT (dr));
3565 dump_printf_loc (MSG_NOTE, vect_location,
3566 "create %s-pointer variable to type: ",
3567 tree_code_name[(int) TREE_CODE (aggr_type)]);
3568 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
3569 if (TREE_CODE (dr_base_type) == ARRAY_TYPE)
3570 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
3571 else if (TREE_CODE (dr_base_type) == RECORD_TYPE)
3572 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
3573 else
3574 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
3575 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_BASE_OBJECT (dr));
3576 }
3577
3578 /* (1) Create the new aggregate-pointer variable.
3579 Vector and array types inherit the alias set of their component
3580 type by default, so we need to use a ref-all pointer if the data
3581 reference does not conflict with the created aggregate data
3582 reference because it is not addressable. */
3583 bool need_ref_all = false;
3584 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3585 get_alias_set (DR_REF (dr))))
3586 need_ref_all = true;
3587 /* Likewise for any of the data references in the stmt group. */
3588 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
3589 {
3590 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
3591 do
3592 {
3593 stmt_vec_info sinfo = vinfo_for_stmt (orig_stmt);
3594 struct data_reference *sdr = STMT_VINFO_DATA_REF (sinfo);
3595 if (!alias_sets_conflict_p (get_alias_set (aggr_type),
3596 get_alias_set (DR_REF (sdr))))
3597 {
3598 need_ref_all = true;
3599 break;
3600 }
3601 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (sinfo);
3602 }
3603 while (orig_stmt);
3604 }
3605 aggr_ptr_type = build_pointer_type_for_mode (aggr_type, ptr_mode,
3606 need_ref_all);
3607 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var, base_name);
3608
3609
3610 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
3611 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
3612 def-use update cycles for the pointer: one relative to the outer-loop
3613 (LOOP), which is what steps (3) and (4) below do. The other is relative
3614 to the inner-loop (which is the inner-most loop containing the dataref),
3615 and this is done by step (5) below.
3616
3617 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
3618 inner-most loop, and so steps (3),(4) work the same, and step (5) is
3619 redundant. Steps (3),(4) create the following:
3620
3621 vp0 = &base_addr;
3622 LOOP: vp1 = phi(vp0,vp2)
3623 ...
3624 ...
3625 vp2 = vp1 + step
3626 goto LOOP
3627
3628 If there is an inner-loop nested in LOOP, then step (5) will also be
3629 applied, and an additional update in the inner-loop will be created:
3630
3631 vp0 = &base_addr;
3632 LOOP: vp1 = phi(vp0,vp2)
3633 ...
3634 inner: vp3 = phi(vp1,vp4)
3635 vp4 = vp3 + inner_step
3636 if () goto inner
3637 ...
3638 vp2 = vp1 + step
3639 if () goto LOOP */
3640
3641 /* (2) Calculate the initial address of the aggregate-pointer, and set
3642 the aggregate-pointer to point to it before the loop. */
3643
3644 /* Create: &(base[init_val+offset]) in the loop preheader. */
3645
3646 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
3647 offset, loop);
3648 if (new_stmt_list)
3649 {
3650 if (pe)
3651 {
3652 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
3653 gcc_assert (!new_bb);
3654 }
3655 else
3656 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
3657 }
3658
3659 *initial_address = new_temp;
3660
3661 /* Create: p = (aggr_type *) initial_base */
3662 if (TREE_CODE (new_temp) != SSA_NAME
3663 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
3664 {
3665 vec_stmt = gimple_build_assign (aggr_ptr,
3666 fold_convert (aggr_ptr_type, new_temp));
3667 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
3668 /* Copy the points-to information if it exists. */
3669 if (DR_PTR_INFO (dr))
3670 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
3671 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
3672 if (pe)
3673 {
3674 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
3675 gcc_assert (!new_bb);
3676 }
3677 else
3678 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
3679 }
3680 else
3681 aggr_ptr_init = new_temp;
3682
3683 /* (3) Handle the updating of the aggregate-pointer inside the loop.
3684 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
3685 inner-loop nested in LOOP (during outer-loop vectorization). */
3686
3687 /* No update in loop is required. */
3688 if (only_init && (!loop_vinfo || at_loop == loop))
3689 aptr = aggr_ptr_init;
3690 else
3691 {
3692 /* The step of the aggregate pointer is the type size. */
3693 tree step = TYPE_SIZE_UNIT (aggr_type);
3694 /* One exception to the above is when the scalar step of the load in
3695 LOOP is zero. In this case the step here is also zero. */
3696 if (*inv_p)
3697 step = size_zero_node;
3698 else if (negative)
3699 step = fold_build1 (NEGATE_EXPR, TREE_TYPE (step), step);
3700
3701 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
3702
3703 create_iv (aggr_ptr_init,
3704 fold_convert (aggr_ptr_type, step),
3705 aggr_ptr, loop, &incr_gsi, insert_after,
3706 &indx_before_incr, &indx_after_incr);
3707 incr = gsi_stmt (incr_gsi);
3708 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3709
3710 /* Copy the points-to information if it exists. */
3711 if (DR_PTR_INFO (dr))
3712 {
3713 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3714 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3715 }
3716 if (ptr_incr)
3717 *ptr_incr = incr;
3718
3719 aptr = indx_before_incr;
3720 }
3721
3722 if (!nested_in_vect_loop || only_init)
3723 return aptr;
3724
3725
3726 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
3727 nested in LOOP, if it exists. */
3728
3729 gcc_assert (nested_in_vect_loop);
3730 if (!only_init)
3731 {
3732 standard_iv_increment_position (containing_loop, &incr_gsi,
3733 &insert_after);
3734 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
3735 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
3736 &indx_after_incr);
3737 incr = gsi_stmt (incr_gsi);
3738 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3739
3740 /* Copy the points-to information if it exists. */
3741 if (DR_PTR_INFO (dr))
3742 {
3743 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3744 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3745 }
3746 if (ptr_incr)
3747 *ptr_incr = incr;
3748
3749 return indx_before_incr;
3750 }
3751 else
3752 gcc_unreachable ();
3753 }
3754
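/* Illustrative sketch (not GCC internals): the effect of the pointer
   created above for a simple, non-nested loop can be pictured as the
   following plain C, using the GNU vector extension.  All names here
   (vec8hi, vectorized_copy) are invented for the example; the real code
   emits the equivalent GIMPLE PHI and increment shown in the big comment
   above.

     typedef short vec8hi __attribute__ ((vector_size (16)));

     void
     vectorized_copy (short *a, short *b, int n)
     {
       vec8hi *ap = (vec8hi *) a;   // aggr_ptr_init: points to &a[init]
       vec8hi *bp = (vec8hi *) b;
       for (int i = 0; i < n / 8; i++)
         {
           *bp = *ap;               // vector access through the pointer
           ap++;                    // step == TYPE_SIZE_UNIT (aggr_type)
           bp++;
         }
     }
*/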
3755
3756 /* Function bump_vector_ptr
3757
3758 Increment a pointer (to a vector type) by vector-size. If requested,
3759 i.e. if PTR-INCR is given, then also connect the new increment stmt
3760 to the existing def-use update-chain of the pointer, by modifying
3761 the PTR_INCR as illustrated below:
3762
3763 The pointer def-use update-chain before this function:
3764 DATAREF_PTR = phi (p_0, p_2)
3765 ....
3766 PTR_INCR: p_2 = DATAREF_PTR + step
3767
3768 The pointer def-use update-chain after this function:
3769 DATAREF_PTR = phi (p_0, p_2)
3770 ....
3771 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
3772 ....
3773 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
3774
3775 Input:
3776 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
3777 in the loop.
3778 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
3779 the loop. The increment amount across iterations is expected
3780 to be vector_size.
3781 BSI - location where the new update stmt is to be placed.
3782 STMT - the original scalar memory-access stmt that is being vectorized.
3783 BUMP - optional. The offset by which to bump the pointer. If not given,
3784 the offset is assumed to be vector_size.
3785
3786 Output: Return NEW_DATAREF_PTR as illustrated above.
3787
3788 */
3789
3790 tree
3791 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
3792 gimple stmt, tree bump)
3793 {
3794 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3795 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3796 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3797 tree update = TYPE_SIZE_UNIT (vectype);
3798 gimple incr_stmt;
3799 ssa_op_iter iter;
3800 use_operand_p use_p;
3801 tree new_dataref_ptr;
3802
3803 if (bump)
3804 update = bump;
3805
3806 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
3807 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
3808 dataref_ptr, update);
3809 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
3810
3811 /* Copy the points-to information if it exists. */
3812 if (DR_PTR_INFO (dr))
3813 {
3814 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
3815 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
3816 }
3817
3818 if (!ptr_incr)
3819 return new_dataref_ptr;
3820
3821 /* Update the vector-pointer's cross-iteration increment. */
3822 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
3823 {
3824 tree use = USE_FROM_PTR (use_p);
3825
3826 if (use == dataref_ptr)
3827 SET_USE (use_p, new_dataref_ptr);
3828 else
3829 gcc_assert (tree_int_cst_compare (use, update) == 0);
3830 }
3831
3832 return new_dataref_ptr;
3833 }
3834
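/* A hand-written analogue (not produced by this file) of why a bump pairs
   with the cross-iteration increment: after bump_vector_ptr, the increment
   PTR_INCR is re-expressed on top of the bumped name, so the pointer
   advances by BUMP plus the original step per iteration.  For a group
   needing two vectors per iteration with BUMP == step == vector size,
   that corresponds to the plain C below (vec4si and store_group_of_two
   are invented names).

     typedef int vec4si __attribute__ ((vector_size (16)));

     void
     store_group_of_two (vec4si *p, vec4si v0, vec4si v1, int iters)
     {
       for (int i = 0; i < iters; i++)
         {
           *p = v0;
           p = p + 1;   // NEW_DATAREF_PTR = DATAREF_PTR + BUMP
           *p = v1;
           p = p + 1;   // PTR_INCR now steps from the bumped pointer
         }
     }
*/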
3835
3836 /* Function vect_create_destination_var.
3837
3838 Create a new temporary of type VECTYPE. */
3839
3840 tree
3841 vect_create_destination_var (tree scalar_dest, tree vectype)
3842 {
3843 tree vec_dest;
3844 const char *new_name;
3845 tree type;
3846 enum vect_var_kind kind;
3847
3848 kind = vectype ? vect_simple_var : vect_scalar_var;
3849 type = vectype ? vectype : TREE_TYPE (scalar_dest);
3850
3851 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
3852
3853 new_name = get_name (scalar_dest);
3854 if (!new_name)
3855 new_name = "var_";
3856 vec_dest = vect_get_new_vect_var (type, kind, new_name);
3857
3858 return vec_dest;
3859 }
3860
3861 /* Function vect_grouped_store_supported.
3862
3863 Returns TRUE if interleave high and interleave low permutations
3864 are supported, and FALSE otherwise. */
3865
3866 bool
3867 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
3868 {
3869 enum machine_mode mode = TYPE_MODE (vectype);
3870
3871 /* vect_permute_store_chain requires the group size to be a power of two. */
3872 if (exact_log2 (count) == -1)
3873 {
3874 if (dump_enabled_p ())
3875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3876 "the size of the group of accesses"
3877 " is not a power of 2");
3878 return false;
3879 }
3880
3881 /* Check that the permutation is supported. */
3882 if (VECTOR_MODE_P (mode))
3883 {
3884 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3885 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3886 for (i = 0; i < nelt / 2; i++)
3887 {
3888 sel[i * 2] = i;
3889 sel[i * 2 + 1] = i + nelt;
3890 }
3891 if (can_vec_perm_p (mode, false, sel))
3892 {
3893 for (i = 0; i < nelt; i++)
3894 sel[i] += nelt / 2;
3895 if (can_vec_perm_p (mode, false, sel))
3896 return true;
3897 }
3898 }
3899
3900 if (dump_enabled_p ())
3901 dump_printf (MSG_MISSED_OPTIMIZATION,
3902 "interleave op not supported by target.");
3903 return false;
3904 }
3905
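/* For reference, a standalone sketch (plain C, no GCC APIs) of the two
   selectors whose support is queried above, for an 8-element vector.
   Compiling and running it prints the interleave-high mask followed by
   the interleave-low mask.

     #include <stdio.h>

     int
     main (void)
     {
       unsigned nelt = 8, sel[8];

       for (unsigned i = 0; i < nelt / 2; i++)   // interleave-high selector
         {
           sel[i * 2] = i;
           sel[i * 2 + 1] = i + nelt;
         }
       for (unsigned i = 0; i < nelt; i++)
         printf ("%u ", sel[i]);                 // 0 8 1 9 2 10 3 11
       printf ("\n");

       for (unsigned i = 0; i < nelt; i++)       // shift to the low selector
         sel[i] += nelt / 2;
       for (unsigned i = 0; i < nelt; i++)
         printf ("%u ", sel[i]);                 // 4 12 5 13 6 14 7 15
       printf ("\n");
       return 0;
     }
*/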
3906
3907 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
3908 type VECTYPE. */
3909
3910 bool
3911 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
3912 {
3913 return vect_lanes_optab_supported_p ("vec_store_lanes",
3914 vec_store_lanes_optab,
3915 vectype, count);
3916 }
3917
3918
3919 /* Function vect_permute_store_chain.
3920
3921 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
3922 a power of 2, generate interleave_high/low stmts to reorder the data
3923 correctly for the stores. Return the final references for stores in
3924 RESULT_CHAIN.
3925
3926 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
3927 The input is 4 vectors each containing 8 elements. We assign a number to
3928 each element; the input sequence is:
3929
3930 1st vec: 0 1 2 3 4 5 6 7
3931 2nd vec: 8 9 10 11 12 13 14 15
3932 3rd vec: 16 17 18 19 20 21 22 23
3933 4th vec: 24 25 26 27 28 29 30 31
3934
3935 The output sequence should be:
3936
3937 1st vec: 0 8 16 24 1 9 17 25
3938 2nd vec: 2 10 18 26 3 11 19 27
3939 3rd vec: 4 12 20 28 5 13 21 29
3940 4th vec: 6 14 22 30 7 15 23 31
3941
3942 i.e., we interleave the contents of the four vectors in their order.
3943
3944 We use interleave_high/low instructions to create such output. The input of
3945 each interleave_high/low operation is two vectors:
3946 1st vec 2nd vec
3947 0 1 2 3 4 5 6 7
3948 the even elements of the result vector are obtained left-to-right from the
3949 high/low elements of the first vector. The odd elements of the result are
3950 obtained left-to-right from the high/low elements of the second vector.
3951 The output of interleave_high will be: 0 4 1 5
3952 and of interleave_low: 2 6 3 7
3953
3954
3955 The permutation is done in log LENGTH stages. In each stage interleave_high
3956 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
3957 where the first argument is taken from the first half of DR_CHAIN and the
3958 second argument from its second half.
3959 In our example,
3960
3961 I1: interleave_high (1st vec, 3rd vec)
3962 I2: interleave_low (1st vec, 3rd vec)
3963 I3: interleave_high (2nd vec, 4th vec)
3964 I4: interleave_low (2nd vec, 4th vec)
3965
3966 The output for the first stage is:
3967
3968 I1: 0 16 1 17 2 18 3 19
3969 I2: 4 20 5 21 6 22 7 23
3970 I3: 8 24 9 25 10 26 11 27
3971 I4: 12 28 13 29 14 30 15 31
3972
3973 The output of the second stage, i.e. the final result is:
3974
3975 I1: 0 8 16 24 1 9 17 25
3976 I2: 2 10 18 26 3 11 19 27
3977 I3: 4 12 20 28 5 13 21 29
3978 I4: 6 14 22 30 7 15 23 31. */
3979
3980 void
3981 vect_permute_store_chain (vec<tree> dr_chain,
3982 unsigned int length,
3983 gimple stmt,
3984 gimple_stmt_iterator *gsi,
3985 vec<tree> *result_chain)
3986 {
3987 tree vect1, vect2, high, low;
3988 gimple perm_stmt;
3989 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
3990 tree perm_mask_low, perm_mask_high;
3991 unsigned int i, n;
3992 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
3993 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3994
3995 result_chain->quick_grow (length);
3996 memcpy (result_chain->address (), dr_chain.address (),
3997 length * sizeof (tree));
3998
3999 for (i = 0, n = nelt / 2; i < n; i++)
4000 {
4001 sel[i * 2] = i;
4002 sel[i * 2 + 1] = i + nelt;
4003 }
4004 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4005 gcc_assert (perm_mask_high != NULL);
4006
4007 for (i = 0; i < nelt; i++)
4008 sel[i] += nelt / 2;
4009 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4010 gcc_assert (perm_mask_low != NULL);
4011
4012 for (i = 0, n = exact_log2 (length); i < n; i++)
4013 {
4014 for (j = 0; j < length/2; j++)
4015 {
4016 vect1 = dr_chain[j];
4017 vect2 = dr_chain[j+length/2];
4018
4019 /* Create interleaving stmt:
4020 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4021 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4022 perm_stmt
4023 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4024 vect1, vect2, perm_mask_high);
4025 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4026 (*result_chain)[2*j] = high;
4027
4028 /* Create interleaving stmt:
4029 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4030 nelt*3/2+1, ...}> */
4031 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4032 perm_stmt
4033 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4034 vect1, vect2, perm_mask_low);
4035 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4036 (*result_chain)[2*j+1] = low;
4037 }
4038 memcpy (dr_chain.address (), result_chain->address (),
4039 length * sizeof (tree));
4040 }
4041 }
4042
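/* A self-contained simulation (plain C, invented helper names, not part of
   the vectorizer) of the interleave stages implemented above.  Running it
   reproduces the 4x8 example from the comment, printing the four output
   vectors 0 8 16 24 ... through 6 14 22 30 7 15 23 31.

     #include <stdio.h>
     #include <string.h>

     #define NELT 8
     #define LEN  4

     // One VEC_PERM_EXPR over the concatenation of A and B.
     static void
     vec_perm (const int *a, const int *b, const unsigned *sel, int *out)
     {
       for (unsigned i = 0; i < NELT; i++)
         out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
     }

     int
     main (void)
     {
       int chain[LEN][NELT], result[LEN][NELT];
       unsigned sel_hi[NELT], sel_lo[NELT];

       for (int v = 0; v < LEN; v++)            // input vectors 0..31
         for (int e = 0; e < NELT; e++)
           chain[v][e] = v * NELT + e;

       for (unsigned i = 0; i < NELT / 2; i++)  // interleave-high selector
         {
           sel_hi[i * 2] = i;
           sel_hi[i * 2 + 1] = i + NELT;
         }
       for (unsigned i = 0; i < NELT; i++)      // interleave-low selector
         sel_lo[i] = sel_hi[i] + NELT / 2;

       for (int stage = 0; stage < 2; stage++)  // log2 (LEN) stages
         {
           for (int j = 0; j < LEN / 2; j++)
             {
               vec_perm (chain[j], chain[j + LEN / 2], sel_hi, result[2 * j]);
               vec_perm (chain[j], chain[j + LEN / 2], sel_lo,
                         result[2 * j + 1]);
             }
           memcpy (chain, result, sizeof chain);
         }

       for (int v = 0; v < LEN; v++, printf ("\n"))
         for (int e = 0; e < NELT; e++)
           printf ("%2d ", chain[v][e]);
       return 0;
     }
*/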
4043 /* Function vect_setup_realignment
4044
4045 This function is called when vectorizing an unaligned load using
4046 the dr_explicit_realign[_optimized] scheme.
4047 This function generates the following code at the loop prolog:
4048
4049 p = initial_addr;
4050 x msq_init = *(floor(p)); # prolog load
4051 realignment_token = call target_builtin;
4052 loop:
4053 x msq = phi (msq_init, ---)
4054
4055 The stmts marked with x are generated only for the case of
4056 dr_explicit_realign_optimized.
4057
4058 The code above sets up a new (vector) pointer, pointing to the first
4059 location accessed by STMT, and a "floor-aligned" load using that pointer.
4060 It also generates code to compute the "realignment-token" (if the relevant
4061 target hook was defined), and creates a phi-node at the loop-header bb
4062 whose arguments are the result of the prolog-load (created by this
4063 function) and the result of a load that takes place in the loop (to be
4064 created by the caller to this function).
4065
4066 For the case of dr_explicit_realign_optimized:
4067 The caller to this function uses the phi-result (msq) to create the
4068 realignment code inside the loop, and sets up the missing phi argument,
4069 as follows:
4070 loop:
4071 msq = phi (msq_init, lsq)
4072 lsq = *(floor(p')); # load in loop
4073 result = realign_load (msq, lsq, realignment_token);
4074
4075 For the case of dr_explicit_realign:
4076 loop:
4077 msq = *(floor(p)); # load in loop
4078 p' = p + (VS-1);
4079 lsq = *(floor(p')); # load in loop
4080 result = realign_load (msq, lsq, realignment_token);
4081
4082 Input:
4083 STMT - (scalar) load stmt to be vectorized. This load accesses
4084 a memory location that may be unaligned.
4085 BSI - place where new code is to be inserted.
4086 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4087 is used.
4088
4089 Output:
4090 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4091 target hook, if defined.
4092 Return value - the result of the loop-header phi node. */
4093
4094 tree
4095 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4096 tree *realignment_token,
4097 enum dr_alignment_support alignment_support_scheme,
4098 tree init_addr,
4099 struct loop **at_loop)
4100 {
4101 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4102 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4103 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4104 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4105 struct loop *loop = NULL;
4106 edge pe = NULL;
4107 tree scalar_dest = gimple_assign_lhs (stmt);
4108 tree vec_dest;
4109 gimple inc;
4110 tree ptr;
4111 tree data_ref;
4112 gimple new_stmt;
4113 basic_block new_bb;
4114 tree msq_init = NULL_TREE;
4115 tree new_temp;
4116 gimple phi_stmt;
4117 tree msq = NULL_TREE;
4118 gimple_seq stmts = NULL;
4119 bool inv_p;
4120 bool compute_in_loop = false;
4121 bool nested_in_vect_loop = false;
4122 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4123 struct loop *loop_for_initial_load = NULL;
4124
4125 if (loop_vinfo)
4126 {
4127 loop = LOOP_VINFO_LOOP (loop_vinfo);
4128 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4129 }
4130
4131 gcc_assert (alignment_support_scheme == dr_explicit_realign
4132 || alignment_support_scheme == dr_explicit_realign_optimized);
4133
4134 /* We need to generate three things:
4135 1. the misalignment computation
4136 2. the extra vector load (for the optimized realignment scheme).
4137 3. the phi node for the two vectors from which the realignment is
4138 done (for the optimized realignment scheme). */
4139
4140 /* 1. Determine where to generate the misalignment computation.
4141
4142 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4143 calculation will be generated by this function, outside the loop (in the
4144 preheader). Otherwise, INIT_ADDR has already been computed for us by the
4145 caller, inside the loop.
4146
4147 Background: If the misalignment remains fixed throughout the iterations of
4148 the loop, then both realignment schemes are applicable, and also the
4149 misalignment computation can be done outside LOOP. This is because we are
4150 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4151 are a multiple of VS (the Vector Size), and therefore the misalignment in
4152 different vectorized LOOP iterations is always the same.
4153 The problem arises only if the memory access is in an inner-loop nested
4154 inside LOOP, which is now being vectorized using outer-loop vectorization.
4155 This is the only case when the misalignment of the memory access may not
4156 remain fixed throughout the iterations of the inner-loop (as explained in
4157 detail in vect_supportable_dr_alignment). In this case, not only is the
4158 optimized realignment scheme not applicable, but also the misalignment
4159 computation (and generation of the realignment token that is passed to
4160 REALIGN_LOAD) have to be done inside the loop.
4161
4162 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4163 or not, which in turn determines if the misalignment is computed inside
4164 the inner-loop, or outside LOOP. */
4165
4166 if (init_addr != NULL_TREE || !loop_vinfo)
4167 {
4168 compute_in_loop = true;
4169 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4170 }
4171
4172
4173 /* 2. Determine where to generate the extra vector load.
4174
4175 For the optimized realignment scheme, instead of generating two vector
4176 loads in each iteration, we generate a single extra vector load in the
4177 preheader of the loop, and in each iteration reuse the result of the
4178 vector load from the previous iteration. In case the memory access is in
4179 an inner-loop nested inside LOOP, which is now being vectorized using
4180 outer-loop vectorization, we need to determine whether this initial vector
4181 load should be generated at the preheader of the inner-loop, or can be
4182 generated at the preheader of LOOP. If the memory access has no evolution
4183 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4184 to be generated inside LOOP (in the preheader of the inner-loop). */
4185
4186 if (nested_in_vect_loop)
4187 {
4188 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4189 bool invariant_in_outerloop =
4190 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4191 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4192 }
4193 else
4194 loop_for_initial_load = loop;
4195 if (at_loop)
4196 *at_loop = loop_for_initial_load;
4197
4198 if (loop_for_initial_load)
4199 pe = loop_preheader_edge (loop_for_initial_load);
4200
4201 /* 3. For the case of the optimized realignment, create the first vector
4202 load at the loop preheader. */
4203
4204 if (alignment_support_scheme == dr_explicit_realign_optimized)
4205 {
4206 /* Create msq_init = *(floor(p1)) in the loop preheader */
4207
4208 gcc_assert (!compute_in_loop);
4209 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4210 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4211 NULL_TREE, &init_addr, NULL, &inc,
4212 true, &inv_p);
4213 new_temp = copy_ssa_name (ptr, NULL);
4214 new_stmt = gimple_build_assign_with_ops
4215 (BIT_AND_EXPR, new_temp, ptr,
4216 build_int_cst (TREE_TYPE (ptr),
4217 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4218 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4219 gcc_assert (!new_bb);
4220 data_ref
4221 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4222 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4223 new_stmt = gimple_build_assign (vec_dest, data_ref);
4224 new_temp = make_ssa_name (vec_dest, new_stmt);
4225 gimple_assign_set_lhs (new_stmt, new_temp);
4226 if (pe)
4227 {
4228 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4229 gcc_assert (!new_bb);
4230 }
4231 else
4232 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4233
4234 msq_init = gimple_assign_lhs (new_stmt);
4235 }
4236
4237 /* 4. Create realignment token using a target builtin, if available.
4238 It is done either inside the containing loop, or before LOOP (as
4239 determined above). */
4240
4241 if (targetm.vectorize.builtin_mask_for_load)
4242 {
4243 tree builtin_decl;
4244
4245 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4246 if (!init_addr)
4247 {
4248 /* Generate the INIT_ADDR computation outside LOOP. */
4249 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4250 NULL_TREE, loop);
4251 if (loop)
4252 {
4253 pe = loop_preheader_edge (loop);
4254 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4255 gcc_assert (!new_bb);
4256 }
4257 else
4258 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4259 }
4260
4261 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4262 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4263 vec_dest =
4264 vect_create_destination_var (scalar_dest,
4265 gimple_call_return_type (new_stmt));
4266 new_temp = make_ssa_name (vec_dest, new_stmt);
4267 gimple_call_set_lhs (new_stmt, new_temp);
4268
4269 if (compute_in_loop)
4270 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4271 else
4272 {
4273 /* Generate the misalignment computation outside LOOP. */
4274 pe = loop_preheader_edge (loop);
4275 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4276 gcc_assert (!new_bb);
4277 }
4278
4279 *realignment_token = gimple_call_lhs (new_stmt);
4280
4281 /* The result of the CALL_EXPR to this builtin is determined from
4282 the value of the parameter and no global variables are touched
4283 which makes the builtin a "const" function. Requiring the
4284 builtin to have the "const" attribute makes it unnecessary
4285 to call mark_call_clobbered. */
4286 gcc_assert (TREE_READONLY (builtin_decl));
4287 }
4288
4289 if (alignment_support_scheme == dr_explicit_realign)
4290 return msq;
4291
4292 gcc_assert (!compute_in_loop);
4293 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4294
4295
4296 /* 5. Create msq = phi <msq_init, lsq> in loop */
4297
4298 pe = loop_preheader_edge (containing_loop);
4299 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4300 msq = make_ssa_name (vec_dest, NULL);
4301 phi_stmt = create_phi_node (msq, containing_loop->header);
4302 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4303
4304 return msq;
4305 }
4306
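/* A target-independent model (plain C, invented names) of what the
   REALIGN_LOAD produced by the callers of this function computes: the
   unaligned vector is assembled from two aligned loads, MSQ and LSQ,
   according to the misalignment.  Real targets do this with a byte
   shuffle driven by the mask from builtin_mask_for_load; the element-wise
   selection below is only a conceptual sketch.

     #include <stdio.h>

     #define VS 4                         // elements per vector

     static void
     realign_load (const int *msq, const int *lsq, int token, int *out)
     {
       // Take VS consecutive elements starting at offset TOKEN from the
       // concatenation of the two aligned vectors.
       for (int i = 0; i < VS; i++)
         out[i] = token + i < VS ? msq[token + i] : lsq[token + i - VS];
     }

     int
     main (void)
     {
       int mem[12], msq[VS], lsq[VS], va[VS];
       int token = 1;                     // misalignment, in elements

       for (int i = 0; i < 12; i++)
         mem[i] = i;

       for (int i = 0; i < VS; i++)
         {
           msq[i] = mem[i];               // msq = *(floor (p))
           lsq[i] = mem[VS + i];          // lsq = *(floor (p + VS - 1))
         }
       realign_load (msq, lsq, token, va);

       for (int i = 0; i < VS; i++)
         printf ("%d ", va[i]);           // prints 1 2 3 4, i.e. p[0..3]
       printf ("\n");
       return 0;
     }
*/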
4307
4308 /* Function vect_grouped_load_supported.
4309
4310 Returns TRUE if even and odd permutations are supported,
4311 and FALSE otherwise. */
4312
4313 bool
4314 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4315 {
4316 enum machine_mode mode = TYPE_MODE (vectype);
4317
4318 /* vect_permute_load_chain requires the group size to be a power of two. */
4319 if (exact_log2 (count) == -1)
4320 {
4321 if (dump_enabled_p ())
4322 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4323 "the size of the group of accesses"
4324 " is not a power of 2");
4325 return false;
4326 }
4327
4328 /* Check that the permutation is supported. */
4329 if (VECTOR_MODE_P (mode))
4330 {
4331 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4332 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4333
4334 for (i = 0; i < nelt; i++)
4335 sel[i] = i * 2;
4336 if (can_vec_perm_p (mode, false, sel))
4337 {
4338 for (i = 0; i < nelt; i++)
4339 sel[i] = i * 2 + 1;
4340 if (can_vec_perm_p (mode, false, sel))
4341 return true;
4342 }
4343 }
4344
4345 if (dump_enabled_p ())
4346 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4347 "extract even/odd not supported by target");
4348 return false;
4349 }
4350
4351 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4352 type VECTYPE. */
4353
4354 bool
4355 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4356 {
4357 return vect_lanes_optab_supported_p ("vec_load_lanes",
4358 vec_load_lanes_optab,
4359 vectype, count);
4360 }
4361
4362 /* Function vect_permute_load_chain.
4363
4364 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4365 a power of 2, generate extract_even/odd stmts to reorder the input data
4366 correctly. Return the final references for loads in RESULT_CHAIN.
4367
4368 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4369 The input is 4 vectors each containing 8 elements. We assign a number to each
4370 element; the input sequence is:
4371
4372 1st vec: 0 1 2 3 4 5 6 7
4373 2nd vec: 8 9 10 11 12 13 14 15
4374 3rd vec: 16 17 18 19 20 21 22 23
4375 4th vec: 24 25 26 27 28 29 30 31
4376
4377 The output sequence should be:
4378
4379 1st vec: 0 4 8 12 16 20 24 28
4380 2nd vec: 1 5 9 13 17 21 25 29
4381 3rd vec: 2 6 10 14 18 22 26 30
4382 4th vec: 3 7 11 15 19 23 27 31
4383
4384 i.e., the first output vector should contain the first elements of each
4385 interleaving group, etc.
4386
4387 We use extract_even/odd instructions to create such output. The input of
4388 each extract_even/odd operation is two vectors
4389 1st vec 2nd vec
4390 0 1 2 3 4 5 6 7
4391
4392 and the output is the vector of extracted even/odd elements. The output of
4393 extract_even will be: 0 2 4 6
4394 and of extract_odd: 1 3 5 7
4395
4396
4397 The permutation is done in log LENGTH stages. In each stage extract_even
4398 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4399 their order. In our example,
4400
4401 E1: extract_even (1st vec, 2nd vec)
4402 E2: extract_odd (1st vec, 2nd vec)
4403 E3: extract_even (3rd vec, 4th vec)
4404 E4: extract_odd (3rd vec, 4th vec)
4405
4406 The output for the first stage will be:
4407
4408 E1: 0 2 4 6 8 10 12 14
4409 E2: 1 3 5 7 9 11 13 15
4410 E3: 16 18 20 22 24 26 28 30
4411 E4: 17 19 21 23 25 27 29 31
4412
4413 In order to proceed and create the correct sequence for the next stage (or
4414 for the correct output, if the second stage is the last one, as in our
4415 example), we first put the output of the extract_even operation and then the
4416 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4417 The input for the second stage is:
4418
4419 1st vec (E1): 0 2 4 6 8 10 12 14
4420 2nd vec (E3): 16 18 20 22 24 26 28 30
4421 3rd vec (E2): 1 3 5 7 9 11 13 15
4422 4th vec (E4): 17 19 21 23 25 27 29 31
4423
4424 The output of the second stage:
4425
4426 E1: 0 4 8 12 16 20 24 28
4427 E2: 2 6 10 14 18 22 26 30
4428 E3: 1 5 9 13 17 21 25 29
4429 E4: 3 7 11 15 19 23 27 31
4430
4431 And RESULT_CHAIN after reordering:
4432
4433 1st vec (E1): 0 4 8 12 16 20 24 28
4434 2nd vec (E3): 1 5 9 13 17 21 25 29
4435 3rd vec (E2): 2 6 10 14 18 22 26 30
4436 4th vec (E4): 3 7 11 15 19 23 27 31. */
4437
4438 static void
4439 vect_permute_load_chain (vec<tree> dr_chain,
4440 unsigned int length,
4441 gimple stmt,
4442 gimple_stmt_iterator *gsi,
4443 vec<tree> *result_chain)
4444 {
4445 tree data_ref, first_vect, second_vect;
4446 tree perm_mask_even, perm_mask_odd;
4447 gimple perm_stmt;
4448 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4449 unsigned int i, j, log_length = exact_log2 (length);
4450 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4451 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4452
4453 result_chain->quick_grow (length);
4454 memcpy (result_chain->address (), dr_chain.address (),
4455 length * sizeof (tree));
4456
4457 for (i = 0; i < nelt; ++i)
4458 sel[i] = i * 2;
4459 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4460 gcc_assert (perm_mask_even != NULL);
4461
4462 for (i = 0; i < nelt; ++i)
4463 sel[i] = i * 2 + 1;
4464 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4465 gcc_assert (perm_mask_odd != NULL);
4466
4467 for (i = 0; i < log_length; i++)
4468 {
4469 for (j = 0; j < length; j += 2)
4470 {
4471 first_vect = dr_chain[j];
4472 second_vect = dr_chain[j+1];
4473
4474 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4475 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4476 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4477 first_vect, second_vect,
4478 perm_mask_even);
4479 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4480 (*result_chain)[j/2] = data_ref;
4481
4482 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4483 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4484 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4485 first_vect, second_vect,
4486 perm_mask_odd);
4487 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4488 (*result_chain)[j/2+length/2] = data_ref;
4489 }
4490 memcpy (dr_chain.address (), result_chain->address (),
4491 length * sizeof (tree));
4492 }
4493 }
4494
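/* A matching standalone simulation (plain C, invented helper names) of the
   extract-even/odd stages implemented above.  It follows the RESULT_CHAIN
   placement used by the code (even parts in the first half, odd parts in
   the second half) and reproduces the final 4x8 result from the comment:
   0 4 8 ... 28 / 1 5 ... 29 / 2 6 ... 30 / 3 7 ... 31.

     #include <stdio.h>
     #include <string.h>

     #define NELT 8
     #define LEN  4

     // One VEC_PERM_EXPR over the concatenation of A and B.
     static void
     vec_perm (const int *a, const int *b, const unsigned *sel, int *out)
     {
       for (unsigned i = 0; i < NELT; i++)
         out[i] = sel[i] < NELT ? a[sel[i]] : b[sel[i] - NELT];
     }

     int
     main (void)
     {
       int chain[LEN][NELT], result[LEN][NELT];
       unsigned sel_even[NELT], sel_odd[NELT];

       for (int v = 0; v < LEN; v++)           // interleaved input 0..31
         for (int e = 0; e < NELT; e++)
           chain[v][e] = v * NELT + e;

       for (unsigned i = 0; i < NELT; i++)
         {
           sel_even[i] = i * 2;                // 0 2 4 ... 14
           sel_odd[i] = i * 2 + 1;             // 1 3 5 ... 15
         }

       for (int stage = 0; stage < 2; stage++) // log2 (LEN) stages
         {
           for (int j = 0; j < LEN; j += 2)
             {
               vec_perm (chain[j], chain[j + 1], sel_even, result[j / 2]);
               vec_perm (chain[j], chain[j + 1], sel_odd,
                         result[j / 2 + LEN / 2]);
             }
           memcpy (chain, result, sizeof chain);
         }

       for (int v = 0; v < LEN; v++, printf ("\n"))
         for (int e = 0; e < NELT; e++)
           printf ("%2d ", chain[v][e]);
       return 0;
     }
*/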
4495
4496 /* Function vect_transform_grouped_load.
4497
4498 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4499 to perform their permutation and ascribe the result vectorized statements to
4500 the scalar statements.
4501 */
4502
4503 void
4504 vect_transform_grouped_load (gimple stmt, vec<tree> dr_chain, int size,
4505 gimple_stmt_iterator *gsi)
4506 {
4507 vec<tree> result_chain = vNULL;
4508
4509 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4510 RESULT_CHAIN is the output of vect_permute_load_chain, it contains permuted
4511 vectors, that are ready for vector computation. */
4512 result_chain.create (size);
4513 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4514 vect_record_grouped_load_vectors (stmt, result_chain);
4515 result_chain.release ();
4516 }
4517
4518 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4519 generated as part of the vectorization of STMT. Assign the statement
4520 for each vector to the associated scalar statement. */
4521
4522 void
4523 vect_record_grouped_load_vectors (gimple stmt, vec<tree> result_chain)
4524 {
4525 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4526 gimple next_stmt, new_stmt;
4527 unsigned int i, gap_count;
4528 tree tmp_data_ref;
4529
4530 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4531 Since we scan the chain starting from its first node, their order
4532 corresponds to the order of data-refs in RESULT_CHAIN. */
4533 next_stmt = first_stmt;
4534 gap_count = 1;
4535 FOR_EACH_VEC_ELT (result_chain, i, tmp_data_ref)
4536 {
4537 if (!next_stmt)
4538 break;
4539
4540 /* Skip the gaps. Loads created for the gaps will be removed by dead
4541 code elimination pass later. No need to check for the first stmt in
4542 the group, since it always exists.
4543 GROUP_GAP is the number of steps in elements from the previous
4544 access (if there is no gap GROUP_GAP is 1). We skip loads that
4545 correspond to the gaps. */
4546 if (next_stmt != first_stmt
4547 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
4548 {
4549 gap_count++;
4550 continue;
4551 }
4552
4553 while (next_stmt)
4554 {
4555 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4556 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4557 copies, and we put the new vector statement in the first available
4558 RELATED_STMT. */
4559 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4560 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4561 else
4562 {
4563 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4564 {
4565 gimple prev_stmt =
4566 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
4567 gimple rel_stmt =
4568 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
4569 while (rel_stmt)
4570 {
4571 prev_stmt = rel_stmt;
4572 rel_stmt =
4573 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
4574 }
4575
4576 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
4577 new_stmt;
4578 }
4579 }
4580
4581 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
4582 gap_count = 1;
4583 /* If NEXT_STMT accesses the same DR as the previous statement,
4584 put the same TMP_DATA_REF as its vectorized statement; otherwise
4585 get the next data-ref from RESULT_CHAIN. */
4586 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4587 break;
4588 }
4589 }
4590 }
4591
4592 /* Function vect_can_force_dr_alignment_p.
4593
4594 Returns whether the alignment of a DECL can be forced to be aligned
4595 on ALIGNMENT bit boundary. */
4596
4597 bool
4598 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
4599 {
4600 if (TREE_CODE (decl) != VAR_DECL)
4601 return false;
4602
4603 /* We cannot change alignment of common or external symbols as another
4604 translation unit may contain a definition with lower alignment.
4605 The rules of common symbol linking mean that the definition
4606 will override the common symbol. The same is true for constant
4607 pool entries which may be shared and are not properly merged
4608 by LTO. */
4609 if (DECL_EXTERNAL (decl)
4610 || DECL_COMMON (decl)
4611 || DECL_IN_CONSTANT_POOL (decl))
4612 return false;
4613
4614 if (TREE_ASM_WRITTEN (decl))
4615 return false;
4616
4617 /* Do not override the alignment as specified by the ABI when the used
4618 attribute is set. */
4619 if (DECL_PRESERVE_P (decl))
4620 return false;
4621
4622 /* Do not override explicit alignment set by the user when an explicit
4623 section name is also used. This is a common idiom used by many
4624 software projects. */
4625 if (DECL_SECTION_NAME (decl) != NULL_TREE
4626 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
4627 return false;
4628
4629 if (TREE_STATIC (decl))
4630 return (alignment <= MAX_OFILE_ALIGNMENT);
4631 else
4632 return (alignment <= MAX_STACK_ALIGNMENT);
4633 }
4634
4635
4636 /* Return whether the data reference DR is supported with respect to its
4637 alignment.
4638 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
4639 if it is aligned, i.e., check if it is possible to vectorize it with different
4640 alignment. */
4641
4642 enum dr_alignment_support
4643 vect_supportable_dr_alignment (struct data_reference *dr,
4644 bool check_aligned_accesses)
4645 {
4646 gimple stmt = DR_STMT (dr);
4647 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4648 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4649 enum machine_mode mode = TYPE_MODE (vectype);
4650 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4651 struct loop *vect_loop = NULL;
4652 bool nested_in_vect_loop = false;
4653
4654 if (aligned_access_p (dr) && !check_aligned_accesses)
4655 return dr_aligned;
4656
4657 if (loop_vinfo)
4658 {
4659 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
4660 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
4661 }
4662
4663 /* Possibly unaligned access. */
4664
4665 /* We can choose between using the implicit realignment scheme (generating
4666 a misaligned_move stmt) and the explicit realignment scheme (generating
4667 aligned loads with a REALIGN_LOAD). There are two variants to the
4668 explicit realignment scheme: optimized, and unoptimized.
4669 We can optimize the realignment only if the step between consecutive
4670 vector loads is equal to the vector size. Since the vector memory
4671 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
4672 is guaranteed that the misalignment amount remains the same throughout the
4673 execution of the vectorized loop. Therefore, we can create the
4674 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
4675 at the loop preheader.
4676
4677 However, in the case of outer-loop vectorization, when vectorizing a
4678 memory access in the inner-loop nested within the LOOP that is now being
4679 vectorized, while it is guaranteed that the misalignment of the
4680 vectorized memory access will remain the same in different outer-loop
4681 iterations, it is *not* guaranteed that it will remain the same throughout
4682 the execution of the inner-loop. This is because the inner-loop advances
4683 with the original scalar step (and not in steps of VS). If the inner-loop
4684 step happens to be a multiple of VS, then the misalignment remains fixed
4685 and we can use the optimized realignment scheme. For example:
4686
4687 for (i=0; i<N; i++)
4688 for (j=0; j<M; j++)
4689 s += a[i+j];
4690
4691 When vectorizing the i-loop in the above example, the step between
4692 consecutive vector loads is 1, and so the misalignment does not remain
4693 fixed across the execution of the inner-loop, and the realignment cannot
4694 be optimized (as illustrated in the following pseudo vectorized loop):
4695
4696 for (i=0; i<N; i+=4)
4697 for (j=0; j<M; j++){
4698 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
4699 // when j is {0,1,2,3,4,5,6,7,...} respectively.
4700 // (assuming that we start from an aligned address).
4701 }
4702
4703 We therefore have to use the unoptimized realignment scheme:
4704
4705 for (i=0; i<N; i+=4)
4706 for (j=k; j<M; j+=4)
4707 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
4708 // that the misalignment of the initial address is
4709 // 0).
4710
4711 The loop can then be vectorized as follows:
4712
4713 for (k=0; k<4; k++){
4714 rt = get_realignment_token (&vp[k]);
4715 for (i=0; i<N; i+=4){
4716 v1 = vp[i+k];
4717 for (j=k; j<M; j+=4){
4718 v2 = vp[i+j+VS-1];
4719 va = REALIGN_LOAD <v1,v2,rt>;
4720 vs += va;
4721 v1 = v2;
4722 }
4723 }
4724 } */
4725
4726 if (DR_IS_READ (dr))
4727 {
4728 bool is_packed = false;
4729 tree type = (TREE_TYPE (DR_REF (dr)));
4730
4731 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
4732 && (!targetm.vectorize.builtin_mask_for_load
4733 || targetm.vectorize.builtin_mask_for_load ()))
4734 {
4735 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4736 if ((nested_in_vect_loop
4737 && (TREE_INT_CST_LOW (DR_STEP (dr))
4738 != GET_MODE_SIZE (TYPE_MODE (vectype))))
4739 || !loop_vinfo)
4740 return dr_explicit_realign;
4741 else
4742 return dr_explicit_realign_optimized;
4743 }
4744 if (!known_alignment_for_access_p (dr))
4745 is_packed = not_size_aligned (DR_REF (dr));
4746
4747 if (targetm.vectorize.
4748 support_vector_misalignment (mode, type,
4749 DR_MISALIGNMENT (dr), is_packed))
4750 /* Can't software pipeline the loads, but can at least do them. */
4751 return dr_unaligned_supported;
4752 }
4753 else
4754 {
4755 bool is_packed = false;
4756 tree type = (TREE_TYPE (DR_REF (dr)));
4757
4758 if (!known_alignment_for_access_p (dr))
4759 is_packed = not_size_aligned (DR_REF (dr));
4760
4761 if (targetm.vectorize.
4762 support_vector_misalignment (mode, type,
4763 DR_MISALIGNMENT (dr), is_packed))
4764 return dr_unaligned_supported;
4765 }
4766
4767 /* Unsupported. */
4768 return dr_unaligned_unsupported;
4769 }
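
/* A small worked check (plain C, not vectorizer code) of the claim in the
   comment above that the misalignment of &vp[i+j] cycles with j during
   outer-loop vectorization: for 4-element vectors and an aligned starting
   address, the misalignment in elements is simply j modulo 4.

     #include <stdio.h>

     int
     main (void)
     {
       int vs = 4;                                   // elements per vector (VS)
       for (int j = 0; j < 8; j++)                   // inner-loop iterations
         printf ("j=%d misalign=%d\n", j, j % vs);   // 0 1 2 3 0 1 2 3
       return 0;
     }
*/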