[gcc.git] / gcc / tree-vect-data-refs.c
1 /* Data References Analysis and Manipulation Utilities for Vectorization.
2 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
3 Free Software Foundation, Inc.
4 Contributed by Dorit Naishlos <dorit@il.ibm.com>
5 and Ira Rosen <irar@il.ibm.com>
6
7 This file is part of GCC.
8
9 GCC is free software; you can redistribute it and/or modify it under
10 the terms of the GNU General Public License as published by the Free
11 Software Foundation; either version 3, or (at your option) any later
12 version.
13
14 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
15 WARRANTY; without even the implied warranty of MERCHANTABILITY or
16 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
17 for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with GCC; see the file COPYING3. If not see
21 <http://www.gnu.org/licenses/>. */
22
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "dumpfile.h"
27 #include "tm.h"
28 #include "ggc.h"
29 #include "tree.h"
30 #include "tm_p.h"
31 #include "target.h"
32 #include "basic-block.h"
33 #include "gimple-pretty-print.h"
34 #include "tree-flow.h"
35 #include "dumpfile.h"
36 #include "cfgloop.h"
37 #include "tree-chrec.h"
38 #include "tree-scalar-evolution.h"
39 #include "tree-vectorizer.h"
40 #include "diagnostic-core.h"
41
42 /* Need to include rtl.h, expr.h, etc. for optabs. */
43 #include "expr.h"
44 #include "optabs.h"
45
46 /* Return true if load- or store-lanes optab OPTAB is implemented for
47 COUNT vectors of type VECTYPE. NAME is the name of OPTAB. */
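/* As an informal illustration: for a 128-bit V4SImode VECTYPE and
   COUNT == 2, the function below asks for an integer mode of 2 * 128 bits
   and then checks whether the target provides a handler in OPTAB (e.g. the
   vec_load_lanes or vec_store_lanes optab) converting between that array
   mode and V4SImode.  */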
48
49 static bool
50 vect_lanes_optab_supported_p (const char *name, convert_optab optab,
51 tree vectype, unsigned HOST_WIDE_INT count)
52 {
53 enum machine_mode mode, array_mode;
54 bool limit_p;
55
56 mode = TYPE_MODE (vectype);
57 limit_p = !targetm.array_mode_supported_p (mode, count);
58 array_mode = mode_for_size (count * GET_MODE_BITSIZE (mode),
59 MODE_INT, limit_p);
60
61 if (array_mode == BLKmode)
62 {
63 if (dump_enabled_p ())
64 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
65 "no array mode for %s[" HOST_WIDE_INT_PRINT_DEC "]",
66 GET_MODE_NAME (mode), count);
67 return false;
68 }
69
70 if (convert_optab_handler (optab, array_mode, mode) == CODE_FOR_nothing)
71 {
72 if (dump_enabled_p ())
73 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
74 "cannot use %s<%s><%s>", name,
75 GET_MODE_NAME (array_mode), GET_MODE_NAME (mode));
76 return false;
77 }
78
79 if (dump_enabled_p ())
80 dump_printf_loc (MSG_NOTE, vect_location,
81 "can use %s<%s><%s>", name, GET_MODE_NAME (array_mode),
82 GET_MODE_NAME (mode));
83
84 return true;
85 }
86
87
88 /* Return the smallest scalar part of STMT.
89 This is used to determine the vectype of the stmt. We generally set the
90 vectype according to the type of the result (lhs). For stmts whose
91 result-type is different than the type of the arguments (e.g., demotion,
92 promotion), vectype will be reset appropriately (later). Note that we have
93 to visit the smallest datatype in this function, because that determines the
94 VF. If the smallest datatype in the loop is present only as the rhs of a
95 promotion operation - we'd miss it.
96 Such a case, where a variable of this datatype does not appear in the lhs
97 anywhere in the loop, can only occur if it's an invariant: e.g.:
98 'int_x = (int) short_inv', which we'd expect to have been optimized away by
99 invariant motion. However, we cannot rely on invariant motion to always
100 take invariants out of the loop, and so in the case of promotion we also
101 have to check the rhs.
102 LHS_SIZE_UNIT and RHS_SIZE_UNIT contain the sizes of the corresponding
103 types. */
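/* As an informal illustration: with 128-bit vectors, a loop whose smallest
   scalar type is 'short' gets a vectorization factor of 8, whereas looking
   only at an 'int' result type would suggest a factor of 4; hence the need
   to also inspect the rhs of promotions.  */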
104
105 tree
106 vect_get_smallest_scalar_type (gimple stmt, HOST_WIDE_INT *lhs_size_unit,
107 HOST_WIDE_INT *rhs_size_unit)
108 {
109 tree scalar_type = gimple_expr_type (stmt);
110 HOST_WIDE_INT lhs, rhs;
111
112 lhs = rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
113
114 if (is_gimple_assign (stmt)
115 && (gimple_assign_cast_p (stmt)
116 || gimple_assign_rhs_code (stmt) == WIDEN_MULT_EXPR
117 || gimple_assign_rhs_code (stmt) == WIDEN_LSHIFT_EXPR
118 || gimple_assign_rhs_code (stmt) == FLOAT_EXPR))
119 {
120 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
121
122 rhs = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (rhs_type));
123 if (rhs < lhs)
124 scalar_type = rhs_type;
125 }
126
127 *lhs_size_unit = lhs;
128 *rhs_size_unit = rhs;
129 return scalar_type;
130 }
131
132
133 /* Find the place of the data-ref in STMT in the interleaving chain that starts
134 from FIRST_STMT. Return -1 if the data-ref is not a part of the chain. */
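/* For example, if the chain starting at FIRST_STMT is S1 -> S2 -> S3
   (linked through GROUP_NEXT_ELEMENT), the function returns 0 for S1,
   1 for S2, 2 for S3, and -1 for a stmt that belongs to a different
   chain.  */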
135
136 int
137 vect_get_place_in_interleaving_chain (gimple stmt, gimple first_stmt)
138 {
139 gimple next_stmt = first_stmt;
140 int result = 0;
141
142 if (first_stmt != GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
143 return -1;
144
145 while (next_stmt && next_stmt != stmt)
146 {
147 result++;
148 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
149 }
150
151 if (next_stmt)
152 return result;
153 else
154 return -1;
155 }
156
157
158 /* Function vect_insert_into_interleaving_chain.
159
160 Insert DRA into the interleaving chain of DRB according to DRA's INIT. */
161
162 static void
163 vect_insert_into_interleaving_chain (struct data_reference *dra,
164 struct data_reference *drb)
165 {
166 gimple prev, next;
167 tree next_init;
168 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
169 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
170
171 prev = GROUP_FIRST_ELEMENT (stmtinfo_b);
172 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev));
173 while (next)
174 {
175 next_init = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (next)));
176 if (tree_int_cst_compare (next_init, DR_INIT (dra)) > 0)
177 {
178 /* Insert here. */
179 GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev)) = DR_STMT (dra);
180 GROUP_NEXT_ELEMENT (stmtinfo_a) = next;
181 return;
182 }
183 prev = next;
184 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev));
185 }
186
187 /* We got to the end of the list. Insert here. */
188 GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev)) = DR_STMT (dra);
189 GROUP_NEXT_ELEMENT (stmtinfo_a) = NULL;
190 }
191
192
193 /* Function vect_update_interleaving_chain.
194
195 For two data-refs DRA and DRB that are part of a chain of interleaved data
196 accesses, update the interleaving chain. DRB's INIT is smaller than DRA's.
197
198 There are four possible cases:
199 1. New stmts - both DRA and DRB are not a part of any chain:
200 FIRST_DR = DRB
201 NEXT_DR (DRB) = DRA
202 2. DRB is a part of a chain and DRA is not:
203 no need to update FIRST_DR
204 no need to insert DRB
205 insert DRA according to init
206 3. DRA is a part of a chain and DRB is not:
207 if (init of FIRST_DR > init of DRB)
208 FIRST_DR = DRB
209 NEXT(FIRST_DR) = previous FIRST_DR
210 else
211 insert DRB according to its init
212 4. both DRA and DRB are in some interleaving chains:
213 choose the chain with the smallest init of FIRST_DR
214 insert the nodes of the second chain into the first one. */
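/* As an informal example of such a chain: the accesses a[3*i], a[3*i+1]
   and a[3*i+2] share one base and step and differ only in their constant
   INIT, so they end up in a single chain, headed by the access with the
   smallest INIT and kept sorted by INIT by the insertions below.  */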
215
216 static void
217 vect_update_interleaving_chain (struct data_reference *drb,
218 struct data_reference *dra)
219 {
220 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
221 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
222 tree next_init, init_dra_chain, init_drb_chain;
223 gimple first_a, first_b;
224 tree node_init;
225 gimple node, prev, next, first_stmt;
226
227 /* 1. New stmts - both DRA and DRB are not a part of any chain. */
228 if (!GROUP_FIRST_ELEMENT (stmtinfo_a) && !GROUP_FIRST_ELEMENT (stmtinfo_b))
229 {
230 GROUP_FIRST_ELEMENT (stmtinfo_a) = DR_STMT (drb);
231 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (drb);
232 GROUP_NEXT_ELEMENT (stmtinfo_b) = DR_STMT (dra);
233 return;
234 }
235
236 /* 2. DRB is a part of a chain and DRA is not. */
237 if (!GROUP_FIRST_ELEMENT (stmtinfo_a) && GROUP_FIRST_ELEMENT (stmtinfo_b))
238 {
239 GROUP_FIRST_ELEMENT (stmtinfo_a) = GROUP_FIRST_ELEMENT (stmtinfo_b);
240 /* Insert DRA into the chain of DRB. */
241 vect_insert_into_interleaving_chain (dra, drb);
242 return;
243 }
244
245 /* 3. DRA is a part of a chain and DRB is not. */
246 if (GROUP_FIRST_ELEMENT (stmtinfo_a) && !GROUP_FIRST_ELEMENT (stmtinfo_b))
247 {
248 gimple old_first_stmt = GROUP_FIRST_ELEMENT (stmtinfo_a);
249 tree init_old = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (
250 old_first_stmt)));
251 gimple tmp;
252
253 if (tree_int_cst_compare (init_old, DR_INIT (drb)) > 0)
254 {
255 /* DRB's init is smaller than the init of the stmt previously marked
256 as the first stmt of the interleaving chain of DRA. Therefore, we
257 update FIRST_STMT and put DRB in the head of the list. */
258 GROUP_FIRST_ELEMENT (stmtinfo_b) = DR_STMT (drb);
259 GROUP_NEXT_ELEMENT (stmtinfo_b) = old_first_stmt;
260
261 /* Update all the stmts in the list to point to the new FIRST_STMT. */
262 tmp = old_first_stmt;
263 while (tmp)
264 {
265 GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) = DR_STMT (drb);
266 tmp = GROUP_NEXT_ELEMENT (vinfo_for_stmt (tmp));
267 }
268 }
269 else
270 {
271 /* Insert DRB in the list of DRA. */
272 vect_insert_into_interleaving_chain (drb, dra);
273 GROUP_FIRST_ELEMENT (stmtinfo_b) = GROUP_FIRST_ELEMENT (stmtinfo_a);
274 }
275 return;
276 }
277
278 /* 4. both DRA and DRB are in some interleaving chains. */
279 first_a = GROUP_FIRST_ELEMENT (stmtinfo_a);
280 first_b = GROUP_FIRST_ELEMENT (stmtinfo_b);
281 if (first_a == first_b)
282 return;
283 init_dra_chain = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_a)));
284 init_drb_chain = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_b)));
285
286 if (tree_int_cst_compare (init_dra_chain, init_drb_chain) > 0)
287 {
288 /* Insert the nodes of the DRA chain into the DRB chain.
289 After inserting a node, continue from this node of the DRB chain (don't
290 start from the beginning). */
291 node = GROUP_FIRST_ELEMENT (stmtinfo_a);
292 prev = GROUP_FIRST_ELEMENT (stmtinfo_b);
293 first_stmt = first_b;
294 }
295 else
296 {
297 /* Insert the nodes of the DRB chain into the DRA chain.
298 After inserting a node, continue from this node of the DRA chain (don't
299 start from the beginning). */
300 node = GROUP_FIRST_ELEMENT (stmtinfo_b);
301 prev = GROUP_FIRST_ELEMENT (stmtinfo_a);
302 first_stmt = first_a;
303 }
304
305 while (node)
306 {
307 node_init = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (node)));
308 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev));
309 while (next)
310 {
311 next_init = DR_INIT (STMT_VINFO_DATA_REF (vinfo_for_stmt (next)));
312 if (tree_int_cst_compare (next_init, node_init) > 0)
313 {
314 /* Insert here. */
315 GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev)) = node;
316 GROUP_NEXT_ELEMENT (vinfo_for_stmt (node)) = next;
317 prev = node;
318 break;
319 }
320 prev = next;
321 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev));
322 }
323 if (!next)
324 {
325 /* We got to the end of the list. Insert here. */
326 GROUP_NEXT_ELEMENT (vinfo_for_stmt (prev)) = node;
327 GROUP_NEXT_ELEMENT (vinfo_for_stmt (node)) = NULL;
328 prev = node;
329 }
330 GROUP_FIRST_ELEMENT (vinfo_for_stmt (node)) = first_stmt;
331 node = GROUP_NEXT_ELEMENT (vinfo_for_stmt (node));
332 }
333 }
334
335 /* Check dependence between DRA and DRB for basic block vectorization.
336 If the accesses share the same bases and offsets, we can compare their initial
337 constant offsets to decide whether they differ or not. In case of a read-
338 write dependence we check that the load is before the store to ensure that
339 vectorization will not change the order of the accesses. */
340
341 static bool
342 vect_drs_dependent_in_basic_block (struct data_reference *dra,
343 struct data_reference *drb)
344 {
345 HOST_WIDE_INT type_size_a, type_size_b, init_a, init_b;
346 gimple earlier_stmt;
347
348 /* We only call this function for pairs of loads and stores, but we verify
349 it here. */
350 if (DR_IS_READ (dra) == DR_IS_READ (drb))
351 {
352 if (DR_IS_READ (dra))
353 return false;
354 else
355 return true;
356 }
357
358 /* Check that the data-refs have the same bases and offsets. If not, we can't
359 determine if they are dependent. */
360 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
361 || !dr_equal_offsets_p (dra, drb))
362 return true;
363
364 /* Check the types. */
365 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
366 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
367
368 if (type_size_a != type_size_b
369 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
370 TREE_TYPE (DR_REF (drb))))
371 return true;
372
373 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
374 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
375
376 /* Two different locations - no dependence. */
377 if (init_a != init_b)
378 return false;
379
380 /* We have a read-write dependence. Check that the load is before the store.
381 When we vectorize basic blocks, vector load can be only before
382 corresponding scalar load, and vector store can be only after its
383 corresponding scalar store. So the order of the accesses is preserved in
384 case the load is before the store. */
385 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
386 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
387 return false;
388
389 return true;
390 }
391
392
393 /* Function vect_check_interleaving.
394
395 Check if DRA and DRB are a part of interleaving. In case they are, insert
396 DRA and DRB in an interleaving chain. */
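/* As an informal example: for the accesses a[2*i] and a[2*i+1] the bases
   and steps match, the step is 2 * sizeof (*a), and the inits differ by
   sizeof (*a); that difference is a multiple of the element size and does
   not exceed the step, so the checks below record the two refs as one
   interleaving group.  */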
397
398 static bool
399 vect_check_interleaving (struct data_reference *dra,
400 struct data_reference *drb)
401 {
402 HOST_WIDE_INT type_size_a, type_size_b, diff_mod_size, step, init_a, init_b;
403
404 /* Check that the data-refs have the same first location (except init) and
405 that they are both either stores or loads (not a load and a store). */
406 if (!operand_equal_p (DR_BASE_ADDRESS (dra), DR_BASE_ADDRESS (drb), 0)
407 || !dr_equal_offsets_p (dra, drb)
408 || !tree_int_cst_compare (DR_INIT (dra), DR_INIT (drb))
409 || DR_IS_READ (dra) != DR_IS_READ (drb))
410 return false;
411
412 /* Check:
413 1. data-refs are of the same type
414 2. their steps are equal
415 3. the step (if greater than zero) is greater than the difference between
416 data-refs' inits. */
417 type_size_a = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dra))));
418 type_size_b = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (drb))));
419
420 if (type_size_a != type_size_b
421 || tree_int_cst_compare (DR_STEP (dra), DR_STEP (drb))
422 || !types_compatible_p (TREE_TYPE (DR_REF (dra)),
423 TREE_TYPE (DR_REF (drb))))
424 return false;
425
426 init_a = TREE_INT_CST_LOW (DR_INIT (dra));
427 init_b = TREE_INT_CST_LOW (DR_INIT (drb));
428 step = TREE_INT_CST_LOW (DR_STEP (dra));
429
430 if (init_a > init_b)
431 {
432 /* If init_a == init_b + the size of the type * k, we have an interleaving,
433 and DRB is accessed before DRA. */
434 diff_mod_size = (init_a - init_b) % type_size_a;
435
436 if (step && (init_a - init_b) > step)
437 return false;
438
439 if (diff_mod_size == 0)
440 {
441 vect_update_interleaving_chain (drb, dra);
442 if (dump_enabled_p ())
443 {
444 dump_printf_loc (MSG_NOTE, vect_location,
445 "Detected interleaving ");
446 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
447 dump_printf (MSG_NOTE, " and ");
448 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
449 }
450 return true;
451 }
452 }
453 else
454 {
455 /* If init_b == init_a + the size of the type * k, we have an
456 interleaving, and DRA is accessed before DRB. */
457 diff_mod_size = (init_b - init_a) % type_size_a;
458
459 if (step && (init_b - init_a) > step)
460 return false;
461
462 if (diff_mod_size == 0)
463 {
464 vect_update_interleaving_chain (dra, drb);
465 if (dump_enabled_p ())
466 {
467 dump_printf_loc (MSG_NOTE, vect_location,
468 "Detected interleaving ");
469 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
470 dump_printf (MSG_NOTE, " and ");
471 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
472 }
473 return true;
474 }
475 }
476
477 return false;
478 }
479
480 /* Check if the data references pointed to by DR_I and DR_J are the same or
481 belong to the same interleaving group. Return FALSE if the drs are
482 different, otherwise return TRUE. */
483
484 static bool
485 vect_same_range_drs (data_reference_p dr_i, data_reference_p dr_j)
486 {
487 gimple stmt_i = DR_STMT (dr_i);
488 gimple stmt_j = DR_STMT (dr_j);
489
490 if (operand_equal_p (DR_REF (dr_i), DR_REF (dr_j), 0)
491 || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
492 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j))
493 && (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_i))
494 == GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt_j)))))
495 return true;
496 else
497 return false;
498 }
499
500 /* If address ranges represented by DDR_I and DDR_J are equal,
501 return TRUE, otherwise return FALSE. */
502
503 static bool
504 vect_vfa_range_equal (ddr_p ddr_i, ddr_p ddr_j)
505 {
506 if ((vect_same_range_drs (DDR_A (ddr_i), DDR_A (ddr_j))
507 && vect_same_range_drs (DDR_B (ddr_i), DDR_B (ddr_j)))
508 || (vect_same_range_drs (DDR_A (ddr_i), DDR_B (ddr_j))
509 && vect_same_range_drs (DDR_B (ddr_i), DDR_A (ddr_j))))
510 return true;
511 else
512 return false;
513 }
514
515 /* Insert DDR into LOOP_VINFO's list of ddrs that may alias and need to be
516 tested at run-time. Return TRUE if DDR was successfully inserted.
517 Return FALSE if versioning is not supported. */
518
519 static bool
520 vect_mark_for_runtime_alias_test (ddr_p ddr, loop_vec_info loop_vinfo)
521 {
522 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
523
524 if ((unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS) == 0)
525 return false;
526
527 if (dump_enabled_p ())
528 {
529 dump_printf_loc (MSG_NOTE, vect_location,
530 "mark for run-time aliasing test between ");
531 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr)));
532 dump_printf (MSG_NOTE, " and ");
533 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr)));
534 }
535
536 if (optimize_loop_nest_for_size_p (loop))
537 {
538 if (dump_enabled_p ())
539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
540 "versioning not supported when optimizing for size.");
541 return false;
542 }
543
544 /* FORNOW: We don't support versioning with outer-loop vectorization. */
545 if (loop->inner)
546 {
547 if (dump_enabled_p ())
548 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
549 "versioning not yet supported for outer-loops.");
550 return false;
551 }
552
553 /* FORNOW: We don't support creating runtime alias tests for non-constant
554 step. */
555 if (TREE_CODE (DR_STEP (DDR_A (ddr))) != INTEGER_CST
556 || TREE_CODE (DR_STEP (DDR_B (ddr))) != INTEGER_CST)
557 {
558 if (dump_enabled_p ())
559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
560 "versioning not yet supported for non-constant "
561 "step");
562 return false;
563 }
564
565 VEC_safe_push (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo), ddr);
566 return true;
567 }
568
569
570 /* Function vect_analyze_data_ref_dependence.
571
572 Return TRUE if there (might) exist a dependence between a memory-reference
573 DRA and a memory-reference DRB. If versioning for alias can check the
574 dependence at run-time, return FALSE. Adjust *MAX_VF according to
575 the data dependence. */
576
577 static bool
578 vect_analyze_data_ref_dependence (struct data_dependence_relation *ddr,
579 loop_vec_info loop_vinfo, int *max_vf)
580 {
581 unsigned int i;
582 struct loop *loop = NULL;
583 struct data_reference *dra = DDR_A (ddr);
584 struct data_reference *drb = DDR_B (ddr);
585 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
586 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
587 lambda_vector dist_v;
588 unsigned int loop_depth;
589
590 /* Don't bother to analyze statements marked as unvectorizable. */
591 if (!STMT_VINFO_VECTORIZABLE (stmtinfo_a)
592 || !STMT_VINFO_VECTORIZABLE (stmtinfo_b))
593 return false;
594
595 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
596 {
597 /* Independent data accesses. */
598 vect_check_interleaving (dra, drb);
599 return false;
600 }
601
602 if (loop_vinfo)
603 loop = LOOP_VINFO_LOOP (loop_vinfo);
604
605 if ((DR_IS_READ (dra) && DR_IS_READ (drb) && loop_vinfo) || dra == drb)
606 return false;
607
608 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
609 {
610 gimple earlier_stmt;
611
612 if (loop_vinfo)
613 {
614 if (dump_enabled_p ())
615 {
616 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
617 "versioning for alias required: "
618 "can't determine dependence between ");
619 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
620 DR_REF (dra));
621 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
622 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
623 DR_REF (drb));
624 }
625
626 /* Add to list of ddrs that need to be tested at run-time. */
627 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
628 }
629
630 /* When vectorizing a basic block, an unknown dependence can still mean
631 a grouped access. */
632 if (vect_check_interleaving (dra, drb))
633 return false;
634
635 /* Read-read is OK (we need this check here, after checking for
636 interleaving). */
637 if (DR_IS_READ (dra) && DR_IS_READ (drb))
638 return false;
639
640 if (dump_enabled_p ())
641 {
642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
643 "can't determine dependence between ");
644 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
645 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
646 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
647 }
648
649 /* We do not vectorize basic blocks with write-write dependencies. */
650 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
651 return true;
652
653 /* Check that it's not a load-after-store dependence. */
654 earlier_stmt = get_earlier_stmt (DR_STMT (dra), DR_STMT (drb));
655 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (vinfo_for_stmt (earlier_stmt))))
656 return true;
657
658 return false;
659 }
660
661 /* Versioning for alias is not yet supported for basic block SLP, and
662 dependence distance is not applicable; hence, in the case of a known data
663 dependence, basic block vectorization is impossible for now. */
664 if (!loop_vinfo)
665 {
666 if (dra != drb && vect_check_interleaving (dra, drb))
667 return false;
668
669 if (dump_enabled_p ())
670 {
671 dump_printf_loc (MSG_NOTE, vect_location,
672 "determined dependence between ");
673 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
674 dump_printf (MSG_NOTE, " and ");
675 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
676 }
677
678 /* Do not vectorize basic blocks with write-write dependences. */
679 if (DR_IS_WRITE (dra) && DR_IS_WRITE (drb))
680 return true;
681
682 /* Check if this dependence is allowed in basic block vectorization. */
683 return vect_drs_dependent_in_basic_block (dra, drb);
684 }
685
686 /* Loop-based vectorization and known data dependence. */
687 if (DDR_NUM_DIST_VECTS (ddr) == 0)
688 {
689 if (dump_enabled_p ())
690 {
691 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
692 "versioning for alias required: "
693 "bad dist vector for ");
694 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dra));
695 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
696 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (drb));
697 }
698 /* Add to list of ddrs that need to be tested at run-time. */
699 return !vect_mark_for_runtime_alias_test (ddr, loop_vinfo);
700 }
701
702 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
703 FOR_EACH_VEC_ELT (lambda_vector, DDR_DIST_VECTS (ddr), i, dist_v)
704 {
705 int dist = dist_v[loop_depth];
706
707 if (dump_enabled_p ())
708 dump_printf_loc (MSG_NOTE, vect_location,
709 "dependence distance = %d.", dist);
710
711 if (dist == 0)
712 {
713 if (dump_enabled_p ())
714 {
715 dump_printf_loc (MSG_NOTE, vect_location,
716 "dependence distance == 0 between ");
717 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
718 dump_printf (MSG_NOTE, " and ");
719 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
720 }
721
722 /* For interleaving, mark that there is a read-write dependency if
723 necessary. We have already checked that one of the data-refs is a store. */
724 if (DR_IS_READ (dra))
725 GROUP_READ_WRITE_DEPENDENCE (stmtinfo_a) = true;
726 else
727 {
728 if (DR_IS_READ (drb))
729 GROUP_READ_WRITE_DEPENDENCE (stmtinfo_b) = true;
730 }
731
732 continue;
733 }
734
735 if (dist > 0 && DDR_REVERSED_P (ddr))
736 {
737 /* If DDR_REVERSED_P the order of the data-refs in DDR was
738 reversed (to make distance vector positive), and the actual
739 distance is negative. */
740 if (dump_enabled_p ())
741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
742 "dependence distance negative.");
743 continue;
744 }
745
746 if (abs (dist) >= 2
747 && abs (dist) < *max_vf)
748 {
749 /* The dependence distance requires reduction of the maximal
750 vectorization factor. */
751 *max_vf = abs (dist);
752 if (dump_enabled_p ())
753 dump_printf_loc (MSG_NOTE, vect_location,
754 "adjusting maximal vectorization factor to %i",
755 *max_vf);
756 }
757
758 if (abs (dist) >= *max_vf)
759 {
760 /* Dependence distance does not create dependence, as far as
761 vectorization is concerned, in this case. */
762 if (dump_enabled_p ())
763 dump_printf_loc (MSG_NOTE, vect_location,
764 "dependence distance >= VF.");
765 continue;
766 }
767
768 if (dump_enabled_p ())
769 {
770 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
771 "not vectorized, possible dependence "
772 "between data-refs ");
773 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
774 dump_printf (MSG_NOTE, " and ");
775 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
776 }
777
778 return true;
779 }
780
781 return false;
782 }
783
784 /* Function vect_analyze_data_ref_dependences.
785
786 Examine all the data references in the loop, and make sure there do not
787 exist any data dependences between them. Set *MAX_VF according to
788 the maximum vectorization factor the data dependences allow. */
789
790 bool
791 vect_analyze_data_ref_dependences (loop_vec_info loop_vinfo,
792 bb_vec_info bb_vinfo, int *max_vf)
793 {
794 unsigned int i;
795 VEC (ddr_p, heap) *ddrs = NULL;
796 struct data_dependence_relation *ddr;
797
798 if (dump_enabled_p ())
799 dump_printf_loc (MSG_NOTE, vect_location,
800 "=== vect_analyze_dependences ===");
801 if (loop_vinfo)
802 ddrs = LOOP_VINFO_DDRS (loop_vinfo);
803 else
804 ddrs = BB_VINFO_DDRS (bb_vinfo);
805
806 FOR_EACH_VEC_ELT (ddr_p, ddrs, i, ddr)
807 if (vect_analyze_data_ref_dependence (ddr, loop_vinfo, max_vf))
808 return false;
809
810 return true;
811 }
812
813
814 /* Function vect_compute_data_ref_alignment
815
816 Compute the misalignment of the data reference DR.
817
818 Output:
819 1. If during the misalignment computation it is found that the data reference
820 cannot be vectorized then false is returned.
821 2. DR_MISALIGNMENT (DR) is defined.
822
823 FOR NOW: No analysis is actually performed. Misalignment is calculated
824 only for trivial cases. TODO. */
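/* As an informal example of the trivial case handled here: for a ref such
   as a[i + 1] with 4-byte elements, a provably aligned base and a 16-byte
   vector type, DR_INIT is 4 and DR_ALIGNED_TO is at least 16, so the
   misalignment below comes out as 4 % 16 == 4 bytes.  */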
825
826 static bool
827 vect_compute_data_ref_alignment (struct data_reference *dr)
828 {
829 gimple stmt = DR_STMT (dr);
830 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
831 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
832 struct loop *loop = NULL;
833 tree ref = DR_REF (dr);
834 tree vectype;
835 tree base, base_addr;
836 bool base_aligned;
837 tree misalign;
838 tree aligned_to, alignment;
839
840 if (dump_enabled_p ())
841 dump_printf_loc (MSG_NOTE, vect_location,
842 "vect_compute_data_ref_alignment:");
843
844 if (loop_vinfo)
845 loop = LOOP_VINFO_LOOP (loop_vinfo);
846
847 /* Initialize misalignment to unknown. */
848 SET_DR_MISALIGNMENT (dr, -1);
849
850 /* Strided loads perform only component accesses, misalignment information
851 is irrelevant for them. */
852 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
853 return true;
854
855 misalign = DR_INIT (dr);
856 aligned_to = DR_ALIGNED_TO (dr);
857 base_addr = DR_BASE_ADDRESS (dr);
858 vectype = STMT_VINFO_VECTYPE (stmt_info);
859
860 /* In case the dataref is in an inner-loop of the loop that is being
861 vectorized (LOOP), we use the base and misalignment information
862 relative to the outer-loop (LOOP). This is ok only if the misalignment
863 stays the same throughout the execution of the inner-loop, which is why
864 we have to check that the stride of the dataref in the inner-loop is a
865 multiple of the vector size. */
866 if (loop && nested_in_vect_loop_p (loop, stmt))
867 {
868 tree step = DR_STEP (dr);
869 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
870
871 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) == 0)
872 {
873 if (dump_enabled_p ())
874 dump_printf_loc (MSG_NOTE, vect_location,
875 "inner step divides the vector-size.");
876 misalign = STMT_VINFO_DR_INIT (stmt_info);
877 aligned_to = STMT_VINFO_DR_ALIGNED_TO (stmt_info);
878 base_addr = STMT_VINFO_DR_BASE_ADDRESS (stmt_info);
879 }
880 else
881 {
882 if (dump_enabled_p ())
883 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
884 "inner step doesn't divide the vector-size.");
885 misalign = NULL_TREE;
886 }
887 }
888
889 /* Similarly, if we're doing basic-block vectorization, we can only use
890 base and misalignment information relative to an innermost loop if the
891 misalignment stays the same throughout the execution of the loop.
892 As above, this is the case if the stride of the dataref is a multiple
893 of the vector size. */
894 if (!loop)
895 {
896 tree step = DR_STEP (dr);
897 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
898
899 if (dr_step % GET_MODE_SIZE (TYPE_MODE (vectype)) != 0)
900 {
901 if (dump_enabled_p ())
902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
903 "SLP: step doesn't divide the vector-size.");
904 misalign = NULL_TREE;
905 }
906 }
907
908 base = build_fold_indirect_ref (base_addr);
909 alignment = ssize_int (TYPE_ALIGN (vectype)/BITS_PER_UNIT);
910
911 if ((aligned_to && tree_int_cst_compare (aligned_to, alignment) < 0)
912 || !misalign)
913 {
914 if (dump_enabled_p ())
915 {
916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
917 "Unknown alignment for access: ");
918 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, base);
919 }
920 return true;
921 }
922
923 if ((DECL_P (base)
924 && tree_int_cst_compare (ssize_int (DECL_ALIGN_UNIT (base)),
925 alignment) >= 0)
926 || (TREE_CODE (base_addr) == SSA_NAME
927 && tree_int_cst_compare (ssize_int (TYPE_ALIGN_UNIT (TREE_TYPE (
928 TREE_TYPE (base_addr)))),
929 alignment) >= 0)
930 || (get_pointer_alignment (base_addr) >= TYPE_ALIGN (vectype)))
931 base_aligned = true;
932 else
933 base_aligned = false;
934
935 if (!base_aligned)
936 {
937 /* Do not change the alignment of global variables here if
938 flag_section_anchors is enabled as we already generated
939 RTL for other functions. Most global variables should
940 have been aligned during the IPA increase_alignment pass. */
941 if (!vect_can_force_dr_alignment_p (base, TYPE_ALIGN (vectype))
942 || (TREE_STATIC (base) && flag_section_anchors))
943 {
944 if (dump_enabled_p ())
945 {
946 dump_printf_loc (MSG_NOTE, vect_location,
947 "can't force alignment of ref: ");
948 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
949 }
950 return true;
951 }
952
953 /* Force the alignment of the decl.
954 NOTE: This is the only change to the code we make during
955 the analysis phase, before deciding to vectorize the loop. */
956 if (dump_enabled_p ())
957 {
958 dump_printf_loc (MSG_NOTE, vect_location, "force alignment of ");
959 dump_generic_expr (MSG_NOTE, TDF_SLIM, ref);
960 }
961
962 DECL_ALIGN (base) = TYPE_ALIGN (vectype);
963 DECL_USER_ALIGN (base) = 1;
964 }
965
966 /* At this point we assume that the base is aligned. */
967 gcc_assert (base_aligned
968 || (TREE_CODE (base) == VAR_DECL
969 && DECL_ALIGN (base) >= TYPE_ALIGN (vectype)));
970
971 /* If this is a backward running DR then first access in the larger
972 vectype actually is N-1 elements before the address in the DR.
973 Adjust misalign accordingly. */
974 if (tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0)
975 {
976 tree offset = ssize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1);
977 /* DR_STEP(dr) is the same as -TYPE_SIZE of the scalar type,
978 otherwise we wouldn't be here. */
979 offset = fold_build2 (MULT_EXPR, ssizetype, offset, DR_STEP (dr));
980 /* PLUS because DR_STEP was negative. */
981 misalign = size_binop (PLUS_EXPR, misalign, offset);
982 }
983
984 /* Modulo alignment. */
985 misalign = size_binop (FLOOR_MOD_EXPR, misalign, alignment);
986
987 if (!host_integerp (misalign, 1))
988 {
989 /* Negative or overflowed misalignment value. */
990 if (dump_enabled_p ())
991 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
992 "unexpected misalign value");
993 return false;
994 }
995
996 SET_DR_MISALIGNMENT (dr, TREE_INT_CST_LOW (misalign));
997
998 if (dump_enabled_p ())
999 {
1000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1001 "misalign = %d bytes of ref ", DR_MISALIGNMENT (dr));
1002 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, ref);
1003 }
1004
1005 return true;
1006 }
1007
1008
1009 /* Function vect_compute_data_refs_alignment
1010
1011 Compute the misalignment of data references in the loop.
1012 Return FALSE if a data reference is found that cannot be vectorized. */
1013
1014 static bool
1015 vect_compute_data_refs_alignment (loop_vec_info loop_vinfo,
1016 bb_vec_info bb_vinfo)
1017 {
1018 VEC (data_reference_p, heap) *datarefs;
1019 struct data_reference *dr;
1020 unsigned int i;
1021
1022 if (loop_vinfo)
1023 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1024 else
1025 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
1026
1027 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
1028 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
1029 && !vect_compute_data_ref_alignment (dr))
1030 {
1031 if (bb_vinfo)
1032 {
1033 /* Mark unsupported statement as unvectorizable. */
1034 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
1035 continue;
1036 }
1037 else
1038 return false;
1039 }
1040
1041 return true;
1042 }
1043
1044
1045 /* Function vect_update_misalignment_for_peel
1046
1047 DR - the data reference whose misalignment is to be adjusted.
1048 DR_PEEL - the data reference whose misalignment is being made
1049 zero in the vector loop by the peel.
1050 NPEEL - the number of iterations in the peel loop if the misalignment
1051 of DR_PEEL is known at compile time. */
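/* As a worked example of the known-alignment case below: if DR has 4-byte
   elements, a misalignment of 4 and a 16-byte vector type, and NPEEL is 2,
   the peel advances DR by 2 * 4 bytes, giving a new misalignment of
   (4 + 8) & 15 == 12.  */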
1052
1053 static void
1054 vect_update_misalignment_for_peel (struct data_reference *dr,
1055 struct data_reference *dr_peel, int npeel)
1056 {
1057 unsigned int i;
1058 VEC(dr_p,heap) *same_align_drs;
1059 struct data_reference *current_dr;
1060 int dr_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
1061 int dr_peel_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr_peel))));
1062 stmt_vec_info stmt_info = vinfo_for_stmt (DR_STMT (dr));
1063 stmt_vec_info peel_stmt_info = vinfo_for_stmt (DR_STMT (dr_peel));
1064
1065 /* For interleaved data accesses the step in the loop must be multiplied by
1066 the size of the interleaving group. */
1067 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1068 dr_size *= GROUP_SIZE (vinfo_for_stmt (GROUP_FIRST_ELEMENT (stmt_info)));
1069 if (STMT_VINFO_GROUPED_ACCESS (peel_stmt_info))
1070 dr_peel_size *= GROUP_SIZE (peel_stmt_info);
1071
1072 /* It can be assumed that the data refs with the same alignment as dr_peel
1073 are aligned in the vector loop. */
1074 same_align_drs
1075 = STMT_VINFO_SAME_ALIGN_REFS (vinfo_for_stmt (DR_STMT (dr_peel)));
1076 FOR_EACH_VEC_ELT (dr_p, same_align_drs, i, current_dr)
1077 {
1078 if (current_dr != dr)
1079 continue;
1080 gcc_assert (DR_MISALIGNMENT (dr) / dr_size ==
1081 DR_MISALIGNMENT (dr_peel) / dr_peel_size);
1082 SET_DR_MISALIGNMENT (dr, 0);
1083 return;
1084 }
1085
1086 if (known_alignment_for_access_p (dr)
1087 && known_alignment_for_access_p (dr_peel))
1088 {
1089 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1090 int misal = DR_MISALIGNMENT (dr);
1091 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1092 misal += negative ? -npeel * dr_size : npeel * dr_size;
1093 misal &= (TYPE_ALIGN (vectype) / BITS_PER_UNIT) - 1;
1094 SET_DR_MISALIGNMENT (dr, misal);
1095 return;
1096 }
1097
1098 if (dump_enabled_p ())
1099 dump_printf_loc (MSG_NOTE, vect_location, "Setting misalignment to -1.");
1100 SET_DR_MISALIGNMENT (dr, -1);
1101 }
1102
1103
1104 /* Function vect_verify_datarefs_alignment
1105
1106 Return TRUE if all data references in the loop can be
1107 handled with respect to alignment. */
1108
1109 bool
1110 vect_verify_datarefs_alignment (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
1111 {
1112 VEC (data_reference_p, heap) *datarefs;
1113 struct data_reference *dr;
1114 enum dr_alignment_support supportable_dr_alignment;
1115 unsigned int i;
1116
1117 if (loop_vinfo)
1118 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1119 else
1120 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
1121
1122 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
1123 {
1124 gimple stmt = DR_STMT (dr);
1125 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1126
1127 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1128 continue;
1129
1130 /* For interleaving, only the alignment of the first access matters.
1131 Skip statements marked as not vectorizable. */
1132 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
1133 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1134 || !STMT_VINFO_VECTORIZABLE (stmt_info))
1135 continue;
1136
1137 /* Strided loads perform only component accesses, alignment is
1138 irrelevant for them. */
1139 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1140 continue;
1141
1142 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1143 if (!supportable_dr_alignment)
1144 {
1145 if (dump_enabled_p ())
1146 {
1147 if (DR_IS_READ (dr))
1148 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1149 "not vectorized: unsupported unaligned load.");
1150 else
1151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1152 "not vectorized: unsupported unaligned "
1153 "store.");
1154
1155 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
1156 DR_REF (dr));
1157 }
1158 return false;
1159 }
1160 if (supportable_dr_alignment != dr_aligned && dump_enabled_p ())
1161 dump_printf_loc (MSG_NOTE, vect_location,
1162 "Vectorizing an unaligned access.");
1163 }
1164 return true;
1165 }
1166
1167 /* Given a memory reference EXP return whether its alignment is less
1168 than its size. */
1169
1170 static bool
1171 not_size_aligned (tree exp)
1172 {
1173 if (!host_integerp (TYPE_SIZE (TREE_TYPE (exp)), 1))
1174 return true;
1175
1176 return (TREE_INT_CST_LOW (TYPE_SIZE (TREE_TYPE (exp)))
1177 > get_object_alignment (exp));
1178 }
1179
1180 /* Function vector_alignment_reachable_p
1181
1182 Return true if vector alignment for DR is reachable by peeling
1183 a few loop iterations. Return false otherwise. */
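/* As an informal example: with 4-byte elements and a 16-byte vector type,
   a known misalignment of 4 bytes is fixed by peeling 3 scalar iterations,
   whereas a misalignment of 2 bytes can never be peeled away; the latter
   is what the "data size does not divide the misalignment" check below
   rejects.  */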
1184
1185 static bool
1186 vector_alignment_reachable_p (struct data_reference *dr)
1187 {
1188 gimple stmt = DR_STMT (dr);
1189 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1190 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1191
1192 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1193 {
1194 /* For interleaved access we peel only if the number of iterations in
1195 the prolog loop (VF - misalignment) is a multiple of the
1196 number of interleaved accesses. */
1197 int elem_size, mis_in_elements;
1198 int nelements = TYPE_VECTOR_SUBPARTS (vectype);
1199
1200 /* FORNOW: handle only known alignment. */
1201 if (!known_alignment_for_access_p (dr))
1202 return false;
1203
1204 elem_size = GET_MODE_SIZE (TYPE_MODE (vectype)) / nelements;
1205 mis_in_elements = DR_MISALIGNMENT (dr) / elem_size;
1206
1207 if ((nelements - mis_in_elements) % GROUP_SIZE (stmt_info))
1208 return false;
1209 }
1210
1211 /* If misalignment is known at the compile time then allow peeling
1212 only if natural alignment is reachable through peeling. */
1213 if (known_alignment_for_access_p (dr) && !aligned_access_p (dr))
1214 {
1215 HOST_WIDE_INT elmsize =
1216 int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1217 if (dump_enabled_p ())
1218 {
1219 dump_printf_loc (MSG_NOTE, vect_location,
1220 "data size =" HOST_WIDE_INT_PRINT_DEC, elmsize);
1221 dump_printf (MSG_NOTE,
1222 ". misalignment = %d. ", DR_MISALIGNMENT (dr));
1223 }
1224 if (DR_MISALIGNMENT (dr) % elmsize)
1225 {
1226 if (dump_enabled_p ())
1227 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1228 "data size does not divide the misalignment.\n");
1229 return false;
1230 }
1231 }
1232
1233 if (!known_alignment_for_access_p (dr))
1234 {
1235 tree type = TREE_TYPE (DR_REF (dr));
1236 bool is_packed = not_size_aligned (DR_REF (dr));
1237 if (dump_enabled_p ())
1238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1239 "Unknown misalignment, is_packed = %d",is_packed);
1240 if (targetm.vectorize.vector_alignment_reachable (type, is_packed))
1241 return true;
1242 else
1243 return false;
1244 }
1245
1246 return true;
1247 }
1248
1249
1250 /* Calculate the cost of the memory access represented by DR. */
1251
1252 static void
1253 vect_get_data_access_cost (struct data_reference *dr,
1254 unsigned int *inside_cost,
1255 unsigned int *outside_cost,
1256 stmt_vector_for_cost *body_cost_vec)
1257 {
1258 gimple stmt = DR_STMT (dr);
1259 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1260 int nunits = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
1261 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1262 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1263 int ncopies = vf / nunits;
1264
1265 if (DR_IS_READ (dr))
1266 vect_get_load_cost (dr, ncopies, true, inside_cost, outside_cost,
1267 NULL, body_cost_vec, false);
1268 else
1269 vect_get_store_cost (dr, ncopies, inside_cost, body_cost_vec);
1270
1271 if (dump_enabled_p ())
1272 dump_printf_loc (MSG_NOTE, vect_location,
1273 "vect_get_data_access_cost: inside_cost = %d, "
1274 "outside_cost = %d.", *inside_cost, *outside_cost);
1275 }
1276
1277
1278 static hashval_t
1279 vect_peeling_hash (const void *elem)
1280 {
1281 const struct _vect_peel_info *peel_info;
1282
1283 peel_info = (const struct _vect_peel_info *) elem;
1284 return (hashval_t) peel_info->npeel;
1285 }
1286
1287
1288 static int
1289 vect_peeling_hash_eq (const void *elem1, const void *elem2)
1290 {
1291 const struct _vect_peel_info *a, *b;
1292
1293 a = (const struct _vect_peel_info *) elem1;
1294 b = (const struct _vect_peel_info *) elem2;
1295 return (a->npeel == b->npeel);
1296 }
1297
1298
1299 /* Insert DR into peeling hash table with NPEEL as key. */
1300
1301 static void
1302 vect_peeling_hash_insert (loop_vec_info loop_vinfo, struct data_reference *dr,
1303 int npeel)
1304 {
1305 struct _vect_peel_info elem, *slot;
1306 void **new_slot;
1307 bool supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1308
1309 elem.npeel = npeel;
1310 slot = (vect_peel_info) htab_find (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1311 &elem);
1312 if (slot)
1313 slot->count++;
1314 else
1315 {
1316 slot = XNEW (struct _vect_peel_info);
1317 slot->npeel = npeel;
1318 slot->dr = dr;
1319 slot->count = 1;
1320 new_slot = htab_find_slot (LOOP_VINFO_PEELING_HTAB (loop_vinfo), slot,
1321 INSERT);
1322 *new_slot = slot;
1323 }
1324
1325 if (!supportable_dr_alignment && !flag_vect_cost_model)
1326 slot->count += VECT_MAX_COST;
1327 }
1328
1329
1330 /* Traverse the peeling hash table to find the peeling option that aligns
1331 the maximum number of data accesses. */
1332
1333 static int
1334 vect_peeling_hash_get_most_frequent (void **slot, void *data)
1335 {
1336 vect_peel_info elem = (vect_peel_info) *slot;
1337 vect_peel_extended_info max = (vect_peel_extended_info) data;
1338
1339 if (elem->count > max->peel_info.count
1340 || (elem->count == max->peel_info.count
1341 && max->peel_info.npeel > elem->npeel))
1342 {
1343 max->peel_info.npeel = elem->npeel;
1344 max->peel_info.count = elem->count;
1345 max->peel_info.dr = elem->dr;
1346 }
1347
1348 return 1;
1349 }
1350
1351
1352 /* Traverse the peeling hash table and calculate the cost for each peeling
1353 option.  Find the one with the lowest cost. */
1354
1355 static int
1356 vect_peeling_hash_get_lowest_cost (void **slot, void *data)
1357 {
1358 vect_peel_info elem = (vect_peel_info) *slot;
1359 vect_peel_extended_info min = (vect_peel_extended_info) data;
1360 int save_misalignment, dummy;
1361 unsigned int inside_cost = 0, outside_cost = 0, i;
1362 gimple stmt = DR_STMT (elem->dr);
1363 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1364 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
1365 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1366 struct data_reference *dr;
1367 stmt_vector_for_cost prologue_cost_vec, body_cost_vec, epilogue_cost_vec;
1368 int single_iter_cost;
1369
1370 prologue_cost_vec = VEC_alloc (stmt_info_for_cost, heap, 2);
1371 body_cost_vec = VEC_alloc (stmt_info_for_cost, heap, 2);
1372 epilogue_cost_vec = VEC_alloc (stmt_info_for_cost, heap, 2);
1373
1374 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
1375 {
1376 stmt = DR_STMT (dr);
1377 stmt_info = vinfo_for_stmt (stmt);
1378 /* For interleaving, only the alignment of the first access
1379 matters. */
1380 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1381 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1382 continue;
1383
1384 save_misalignment = DR_MISALIGNMENT (dr);
1385 vect_update_misalignment_for_peel (dr, elem->dr, elem->npeel);
1386 vect_get_data_access_cost (dr, &inside_cost, &outside_cost,
1387 &body_cost_vec);
1388 SET_DR_MISALIGNMENT (dr, save_misalignment);
1389 }
1390
1391 single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
1392 outside_cost += vect_get_known_peeling_cost (loop_vinfo, elem->npeel,
1393 &dummy, single_iter_cost,
1394 &prologue_cost_vec,
1395 &epilogue_cost_vec);
1396
1397 /* Prologue and epilogue costs are added to the target model later.
1398 These costs depend only on the scalar iteration cost, the
1399 number of peeling iterations finally chosen, and the number of
1400 misaligned statements. So discard the information found here. */
1401 VEC_free (stmt_info_for_cost, heap, prologue_cost_vec);
1402 VEC_free (stmt_info_for_cost, heap, epilogue_cost_vec);
1403
1404 if (inside_cost < min->inside_cost
1405 || (inside_cost == min->inside_cost && outside_cost < min->outside_cost))
1406 {
1407 min->inside_cost = inside_cost;
1408 min->outside_cost = outside_cost;
1409 VEC_free (stmt_info_for_cost, heap, min->body_cost_vec);
1410 min->body_cost_vec = body_cost_vec;
1411 min->peel_info.dr = elem->dr;
1412 min->peel_info.npeel = elem->npeel;
1413 }
1414 else
1415 VEC_free (stmt_info_for_cost, heap, body_cost_vec);
1416
1417 return 1;
1418 }
1419
1420
1421 /* Choose the best peeling option by traversing the peeling hash table and
1422 either choosing the option with the lowest cost (if the cost model is
1423 enabled) or the option that aligns as many accesses as possible. */
1424
1425 static struct data_reference *
1426 vect_peeling_hash_choose_best_peeling (loop_vec_info loop_vinfo,
1427 unsigned int *npeel,
1428 stmt_vector_for_cost *body_cost_vec)
1429 {
1430 struct _vect_peel_extended_info res;
1431
1432 res.peel_info.dr = NULL;
1433 res.body_cost_vec = NULL;
1434
1435 if (flag_vect_cost_model)
1436 {
1437 res.inside_cost = INT_MAX;
1438 res.outside_cost = INT_MAX;
1439 htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1440 vect_peeling_hash_get_lowest_cost, &res);
1441 }
1442 else
1443 {
1444 res.peel_info.count = 0;
1445 htab_traverse (LOOP_VINFO_PEELING_HTAB (loop_vinfo),
1446 vect_peeling_hash_get_most_frequent, &res);
1447 }
1448
1449 *npeel = res.peel_info.npeel;
1450 *body_cost_vec = res.body_cost_vec;
1451 return res.peel_info.dr;
1452 }
1453
1454
1455 /* Function vect_enhance_data_refs_alignment
1456
1457 This pass will use loop versioning and loop peeling in order to enhance
1458 the alignment of data references in the loop.
1459
1460 FOR NOW: we assume that whatever versioning/peeling takes place, only the
1461 original loop is to be vectorized. Any other loops that are created by
1462 the transformations performed in this pass are not supposed to be
1463 vectorized. This restriction will be relaxed.
1464
1465 This pass will require a cost model to guide it whether to apply peeling
1466 or versioning or a combination of the two. For example, the scheme that
1467 Intel uses when given a loop with several memory accesses is as follows:
1468 choose one memory access ('p') whose alignment you want to force by doing
1469 peeling. Then, either (1) generate a loop in which 'p' is aligned and all
1470 other accesses are not necessarily aligned, or (2) use loop versioning to
1471 generate one loop in which all accesses are aligned, and another loop in
1472 which only 'p' is necessarily aligned.
1473
1474 ("Automatic Intra-Register Vectorization for the Intel Architecture",
1475 Aart J.C. Bik, Milind Girkar, Paul M. Grey and Xinmin Tian, International
1476 Journal of Parallel Programming, Vol. 30, No. 2, April 2002.)
1477
1478 Devising a cost model is the most critical aspect of this work. It will
1479 guide us on which access to peel for, whether to use loop versioning, how
1480 many versions to create, etc. The cost model will probably consist of
1481 generic considerations as well as target specific considerations (on
1482 powerpc for example, misaligned stores are more painful than misaligned
1483 loads).
1484
1485 Here are the general steps involved in alignment enhancements:
1486
1487 -- original loop, before alignment analysis:
1488 for (i=0; i<N; i++){
1489 x = q[i]; # DR_MISALIGNMENT(q) = unknown
1490 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1491 }
1492
1493 -- After vect_compute_data_refs_alignment:
1494 for (i=0; i<N; i++){
1495 x = q[i]; # DR_MISALIGNMENT(q) = 3
1496 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1497 }
1498
1499 -- Possibility 1: we do loop versioning:
1500 if (p is aligned) {
1501 for (i=0; i<N; i++){ # loop 1A
1502 x = q[i]; # DR_MISALIGNMENT(q) = 3
1503 p[i] = y; # DR_MISALIGNMENT(p) = 0
1504 }
1505 }
1506 else {
1507 for (i=0; i<N; i++){ # loop 1B
1508 x = q[i]; # DR_MISALIGNMENT(q) = 3
1509 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1510 }
1511 }
1512
1513 -- Possibility 2: we do loop peeling:
1514 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1515 x = q[i];
1516 p[i] = y;
1517 }
1518 for (i = 3; i < N; i++){ # loop 2A
1519 x = q[i]; # DR_MISALIGNMENT(q) = 0
1520 p[i] = y; # DR_MISALIGNMENT(p) = unknown
1521 }
1522
1523 -- Possibility 3: combination of loop peeling and versioning:
1524 for (i = 0; i < 3; i++){ # (scalar loop, not to be vectorized).
1525 x = q[i];
1526 p[i] = y;
1527 }
1528 if (p is aligned) {
1529 for (i = 3; i<N; i++){ # loop 3A
1530 x = q[i]; # DR_MISALIGNMENT(q) = 0
1531 p[i] = y; # DR_MISALIGNMENT(p) = 0
1532 }
1533 }
1534 else {
1535 for (i = 3; i<N; i++){ # loop 3B
1536 x = q[i]; # DR_MISALIGNMENT(q) = 0
1537 p[i] = y; # DR_MISALIGNMENT(p) = unaligned
1538 }
1539 }
1540
1541 These loops are later passed to loop_transform to be vectorized. The
1542 vectorizer will use the alignment information to guide the transformation
1543 (whether to generate regular loads/stores, or with special handling for
1544 misalignment). */
1545
1546 bool
1547 vect_enhance_data_refs_alignment (loop_vec_info loop_vinfo)
1548 {
1549 VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1550 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1551 enum dr_alignment_support supportable_dr_alignment;
1552 struct data_reference *dr0 = NULL, *first_store = NULL;
1553 struct data_reference *dr;
1554 unsigned int i, j;
1555 bool do_peeling = false;
1556 bool do_versioning = false;
1557 bool stat;
1558 gimple stmt;
1559 stmt_vec_info stmt_info;
1560 int vect_versioning_for_alias_required;
1561 unsigned int npeel = 0;
1562 bool all_misalignments_unknown = true;
1563 unsigned int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1564 unsigned possible_npeel_number = 1;
1565 tree vectype;
1566 unsigned int nelements, mis, same_align_drs_max = 0;
1567 stmt_vector_for_cost body_cost_vec = NULL;
1568
1569 if (dump_enabled_p ())
1570 dump_printf_loc (MSG_NOTE, vect_location,
1571 "=== vect_enhance_data_refs_alignment ===");
1572
1573 /* While cost model enhancements are expected in the future, the high level
1574 view of the code at this time is as follows:
1575
1576 A) If there is a misaligned access then see if peeling to align
1577 this access can make all data references satisfy
1578 vect_supportable_dr_alignment. If so, update data structures
1579 as needed and return true.
1580
1581 B) If peeling wasn't possible and there is a data reference with an
1582 unknown misalignment that does not satisfy vect_supportable_dr_alignment
1583 then see if loop versioning checks can be used to make all data
1584 references satisfy vect_supportable_dr_alignment. If so, update
1585 data structures as needed and return true.
1586
1587 C) If neither peeling nor versioning were successful then return false if
1588 any data reference does not satisfy vect_supportable_dr_alignment.
1589
1590 D) Return true (all data references satisfy vect_supportable_dr_alignment).
1591
1592 Note, Possibility 3 above (which is peeling and versioning together) is not
1593 being done at this time. */
1594
1595 /* (1) Peeling to force alignment. */
1596
1597 /* (1.1) Decide whether to perform peeling, and how many iterations to peel:
1598 Considerations:
1599 + How many accesses will become aligned due to the peeling
1600 - How many accesses will become unaligned due to the peeling,
1601 and the cost of misaligned accesses.
1602 - The cost of peeling (the extra runtime checks, the increase
1603 in code size). */
1604
1605 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
1606 {
1607 stmt = DR_STMT (dr);
1608 stmt_info = vinfo_for_stmt (stmt);
1609
1610 if (!STMT_VINFO_RELEVANT_P (stmt_info))
1611 continue;
1612
1613 /* For interleaving, only the alignment of the first access
1614 matters. */
1615 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1616 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1617 continue;
1618
1619 /* FORNOW: Any strided load prevents peeling. The induction
1620 variable analysis will fail when the prologue loop is generated,
1621 and so we can't generate the new base for the pointer. */
1622 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1623 {
1624 if (dump_enabled_p ())
1625 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1626 "strided load prevents peeling");
1627 do_peeling = false;
1628 break;
1629 }
1630
1631 /* For invariant accesses there is nothing to enhance. */
1632 if (integer_zerop (DR_STEP (dr)))
1633 continue;
1634
1635 /* Strided loads perform only component accesses, alignment is
1636 irrelevant for them. */
1637 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1638 continue;
1639
1640 supportable_dr_alignment = vect_supportable_dr_alignment (dr, true);
1641 do_peeling = vector_alignment_reachable_p (dr);
1642 if (do_peeling)
1643 {
1644 if (known_alignment_for_access_p (dr))
1645 {
1646 unsigned int npeel_tmp;
1647 bool negative = tree_int_cst_compare (DR_STEP (dr),
1648 size_zero_node) < 0;
1649
1650 /* Save info about DR in the hash table. */
1651 if (!LOOP_VINFO_PEELING_HTAB (loop_vinfo))
1652 LOOP_VINFO_PEELING_HTAB (loop_vinfo) =
1653 htab_create (1, vect_peeling_hash,
1654 vect_peeling_hash_eq, free);
1655
1656 vectype = STMT_VINFO_VECTYPE (stmt_info);
1657 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1658 mis = DR_MISALIGNMENT (dr) / GET_MODE_SIZE (TYPE_MODE (
1659 TREE_TYPE (DR_REF (dr))));
1660 npeel_tmp = (negative
1661 ? (mis - nelements) : (nelements - mis))
1662 & (nelements - 1);
1663
1664 /* For multiple types, it is possible that the bigger type access
1665 will have more than one peeling option. E.g., a loop with two
1666 types: one of size (vector size / 4), and the other one of
1667 size (vector size / 8). The vectorization factor will be 8. If both
1668 accesses are misaligned by 3, the first one needs one scalar
1669 iteration to be aligned, and the second one needs 5. But the
1670 first one will also be aligned by peeling 5 scalar
1671 iterations, and in that case both accesses will be aligned.
1672 Hence, except for the immediate peeling amount, we also want
1673 to try to add the full vector size, as long as we don't exceed
1674 the vectorization factor.
1675 We do this automatically for the cost model, since we calculate the cost
1676 for every peeling option. */
1677 if (!flag_vect_cost_model)
1678 possible_npeel_number = vf /nelements;
1679
1680 /* Handle the aligned case. We may decide to align some other
1681 access, making DR unaligned. */
1682 if (DR_MISALIGNMENT (dr) == 0)
1683 {
1684 npeel_tmp = 0;
1685 if (!flag_vect_cost_model)
1686 possible_npeel_number++;
1687 }
1688
1689 for (j = 0; j < possible_npeel_number; j++)
1690 {
1691 gcc_assert (npeel_tmp <= vf);
1692 vect_peeling_hash_insert (loop_vinfo, dr, npeel_tmp);
1693 npeel_tmp += nelements;
1694 }
1695
1696 all_misalignments_unknown = false;
1697 /* The data-ref that was chosen for the case in which all the
1698 misalignments are unknown is no longer relevant, since we now
1699 have a data-ref with known alignment. */
1700 dr0 = NULL;
1701 }
1702 else
1703 {
1704 /* If we don't know all the misalignment values, we prefer
1705 peeling for the data-ref that has the maximum number of data-refs
1706 with the same alignment, unless the target prefers to align
1707 stores over loads. */
1708 if (all_misalignments_unknown)
1709 {
1710 if (same_align_drs_max < VEC_length (dr_p,
1711 STMT_VINFO_SAME_ALIGN_REFS (stmt_info))
1712 || !dr0)
1713 {
1714 same_align_drs_max = VEC_length (dr_p,
1715 STMT_VINFO_SAME_ALIGN_REFS (stmt_info));
1716 dr0 = dr;
1717 }
1718
1719 if (!first_store && DR_IS_WRITE (dr))
1720 first_store = dr;
1721 }
1722
1723 /* If there are both known and unknown misaligned accesses in the
1724 loop, we choose the peeling amount according to the known
1725 accesses. */
1726
1727
1728 if (!supportable_dr_alignment)
1729 {
1730 dr0 = dr;
1731 if (!first_store && DR_IS_WRITE (dr))
1732 first_store = dr;
1733 }
1734 }
1735 }
1736 else
1737 {
1738 if (!aligned_access_p (dr))
1739 {
1740 if (dump_enabled_p ())
1741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1742 "vector alignment may not be reachable");
1743 break;
1744 }
1745 }
1746 }
1747
1748 vect_versioning_for_alias_required
1749 = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
1750
1751 /* Temporarily, if versioning for alias is required, we disable peeling
1752 until we support peeling and versioning. Often peeling for alignment
1753 will require peeling for loop-bound, which in turn requires that we
1754 know how to adjust the loop ivs after the loop. */
1755 if (vect_versioning_for_alias_required
1756 || !vect_can_advance_ivs_p (loop_vinfo)
1757 || !slpeel_can_duplicate_loop_p (loop, single_exit (loop)))
1758 do_peeling = false;
1759
1760 if (do_peeling && all_misalignments_unknown
1761 && vect_supportable_dr_alignment (dr0, false))
1762 {
1763
1764 /* Check if the target prefers to align stores over loads, i.e., if
1765 misaligned stores are more expensive than misaligned loads (taking
1766 drs with the same alignment into account). */
1767 if (first_store && DR_IS_READ (dr0))
1768 {
1769 unsigned int load_inside_cost = 0, load_outside_cost = 0;
1770 unsigned int store_inside_cost = 0, store_outside_cost = 0;
1771 unsigned int load_inside_penalty = 0, load_outside_penalty = 0;
1772 unsigned int store_inside_penalty = 0, store_outside_penalty = 0;
1773 stmt_vector_for_cost dummy = VEC_alloc (stmt_info_for_cost, heap, 2);
1774
1775 vect_get_data_access_cost (dr0, &load_inside_cost, &load_outside_cost,
1776 &dummy);
1777 vect_get_data_access_cost (first_store, &store_inside_cost,
1778 &store_outside_cost, &dummy);
1779
1780 VEC_free (stmt_info_for_cost, heap, dummy);
1781
1782 /* Calculate the penalty for leaving FIRST_STORE unaligned (by
1783 aligning the load DR0). */
1784 load_inside_penalty = store_inside_cost;
1785 load_outside_penalty = store_outside_cost;
1786 for (i = 0; VEC_iterate (dr_p, STMT_VINFO_SAME_ALIGN_REFS
1787 (vinfo_for_stmt (DR_STMT (first_store))),
1788 i, dr);
1789 i++)
1790 if (DR_IS_READ (dr))
1791 {
1792 load_inside_penalty += load_inside_cost;
1793 load_outside_penalty += load_outside_cost;
1794 }
1795 else
1796 {
1797 load_inside_penalty += store_inside_cost;
1798 load_outside_penalty += store_outside_cost;
1799 }
1800
1801 /* Calculate the penalty for leaving DR0 unaligned (by
1802 aligning the FIRST_STORE). */
1803 store_inside_penalty = load_inside_cost;
1804 store_outside_penalty = load_outside_cost;
1805 for (i = 0; VEC_iterate (dr_p, STMT_VINFO_SAME_ALIGN_REFS
1806 (vinfo_for_stmt (DR_STMT (dr0))),
1807 i, dr);
1808 i++)
1809 if (DR_IS_READ (dr))
1810 {
1811 store_inside_penalty += load_inside_cost;
1812 store_outside_penalty += load_outside_cost;
1813 }
1814 else
1815 {
1816 store_inside_penalty += store_inside_cost;
1817 store_outside_penalty += store_outside_cost;
1818 }
1819
1820 if (load_inside_penalty > store_inside_penalty
1821 || (load_inside_penalty == store_inside_penalty
1822 && load_outside_penalty > store_outside_penalty))
1823 dr0 = first_store;
1824 }
1825
1826 /* In case there are only loads with different unknown misalignments, use
1827 peeling only if it may help to align other accesses in the loop. */
1828 if (!first_store && !VEC_length (dr_p, STMT_VINFO_SAME_ALIGN_REFS
1829 (vinfo_for_stmt (DR_STMT (dr0))))
1830 && vect_supportable_dr_alignment (dr0, false)
1831 != dr_unaligned_supported)
1832 do_peeling = false;
1833 }
1834
1835 if (do_peeling && !dr0)
1836 {
1837 /* Peeling is possible, but no data access requires alignment in order to
1838 be supported. So we try to choose the best possible peeling. */
1839
1840 /* We should get here only if there are drs with known misalignment. */
1841 gcc_assert (!all_misalignments_unknown);
1842
1843 /* Choose the best peeling from the hash table. */
1844 dr0 = vect_peeling_hash_choose_best_peeling (loop_vinfo, &npeel,
1845 &body_cost_vec);
1846 if (!dr0 || !npeel)
1847 do_peeling = false;
1848 }
1849
1850 if (do_peeling)
1851 {
1852 stmt = DR_STMT (dr0);
1853 stmt_info = vinfo_for_stmt (stmt);
1854 vectype = STMT_VINFO_VECTYPE (stmt_info);
1855 nelements = TYPE_VECTOR_SUBPARTS (vectype);
1856
1857 if (known_alignment_for_access_p (dr0))
1858 {
1859 bool negative = tree_int_cst_compare (DR_STEP (dr0),
1860 size_zero_node) < 0;
1861 if (!npeel)
1862 {
1863 /* Since it's known at compile time, compute the number of
1864 iterations in the peeled loop (the peeling factor) for use in
1865 updating DR_MISALIGNMENT values. The peeling factor is the
1866 vectorization factor minus the misalignment as an element
1867 count. */
1868 mis = DR_MISALIGNMENT (dr0);
1869 mis /= GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr0))));
1870 npeel = ((negative ? mis - nelements : nelements - mis)
1871 & (nelements - 1));
1872 }
1873
1874 /* For interleaved data access every iteration accesses all the
1875 members of the group, therefore we divide the number of iterations
1876 by the group size. */
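/* Illustrative example: if NPEEL was computed as 8 element iterations
and the interleaving group has 4 members, each scalar iteration covers
the whole group, so only 8 / 4 = 2 loop iterations need to be peeled. */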
1877 stmt_info = vinfo_for_stmt (DR_STMT (dr0));
1878 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1879 npeel /= GROUP_SIZE (stmt_info);
1880
1881 if (dump_enabled_p ())
1882 dump_printf_loc (MSG_NOTE, vect_location,
1883 "Try peeling by %d", npeel);
1884 }
1885
1886 /* Ensure that all data refs can be vectorized after the peel. */
1887 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
1888 {
1889 int save_misalignment;
1890
1891 if (dr == dr0)
1892 continue;
1893
1894 stmt = DR_STMT (dr);
1895 stmt_info = vinfo_for_stmt (stmt);
1896 /* For interleaving, only the alignment of the first access
1897 matters. */
1898 if (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1899 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
1900 continue;
1901
1902 /* Strided loads perform only component accesses, alignment is
1903 irrelevant for them. */
1904 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
1905 continue;
1906
1907 save_misalignment = DR_MISALIGNMENT (dr);
1908 vect_update_misalignment_for_peel (dr, dr0, npeel);
1909 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
1910 SET_DR_MISALIGNMENT (dr, save_misalignment);
1911
1912 if (!supportable_dr_alignment)
1913 {
1914 do_peeling = false;
1915 break;
1916 }
1917 }
1918
1919 if (do_peeling && known_alignment_for_access_p (dr0) && npeel == 0)
1920 {
1921 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1922 if (!stat)
1923 do_peeling = false;
1924 else
1925 {
1926 VEC_free (stmt_info_for_cost, heap, body_cost_vec);
1927 return stat;
1928 }
1929 }
1930
1931 if (do_peeling)
1932 {
1933 stmt_info_for_cost *si;
1934 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
1935
1936 /* (1.2) Update the DR_MISALIGNMENT of each data reference DR_i.
1937 If the misalignment of DR_i is identical to that of dr0 then set
1938 DR_MISALIGNMENT (DR_i) to zero. If the misalignment of DR_i and
1939 dr0 are known at compile time then increment DR_MISALIGNMENT (DR_i)
1940 by the peeling factor times the element size of DR_i (MOD the
1941 vectorization factor times the size). Otherwise, the
1942 misalignment of DR_i must be set to unknown. */
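/* For instance (illustrative): with a peeling factor of 2 and a DR_i of
4-byte elements whose misalignment is known, DR_MISALIGNMENT (DR_i) is
advanced by 2 * 4 = 8 bytes, modulo the vectorization factor times that
element size. */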
1943 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
1944 if (dr != dr0)
1945 vect_update_misalignment_for_peel (dr, dr0, npeel);
1946
1947 LOOP_VINFO_UNALIGNED_DR (loop_vinfo) = dr0;
1948 if (npeel)
1949 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = npeel;
1950 else
1951 LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) = DR_MISALIGNMENT (dr0);
1952 SET_DR_MISALIGNMENT (dr0, 0);
1953 if (dump_enabled_p ())
1954 {
1955 dump_printf_loc (MSG_NOTE, vect_location,
1956 "Alignment of access forced using peeling.");
1957 dump_printf_loc (MSG_NOTE, vect_location,
1958 "Peeling for alignment will be applied.");
1959 }
1960 /* We've delayed passing the inside-loop peeling costs to the
1961 target cost model until we were sure peeling would happen.
1962 Do so now. */
1963 if (body_cost_vec)
1964 {
1965 FOR_EACH_VEC_ELT (stmt_info_for_cost, body_cost_vec, i, si)
1966 {
1967 struct _stmt_vec_info *stmt_info
1968 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1969 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
1970 si->misalign, vect_body);
1971 }
1972 VEC_free (stmt_info_for_cost, heap, body_cost_vec);
1973 }
1974
1975 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
1976 gcc_assert (stat);
1977 return stat;
1978 }
1979 }
1980
1981 VEC_free (stmt_info_for_cost, heap, body_cost_vec);
1982
1983 /* (2) Versioning to force alignment. */
1984
1985 /* Try versioning if:
1986 1) flag_tree_vect_loop_version is TRUE
1987 2) optimize loop for speed
1988 3) there is at least one unsupported misaligned data ref with an unknown
1989 misalignment, and
1990 4) all misaligned data refs with a known misalignment are supported, and
1991 5) the number of runtime alignment checks is within reason. */
1992
1993 do_versioning =
1994 flag_tree_vect_loop_version
1995 && optimize_loop_nest_for_speed_p (loop)
1996 && (!loop->inner); /* FORNOW */
1997
1998 if (do_versioning)
1999 {
2000 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
2001 {
2002 stmt = DR_STMT (dr);
2003 stmt_info = vinfo_for_stmt (stmt);
2004
2005 /* For interleaving, only the alignment of the first access
2006 matters. */
2007 if (aligned_access_p (dr)
2008 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2009 && GROUP_FIRST_ELEMENT (stmt_info) != stmt))
2010 continue;
2011
2012 /* Strided loads perform only component accesses, alignment is
2013 irrelevant for them. */
2014 if (STMT_VINFO_STRIDE_LOAD_P (stmt_info))
2015 continue;
2016
2017 supportable_dr_alignment = vect_supportable_dr_alignment (dr, false);
2018
2019 if (!supportable_dr_alignment)
2020 {
2021 gimple stmt;
2022 int mask;
2023 tree vectype;
2024
2025 if (known_alignment_for_access_p (dr)
2026 || VEC_length (gimple,
2027 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
2028 >= (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS))
2029 {
2030 do_versioning = false;
2031 break;
2032 }
2033
2034 stmt = DR_STMT (dr);
2035 vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
2036 gcc_assert (vectype);
2037
2038 /* The rightmost bits of an aligned address must be zeros.
2039 Construct the mask needed for this test. For example,
2040 GET_MODE_SIZE for the vector mode V4SI is 16 bytes so the
2041 mask must be 15 = 0xf. */
2042 mask = GET_MODE_SIZE (TYPE_MODE (vectype)) - 1;
2043
2044 /* FORNOW: use the same mask to test all potentially unaligned
2045 references in the loop. The vectorizer currently supports
2046 a single vector size, see the reference to
2047 GET_MODE_NUNITS (TYPE_MODE (vectype)) where the
2048 vectorization factor is computed. */
2049 gcc_assert (!LOOP_VINFO_PTR_MASK (loop_vinfo)
2050 || LOOP_VINFO_PTR_MASK (loop_vinfo) == mask);
2051 LOOP_VINFO_PTR_MASK (loop_vinfo) = mask;
2052 VEC_safe_push (gimple, heap,
2053 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo),
2054 DR_STMT (dr));
2055 }
2056 }
2057
2058 /* Versioning requires at least one misaligned data reference. */
2059 if (!LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2060 do_versioning = false;
2061 else if (!do_versioning)
2062 VEC_truncate (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo), 0);
2063 }
2064
2065 if (do_versioning)
2066 {
2067 VEC(gimple,heap) *may_misalign_stmts
2068 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2069 gimple stmt;
2070
2071 /* It can now be assumed that the data references in the statements
2072 in LOOP_VINFO_MAY_MISALIGN_STMTS will be aligned in the version
2073 of the loop being vectorized. */
2074 FOR_EACH_VEC_ELT (gimple, may_misalign_stmts, i, stmt)
2075 {
2076 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2077 dr = STMT_VINFO_DATA_REF (stmt_info);
2078 SET_DR_MISALIGNMENT (dr, 0);
2079 if (dump_enabled_p ())
2080 dump_printf_loc (MSG_NOTE, vect_location,
2081 "Alignment of access forced using versioning.");
2082 }
2083
2084 if (dump_enabled_p ())
2085 dump_printf_loc (MSG_NOTE, vect_location,
2086 "Versioning for alignment will be applied.");
2087
2088 /* Peeling and versioning can't be done together at this time. */
2089 gcc_assert (! (do_peeling && do_versioning));
2090
2091 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
2092 gcc_assert (stat);
2093 return stat;
2094 }
2095
2096 /* This point is reached if neither peeling nor versioning is being done. */
2097 gcc_assert (! (do_peeling || do_versioning));
2098
2099 stat = vect_verify_datarefs_alignment (loop_vinfo, NULL);
2100 return stat;
2101 }
2102
2103
2104 /* Function vect_find_same_alignment_drs.
2105
2106 Update group and alignment relations according to the chosen
2107 vectorization factor. */
2108
2109 static void
2110 vect_find_same_alignment_drs (struct data_dependence_relation *ddr,
2111 loop_vec_info loop_vinfo)
2112 {
2113 unsigned int i;
2114 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2115 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2116 struct data_reference *dra = DDR_A (ddr);
2117 struct data_reference *drb = DDR_B (ddr);
2118 stmt_vec_info stmtinfo_a = vinfo_for_stmt (DR_STMT (dra));
2119 stmt_vec_info stmtinfo_b = vinfo_for_stmt (DR_STMT (drb));
2120 int dra_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dra))));
2121 int drb_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (drb))));
2122 lambda_vector dist_v;
2123 unsigned int loop_depth;
2124
2125 if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
2126 return;
2127
2128 if (dra == drb)
2129 return;
2130
2131 if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
2132 return;
2133
2134 /* Loop-based vectorization and known data dependence. */
2135 if (DDR_NUM_DIST_VECTS (ddr) == 0)
2136 return;
2137
2138 /* Data-dependence analysis reports a distance vector of zero
2139 for data-references that overlap only in the first iteration
2140 but have steps of different sign (see PR45764).
2141 So as a sanity check require equal DR_STEP. */
2142 if (!operand_equal_p (DR_STEP (dra), DR_STEP (drb), 0))
2143 return;
2144
2145 loop_depth = index_in_loop_nest (loop->num, DDR_LOOP_NEST (ddr));
2146 FOR_EACH_VEC_ELT (lambda_vector, DDR_DIST_VECTS (ddr), i, dist_v)
2147 {
2148 int dist = dist_v[loop_depth];
2149
2150 if (dump_enabled_p ())
2151 dump_printf_loc (MSG_NOTE, vect_location,
2152 "dependence distance = %d.", dist);
2153
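/* Illustrative example: with a vectorization factor of 4 and equal-sized
accesses, a dependence distance of 0 or of 4 (a multiple of VF) means the
two references are a whole number of vectors apart and therefore share
the same alignment. */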
2154 /* Same loop iteration. */
2155 if (dist == 0
2156 || (dist % vectorization_factor == 0 && dra_size == drb_size))
2157 {
2158 /* Two references with distance zero have the same alignment. */
2159 VEC_safe_push (dr_p, heap, STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_a), drb);
2160 VEC_safe_push (dr_p, heap, STMT_VINFO_SAME_ALIGN_REFS (stmtinfo_b), dra);
2161 if (dump_enabled_p ())
2162 {
2163 dump_printf_loc (MSG_NOTE, vect_location,
2164 "accesses have the same alignment.");
2165 dump_printf (MSG_NOTE,
2166 "dependence distance modulo vf == 0 between ");
2167 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dra));
2168 dump_printf (MSG_NOTE, " and ");
2169 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (drb));
2170 }
2171 }
2172 }
2173 }
2174
2175
2176 /* Function vect_analyze_data_refs_alignment
2177
2178 Analyze the alignment of the data-references in the loop.
2179 Return FALSE if a data reference is found that cannot be vectorized. */
2180
2181 bool
2182 vect_analyze_data_refs_alignment (loop_vec_info loop_vinfo,
2183 bb_vec_info bb_vinfo)
2184 {
2185 if (dump_enabled_p ())
2186 dump_printf_loc (MSG_NOTE, vect_location,
2187 "=== vect_analyze_data_refs_alignment ===");
2188
2189 /* Mark groups of data references with same alignment using
2190 data dependence information. */
2191 if (loop_vinfo)
2192 {
2193 VEC (ddr_p, heap) *ddrs = LOOP_VINFO_DDRS (loop_vinfo);
2194 struct data_dependence_relation *ddr;
2195 unsigned int i;
2196
2197 FOR_EACH_VEC_ELT (ddr_p, ddrs, i, ddr)
2198 vect_find_same_alignment_drs (ddr, loop_vinfo);
2199 }
2200
2201 if (!vect_compute_data_refs_alignment (loop_vinfo, bb_vinfo))
2202 {
2203 if (dump_enabled_p ())
2204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2205 "not vectorized: can't calculate alignment "
2206 "for data ref.");
2207 return false;
2208 }
2209
2210 return true;
2211 }
2212
2213
2214 /* Analyze groups of accesses: check that DR belongs to a group of
2215 accesses of legal size, step, etc. Detect gaps, single element
2216 interleaving, and other special cases. Set grouped access info.
2217 Collect groups of strided stores for further use in SLP analysis. */
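/* Illustrative example: four int stores a[4*i], a[4*i+1], a[4*i+2] and
a[4*i+3] (DR_STEP 16) form an interleaving group of size 4 with no gaps,
whereas a lone int load a[4*i] with DR_STEP 16 is detected as single
element interleaving with gaps. */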
2218
2219 static bool
2220 vect_analyze_group_access (struct data_reference *dr)
2221 {
2222 tree step = DR_STEP (dr);
2223 tree scalar_type = TREE_TYPE (DR_REF (dr));
2224 HOST_WIDE_INT type_size = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (scalar_type));
2225 gimple stmt = DR_STMT (dr);
2226 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2227 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2228 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
2229 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2230 HOST_WIDE_INT groupsize, last_accessed_element = 1;
2231 bool slp_impossible = false;
2232 struct loop *loop = NULL;
2233
2234 if (loop_vinfo)
2235 loop = LOOP_VINFO_LOOP (loop_vinfo);
2236
2237 /* For interleaving, GROUPSIZE is STEP counted in elements, i.e., the
2238 size of the interleaving group (including gaps). */
2239 groupsize = dr_step / type_size;
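/* Illustrative example: int accesses (type_size 4) with DR_STEP 16 give
GROUPSIZE 4 - the group spans four elements per iteration, whether or
not all of them are actually accessed. */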
2240
2241 /* A non-consecutive access is possible only if it is part of an interleaving group. */
2242 if (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
2243 {
2244 /* Check if this DR is part of an interleaving group, and is a single
2245 element of the group that is accessed in the loop. */
2246
2247 /* Gaps are supported only for loads. STEP must be a multiple of the type
2248 size. The size of the group must be a power of 2. */
2249 if (DR_IS_READ (dr)
2250 && (dr_step % type_size) == 0
2251 && groupsize > 0
2252 && exact_log2 (groupsize) != -1)
2253 {
2254 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = stmt;
2255 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2256 if (dump_enabled_p ())
2257 {
2258 dump_printf_loc (MSG_NOTE, vect_location,
2259 "Detected single element interleaving ");
2260 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (dr));
2261 dump_printf (MSG_NOTE, " step ");
2262 dump_generic_expr (MSG_NOTE, TDF_SLIM, step);
2263 }
2264
2265 if (loop_vinfo)
2266 {
2267 if (dump_enabled_p ())
2268 dump_printf_loc (MSG_NOTE, vect_location,
2269 "Data access with gaps requires scalar "
2270 "epilogue loop");
2271 if (loop->inner)
2272 {
2273 if (dump_enabled_p ())
2274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2275 "Peeling for outer loop is not"
2276 " supported");
2277 return false;
2278 }
2279
2280 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2281 }
2282
2283 return true;
2284 }
2285
2286 if (dump_enabled_p ())
2287 {
2288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2289 "not consecutive access ");
2290 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
2291 }
2292
2293 if (bb_vinfo)
2294 {
2295 /* Mark the statement as unvectorizable. */
2296 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2297 return true;
2298 }
2299
2300 return false;
2301 }
2302
2303 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) == stmt)
2304 {
2305 /* First stmt in the interleaving chain. Check the chain. */
2306 gimple next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
2307 struct data_reference *data_ref = dr;
2308 unsigned int count = 1;
2309 tree next_step;
2310 tree prev_init = DR_INIT (data_ref);
2311 gimple prev = stmt;
2312 HOST_WIDE_INT diff, count_in_bytes, gaps = 0;
2313
2314 while (next)
2315 {
2316 /* Skip same data-refs. In case two or more stmts share a
2317 data-ref (supported only for loads), we vectorize only the first
2318 stmt, and the rest get their vectorized loads from the first
2319 one. */
2320 if (!tree_int_cst_compare (DR_INIT (data_ref),
2321 DR_INIT (STMT_VINFO_DATA_REF (
2322 vinfo_for_stmt (next)))))
2323 {
2324 if (DR_IS_WRITE (data_ref))
2325 {
2326 if (dump_enabled_p ())
2327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2328 "Two store stmts share the same dr.");
2329 return false;
2330 }
2331
2332 /* Check that there are no load-store dependencies for these loads,
2333 to prevent a load-store-load sequence to the same location. */
2334 if (GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (next))
2335 || GROUP_READ_WRITE_DEPENDENCE (vinfo_for_stmt (prev)))
2336 {
2337 if (dump_enabled_p ())
2338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2339 "READ_WRITE dependence in interleaving.");
2340 return false;
2341 }
2342
2343 /* For load use the same data-ref load. */
2344 GROUP_SAME_DR_STMT (vinfo_for_stmt (next)) = prev;
2345
2346 prev = next;
2347 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2348 continue;
2349 }
2350
2351 prev = next;
2352
2353 /* Check that all the accesses have the same STEP. */
2354 next_step = DR_STEP (STMT_VINFO_DATA_REF (vinfo_for_stmt (next)));
2355 if (tree_int_cst_compare (step, next_step))
2356 {
2357 if (dump_enabled_p ())
2358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2359 "not consecutive access in interleaving");
2360 return false;
2361 }
2362
2363 data_ref = STMT_VINFO_DATA_REF (vinfo_for_stmt (next));
2364 /* Check that the distance between two accesses is equal to the type
2365 size. Otherwise, we have gaps. */
2366 diff = (TREE_INT_CST_LOW (DR_INIT (data_ref))
2367 - TREE_INT_CST_LOW (prev_init)) / type_size;
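/* Illustrative example: with 4-byte elements, DR_INITs of 0 and 4 give
DIFF 1 (consecutive members), while DR_INITs of 0 and 12 give DIFF 3,
i.e. a gap of two elements between the two members. */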
2368 if (diff != 1)
2369 {
2370 /* FORNOW: SLP of accesses with gaps is not supported. */
2371 slp_impossible = true;
2372 if (DR_IS_WRITE (data_ref))
2373 {
2374 if (dump_enabled_p ())
2375 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2376 "interleaved store with gaps");
2377 return false;
2378 }
2379
2380 gaps += diff - 1;
2381 }
2382
2383 last_accessed_element += diff;
2384
2385 /* Store the gap from the previous member of the group. If there is no
2386 gap in the access, GROUP_GAP is always 1. */
2387 GROUP_GAP (vinfo_for_stmt (next)) = diff;
2388
2389 prev_init = DR_INIT (data_ref);
2390 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
2391 /* Count the number of data-refs in the chain. */
2392 count++;
2393 }
2394
2395 /* COUNT is the number of accesses found; we multiply it by the size of
2396 the type to get COUNT_IN_BYTES. */
2397 count_in_bytes = type_size * count;
2398
2399 /* Check that the size of the interleaving (including gaps) is not
2400 greater than STEP. */
2401 if (dr_step && dr_step < count_in_bytes + gaps * type_size)
2402 {
2403 if (dump_enabled_p ())
2404 {
2405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2406 "interleaving size is greater than step for ");
2407 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, DR_REF (dr));
2408 }
2409 return false;
2410 }
2411
2412 /* Check that the size of the interleaving is equal to STEP for stores,
2413 i.e., that there are no gaps. */
2414 if (dr_step && dr_step != count_in_bytes)
2415 {
2416 if (DR_IS_READ (dr))
2417 {
2418 slp_impossible = true;
2419 /* There is a gap after the last load in the group. This gap is the
2420 difference between the groupsize and the number of elements.
2421 When there is no gap, this difference should be 0. */
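/* Illustrative example: a load group with GROUPSIZE 4 but only 3 members
gets GROUP_GAP 1 here, accounting for the element missing at the end of
the group. */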
2422 GROUP_GAP (vinfo_for_stmt (stmt)) = groupsize - count;
2423 }
2424 else
2425 {
2426 if (dump_enabled_p ())
2427 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2428 "interleaved store with gaps");
2429 return false;
2430 }
2431 }
2432
2433 /* Check that STEP is a multiple of type size. */
2434 if (dr_step && (dr_step % type_size) != 0)
2435 {
2436 if (dump_enabled_p ())
2437 {
2438 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2439 "step is not a multiple of type size: step ");
2440 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, step);
2441 dump_printf (MSG_MISSED_OPTIMIZATION, " size ");
2442 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
2443 TYPE_SIZE_UNIT (scalar_type));
2444 }
2445 return false;
2446 }
2447
2448 if (groupsize == 0)
2449 groupsize = count;
2450
2451 GROUP_SIZE (vinfo_for_stmt (stmt)) = groupsize;
2452 if (dump_enabled_p ())
2453 dump_printf_loc (MSG_NOTE, vect_location,
2454 "Detected interleaving of size %d", (int)groupsize);
2455
2456 /* SLP: create an SLP data structure for every interleaving group of
2457 stores for further analysis in vect_analyze_slp. */
2458 if (DR_IS_WRITE (dr) && !slp_impossible)
2459 {
2460 if (loop_vinfo)
2461 VEC_safe_push (gimple, heap, LOOP_VINFO_GROUPED_STORES (loop_vinfo),
2462 stmt);
2463 if (bb_vinfo)
2464 VEC_safe_push (gimple, heap, BB_VINFO_GROUPED_STORES (bb_vinfo),
2465 stmt);
2466 }
2467
2468 /* There is a gap at the end of the group. */
2469 if (groupsize - last_accessed_element > 0 && loop_vinfo)
2470 {
2471 if (dump_enabled_p ())
2472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2473 "Data access with gaps requires scalar "
2474 "epilogue loop");
2475 if (loop->inner)
2476 {
2477 if (dump_enabled_p ())
2478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2479 "Peeling for outer loop is not supported");
2480 return false;
2481 }
2482
2483 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = true;
2484 }
2485 }
2486
2487 return true;
2488 }
2489
2490
2491 /* Analyze the access pattern of the data-reference DR.
2492 In case of non-consecutive accesses call vect_analyze_group_access() to
2493 analyze groups of accesses. */
2494
2495 static bool
2496 vect_analyze_data_ref_access (struct data_reference *dr)
2497 {
2498 tree step = DR_STEP (dr);
2499 tree scalar_type = TREE_TYPE (DR_REF (dr));
2500 gimple stmt = DR_STMT (dr);
2501 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2502 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
2503 struct loop *loop = NULL;
2504
2505 if (loop_vinfo)
2506 loop = LOOP_VINFO_LOOP (loop_vinfo);
2507
2508 if (loop_vinfo && !step)
2509 {
2510 if (dump_enabled_p ())
2511 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2512 "bad data-ref access in loop");
2513 return false;
2514 }
2515
2516 /* Allow invariant loads in loops. */
2517 if (loop_vinfo && integer_zerop (step))
2518 {
2519 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2520 return DR_IS_READ (dr);
2521 }
2522
2523 if (loop && nested_in_vect_loop_p (loop, stmt))
2524 {
2525 /* Interleaved accesses are not yet supported within outer-loop
2526 vectorization for references in the inner-loop. */
2527 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2528
2529 /* For the rest of the analysis we use the outer-loop step. */
2530 step = STMT_VINFO_DR_STEP (stmt_info);
2531 if (integer_zerop (step))
2532 {
2533 if (dump_enabled_p ())
2534 dump_printf_loc (MSG_NOTE, vect_location,
2535 "zero step in outer loop.");
2536 if (DR_IS_READ (dr))
2537 return true;
2538 else
2539 return false;
2540 }
2541 }
2542
2543 /* Consecutive? */
2544 if (TREE_CODE (step) == INTEGER_CST)
2545 {
2546 HOST_WIDE_INT dr_step = TREE_INT_CST_LOW (step);
2547 if (!tree_int_cst_compare (step, TYPE_SIZE_UNIT (scalar_type))
2548 || (dr_step < 0
2549 && !compare_tree_int (TYPE_SIZE_UNIT (scalar_type), -dr_step)))
2550 {
2551 /* Mark that it is not interleaving. */
2552 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) = NULL;
2553 return true;
2554 }
2555 }
2556
2557 if (loop && nested_in_vect_loop_p (loop, stmt))
2558 {
2559 if (dump_enabled_p ())
2560 dump_printf_loc (MSG_NOTE, vect_location,
2561 "grouped access in outer loop.");
2562 return false;
2563 }
2564
2565 /* Assume this is a DR handled by non-constant strided load case. */
2566 if (TREE_CODE (step) != INTEGER_CST)
2567 return STMT_VINFO_STRIDE_LOAD_P (stmt_info);
2568
2569 /* Not a consecutive access - check if it's part of an interleaving group. */
2570 return vect_analyze_group_access (dr);
2571 }
2572
2573
2574 /* Function vect_analyze_data_ref_accesses.
2575
2576 Analyze the access pattern of all the data references in the loop.
2577
2578 FORNOW: the only access pattern that is considered vectorizable is a
2579 simple step 1 (consecutive) access.
2580
2581 FORNOW: handle only arrays and pointer accesses. */
2582
2583 bool
2584 vect_analyze_data_ref_accesses (loop_vec_info loop_vinfo, bb_vec_info bb_vinfo)
2585 {
2586 unsigned int i;
2587 VEC (data_reference_p, heap) *datarefs;
2588 struct data_reference *dr;
2589
2590 if (dump_enabled_p ())
2591 dump_printf_loc (MSG_NOTE, vect_location,
2592 "=== vect_analyze_data_ref_accesses ===");
2593
2594 if (loop_vinfo)
2595 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2596 else
2597 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
2598
2599 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
2600 if (STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr)))
2601 && !vect_analyze_data_ref_access (dr))
2602 {
2603 if (dump_enabled_p ())
2604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2605 "not vectorized: complicated access pattern.");
2606
2607 if (bb_vinfo)
2608 {
2609 /* Mark the statement as not vectorizable. */
2610 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (DR_STMT (dr))) = false;
2611 continue;
2612 }
2613 else
2614 return false;
2615 }
2616
2617 return true;
2618 }
2619
2620 /* Function vect_prune_runtime_alias_test_list.
2621
2622 Prune a list of ddrs to be tested at run-time by versioning for alias.
2623 Return FALSE if the resulting list of ddrs is longer than allowed by
2624 PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS, otherwise return TRUE. */
2625
2626 bool
2627 vect_prune_runtime_alias_test_list (loop_vec_info loop_vinfo)
2628 {
2629 VEC (ddr_p, heap) * ddrs =
2630 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo);
2631 unsigned i, j;
2632
2633 if (dump_enabled_p ())
2634 dump_printf_loc (MSG_NOTE, vect_location,
2635 "=== vect_prune_runtime_alias_test_list ===");
2636
2637 for (i = 0; i < VEC_length (ddr_p, ddrs); )
2638 {
2639 bool found;
2640 ddr_p ddr_i;
2641
2642 ddr_i = VEC_index (ddr_p, ddrs, i);
2643 found = false;
2644
2645 for (j = 0; j < i; j++)
2646 {
2647 ddr_p ddr_j = VEC_index (ddr_p, ddrs, j);
2648
2649 if (vect_vfa_range_equal (ddr_i, ddr_j))
2650 {
2651 if (dump_enabled_p ())
2652 {
2653 dump_printf_loc (MSG_NOTE, vect_location,
2654 "found equal ranges ");
2655 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_i)));
2656 dump_printf (MSG_NOTE, ", ");
2657 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_i)));
2658 dump_printf (MSG_NOTE, " and ");
2659 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_A (ddr_j)));
2660 dump_printf (MSG_NOTE, ", ");
2661 dump_generic_expr (MSG_NOTE, TDF_SLIM, DR_REF (DDR_B (ddr_j)));
2662 }
2663 found = true;
2664 break;
2665 }
2666 }
2667
2668 if (found)
2669 {
2670 VEC_ordered_remove (ddr_p, ddrs, i);
2671 continue;
2672 }
2673 i++;
2674 }
2675
2676 if (VEC_length (ddr_p, ddrs) >
2677 (unsigned) PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS))
2678 {
2679 if (dump_enabled_p ())
2680 {
2681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2682 "disable versioning for alias - max number of "
2683 "generated checks exceeded.");
2684 }
2685
2686 VEC_truncate (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo), 0);
2687
2688 return false;
2689 }
2690
2691 return true;
2692 }
2693
2694 /* Check whether a non-affine read in stmt is suitable for gather load
2695 and if so, return a builtin decl for that operation. */
2696
2697 tree
2698 vect_check_gather (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2699 tree *offp, int *scalep)
2700 {
2701 HOST_WIDE_INT scale = 1, pbitpos, pbitsize;
2702 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2703 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2704 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2705 tree offtype = NULL_TREE;
2706 tree decl, base, off;
2707 enum machine_mode pmode;
2708 int punsignedp, pvolatilep;
2709
2710 /* The gather builtins need address of the form
2711 loop_invariant + vector * {1, 2, 4, 8}
2712 or
2713 loop_invariant + sign_extend (vector) * { 1, 2, 4, 8 }.
2714 Unfortunately DR_BASE_ADDRESS/DR_OFFSET can be a mixture
2715 of loop invariants/SSA_NAMEs defined in the loop, with casts,
2716 multiplications and additions in it. To get a vector, we need
2717 a single SSA_NAME that will be defined in the loop and will
2718 contain everything that is not loop invariant and that can be
2719 vectorized. The following code attempts to find such a preexisting
2720 SSA_NAME OFF and put the loop invariants into a tree BASE
2721 that can be gimplified before the loop. */
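/* For example (illustrative): for a reference like a[c + idx] with
loop-invariant A and C and an SSA_NAME IDX defined in the loop, the
loop-invariant part &a[c] ends up in BASE while the SSA_NAME computing
IDX (possibly sign-extended and scaled) becomes OFF. */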
2722 base = get_inner_reference (DR_REF (dr), &pbitsize, &pbitpos, &off,
2723 &pmode, &punsignedp, &pvolatilep, false);
2724 gcc_assert (base != NULL_TREE && (pbitpos % BITS_PER_UNIT) == 0);
2725
2726 if (TREE_CODE (base) == MEM_REF)
2727 {
2728 if (!integer_zerop (TREE_OPERAND (base, 1)))
2729 {
2730 if (off == NULL_TREE)
2731 {
2732 double_int moff = mem_ref_offset (base);
2733 off = double_int_to_tree (sizetype, moff);
2734 }
2735 else
2736 off = size_binop (PLUS_EXPR, off,
2737 fold_convert (sizetype, TREE_OPERAND (base, 1)));
2738 }
2739 base = TREE_OPERAND (base, 0);
2740 }
2741 else
2742 base = build_fold_addr_expr (base);
2743
2744 if (off == NULL_TREE)
2745 off = size_zero_node;
2746
2747 /* If base is not loop invariant, then either off is 0 and we start with
2748 just the constant offset in the loop-invariant BASE and continue with
2749 base as OFF, or we give up.
2750 We could handle the latter case by gimplifying the addition of base + off
2751 into some SSA_NAME and using that as OFF, but for now punt. */
2752 if (!expr_invariant_in_loop_p (loop, base))
2753 {
2754 if (!integer_zerop (off))
2755 return NULL_TREE;
2756 off = base;
2757 base = size_int (pbitpos / BITS_PER_UNIT);
2758 }
2759 /* Otherwise put base + constant offset into the loop invariant BASE
2760 and continue with OFF. */
2761 else
2762 {
2763 base = fold_convert (sizetype, base);
2764 base = size_binop (PLUS_EXPR, base, size_int (pbitpos / BITS_PER_UNIT));
2765 }
2766
2767 /* OFF at this point may be either an SSA_NAME or some tree expression
2768 from get_inner_reference. Try to peel off loop invariants from it
2769 into BASE as long as possible. */
2770 STRIP_NOPS (off);
2771 while (offtype == NULL_TREE)
2772 {
2773 enum tree_code code;
2774 tree op0, op1, add = NULL_TREE;
2775
2776 if (TREE_CODE (off) == SSA_NAME)
2777 {
2778 gimple def_stmt = SSA_NAME_DEF_STMT (off);
2779
2780 if (expr_invariant_in_loop_p (loop, off))
2781 return NULL_TREE;
2782
2783 if (gimple_code (def_stmt) != GIMPLE_ASSIGN)
2784 break;
2785
2786 op0 = gimple_assign_rhs1 (def_stmt);
2787 code = gimple_assign_rhs_code (def_stmt);
2788 op1 = gimple_assign_rhs2 (def_stmt);
2789 }
2790 else
2791 {
2792 if (get_gimple_rhs_class (TREE_CODE (off)) == GIMPLE_TERNARY_RHS)
2793 return NULL_TREE;
2794 code = TREE_CODE (off);
2795 extract_ops_from_tree (off, &code, &op0, &op1);
2796 }
2797 switch (code)
2798 {
2799 case POINTER_PLUS_EXPR:
2800 case PLUS_EXPR:
2801 if (expr_invariant_in_loop_p (loop, op0))
2802 {
2803 add = op0;
2804 off = op1;
2805 do_add:
2806 add = fold_convert (sizetype, add);
2807 if (scale != 1)
2808 add = size_binop (MULT_EXPR, add, size_int (scale));
2809 base = size_binop (PLUS_EXPR, base, add);
2810 continue;
2811 }
2812 if (expr_invariant_in_loop_p (loop, op1))
2813 {
2814 add = op1;
2815 off = op0;
2816 goto do_add;
2817 }
2818 break;
2819 case MINUS_EXPR:
2820 if (expr_invariant_in_loop_p (loop, op1))
2821 {
2822 add = fold_convert (sizetype, op1);
2823 add = size_binop (MINUS_EXPR, size_zero_node, add);
2824 off = op0;
2825 goto do_add;
2826 }
2827 break;
2828 case MULT_EXPR:
2829 if (scale == 1 && host_integerp (op1, 0))
2830 {
2831 scale = tree_low_cst (op1, 0);
2832 off = op0;
2833 continue;
2834 }
2835 break;
2836 case SSA_NAME:
2837 off = op0;
2838 continue;
2839 CASE_CONVERT:
2840 if (!POINTER_TYPE_P (TREE_TYPE (op0))
2841 && !INTEGRAL_TYPE_P (TREE_TYPE (op0)))
2842 break;
2843 if (TYPE_PRECISION (TREE_TYPE (op0))
2844 == TYPE_PRECISION (TREE_TYPE (off)))
2845 {
2846 off = op0;
2847 continue;
2848 }
2849 if (TYPE_PRECISION (TREE_TYPE (op0))
2850 < TYPE_PRECISION (TREE_TYPE (off)))
2851 {
2852 off = op0;
2853 offtype = TREE_TYPE (off);
2854 STRIP_NOPS (off);
2855 continue;
2856 }
2857 break;
2858 default:
2859 break;
2860 }
2861 break;
2862 }
2863
2864 /* If at the end OFF still isn't an SSA_NAME or isn't
2865 defined in the loop, punt. */
2866 if (TREE_CODE (off) != SSA_NAME
2867 || expr_invariant_in_loop_p (loop, off))
2868 return NULL_TREE;
2869
2870 if (offtype == NULL_TREE)
2871 offtype = TREE_TYPE (off);
2872
2873 decl = targetm.vectorize.builtin_gather (STMT_VINFO_VECTYPE (stmt_info),
2874 offtype, scale);
2875 if (decl == NULL_TREE)
2876 return NULL_TREE;
2877
2878 if (basep)
2879 *basep = base;
2880 if (offp)
2881 *offp = off;
2882 if (scalep)
2883 *scalep = scale;
2884 return decl;
2885 }
2886
2887 /* Check whether a non-affine load in STMT (being in the loop referred to
2888 in LOOP_VINFO) is suitable for handling as a strided load. That is the
2889 case if its address is a simple induction variable. If so, return the
2890 base of that induction variable in *BASEP and the (loop-invariant) step
2891 in *STEPP, each only when the corresponding pointer is non-NULL.
2892
2893 This handles ARRAY_REFs (with variant index) and MEM_REFs (with variant
2894 base pointer) only. */
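/* Illustrative example: a load a[k] where A is loop invariant and K is an
induction variable advanced by a loop-invariant STRIDE each iteration;
*BASEP then receives the initial value of K and *STEPP receives STRIDE. */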
2895
2896 bool
2897 vect_check_strided_load (gimple stmt, loop_vec_info loop_vinfo, tree *basep,
2898 tree *stepp)
2899 {
2900 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2901 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2902 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
2903 tree base, off;
2904 affine_iv iv;
2905
2906 if (!DR_IS_READ (dr))
2907 return false;
2908
2909 base = DR_REF (dr);
2910
2911 if (TREE_CODE (base) == ARRAY_REF)
2912 {
2913 off = TREE_OPERAND (base, 1);
2914 base = TREE_OPERAND (base, 0);
2915 }
2916 else if (TREE_CODE (base) == MEM_REF)
2917 {
2918 off = TREE_OPERAND (base, 0);
2919 base = TREE_OPERAND (base, 1);
2920 }
2921 else
2922 return false;
2923
2924 if (TREE_CODE (off) != SSA_NAME)
2925 return false;
2926
2927 if (!expr_invariant_in_loop_p (loop, base)
2928 || !simple_iv (loop, loop_containing_stmt (stmt), off, &iv, true))
2929 return false;
2930
2931 if (basep)
2932 *basep = iv.base;
2933 if (stepp)
2934 *stepp = iv.step;
2935 return true;
2936 }
2937
2938 /* Function vect_analyze_data_refs.
2939
2940 Find all the data references in the loop or basic block.
2941
2942 The general structure of the analysis of data refs in the vectorizer is as
2943 follows:
2944 1- vect_analyze_data_refs(loop/bb): call
2945 compute_data_dependences_for_loop/bb to find and analyze all data-refs
2946 in the loop/bb and their dependences.
2947 2- vect_analyze_dependences(): apply dependence testing using ddrs.
2948 3- vect_analyze_drs_alignment(): check that ref_stmt.alignment is ok.
2949 4- vect_analyze_drs_access(): check that ref_stmt.step is ok.
2950
2951 */
2952
2953 bool
2954 vect_analyze_data_refs (loop_vec_info loop_vinfo,
2955 bb_vec_info bb_vinfo,
2956 int *min_vf)
2957 {
2958 struct loop *loop = NULL;
2959 basic_block bb = NULL;
2960 unsigned int i;
2961 VEC (data_reference_p, heap) *datarefs;
2962 struct data_reference *dr;
2963 tree scalar_type;
2964 bool res, stop_bb_analysis = false;
2965
2966 if (dump_enabled_p ())
2967 dump_printf_loc (MSG_NOTE, vect_location,
2968 "=== vect_analyze_data_refs ===\n");
2969
2970 if (loop_vinfo)
2971 {
2972 loop = LOOP_VINFO_LOOP (loop_vinfo);
2973 res = compute_data_dependences_for_loop
2974 (loop, true,
2975 &LOOP_VINFO_LOOP_NEST (loop_vinfo),
2976 &LOOP_VINFO_DATAREFS (loop_vinfo),
2977 &LOOP_VINFO_DDRS (loop_vinfo));
2978
2979 if (!res)
2980 {
2981 if (dump_enabled_p ())
2982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2983 "not vectorized: loop contains function calls"
2984 " or data references that cannot be analyzed");
2985 return false;
2986 }
2987
2988 datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2989 }
2990 else
2991 {
2992 gimple_stmt_iterator gsi;
2993
2994 bb = BB_VINFO_BB (bb_vinfo);
2995 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
2996 {
2997 gimple stmt = gsi_stmt (gsi);
2998 if (!find_data_references_in_stmt (NULL, stmt,
2999 &BB_VINFO_DATAREFS (bb_vinfo)))
3000 {
3001 /* Mark the rest of the basic-block as unvectorizable. */
3002 for (; !gsi_end_p (gsi); gsi_next (&gsi))
3003 {
3004 stmt = gsi_stmt (gsi);
3005 STMT_VINFO_VECTORIZABLE (vinfo_for_stmt (stmt)) = false;
3006 }
3007 break;
3008 }
3009 }
3010 if (!compute_all_dependences (BB_VINFO_DATAREFS (bb_vinfo),
3011 &BB_VINFO_DDRS (bb_vinfo), NULL, true))
3012 {
3013 if (dump_enabled_p ())
3014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3015 "not vectorized: basic block contains function"
3016 " calls or data references that cannot be"
3017 " analyzed");
3018 return false;
3019 }
3020
3021 datarefs = BB_VINFO_DATAREFS (bb_vinfo);
3022 }
3023
3024 /* Go through the data-refs and check that the analysis succeeded. Update
3025 the pointer from the stmt_vec_info struct to DR and vectype. */
3026
3027 FOR_EACH_VEC_ELT (data_reference_p, datarefs, i, dr)
3028 {
3029 gimple stmt;
3030 stmt_vec_info stmt_info;
3031 tree base, offset, init;
3032 bool gather = false;
3033 int vf;
3034
3035 if (!dr || !DR_REF (dr))
3036 {
3037 if (dump_enabled_p ())
3038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3039 "not vectorized: unhandled data-ref ");
3040 return false;
3041 }
3042
3043 stmt = DR_STMT (dr);
3044 stmt_info = vinfo_for_stmt (stmt);
3045
3046 if (stop_bb_analysis)
3047 {
3048 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3049 continue;
3050 }
3051
3052 /* Check that analysis of the data-ref succeeded. */
3053 if (!DR_BASE_ADDRESS (dr) || !DR_OFFSET (dr) || !DR_INIT (dr)
3054 || !DR_STEP (dr))
3055 {
3056 /* If the target supports vector gather loads, see if they can
3057 be used. */
3058 if (loop_vinfo
3059 && DR_IS_READ (dr)
3060 && !TREE_THIS_VOLATILE (DR_REF (dr))
3061 && targetm.vectorize.builtin_gather != NULL
3062 && !nested_in_vect_loop_p (loop, stmt))
3063 {
3064 struct data_reference *newdr
3065 = create_data_ref (NULL, loop_containing_stmt (stmt),
3066 DR_REF (dr), stmt, true);
3067 gcc_assert (newdr != NULL && DR_REF (newdr));
3068 if (DR_BASE_ADDRESS (newdr)
3069 && DR_OFFSET (newdr)
3070 && DR_INIT (newdr)
3071 && DR_STEP (newdr)
3072 && integer_zerop (DR_STEP (newdr)))
3073 {
3074 dr = newdr;
3075 gather = true;
3076 }
3077 else
3078 free_data_ref (newdr);
3079 }
3080
3081 if (!gather)
3082 {
3083 if (dump_enabled_p ())
3084 {
3085 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3086 "not vectorized: data ref analysis "
3087 "failed ");
3088 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3089 }
3090
3091 if (bb_vinfo)
3092 {
3093 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3094 stop_bb_analysis = true;
3095 continue;
3096 }
3097
3098 return false;
3099 }
3100 }
3101
3102 if (TREE_CODE (DR_BASE_ADDRESS (dr)) == INTEGER_CST)
3103 {
3104 if (dump_enabled_p ())
3105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3106 "not vectorized: base addr of dr is a "
3107 "constant");
3108
3109 if (bb_vinfo)
3110 {
3111 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3112 stop_bb_analysis = true;
3113 continue;
3114 }
3115
3116 if (gather)
3117 free_data_ref (dr);
3118 return false;
3119 }
3120
3121 if (TREE_THIS_VOLATILE (DR_REF (dr)))
3122 {
3123 if (dump_enabled_p ())
3124 {
3125 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3126 "not vectorized: volatile type ");
3127 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3128 }
3129
3130 if (bb_vinfo)
3131 {
3132 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3133 stop_bb_analysis = true;
3134 continue;
3135 }
3136
3137 return false;
3138 }
3139
3140 if (stmt_can_throw_internal (stmt))
3141 {
3142 if (dump_enabled_p ())
3143 {
3144 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3145 "not vectorized: statement can throw an "
3146 "exception ");
3147 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3148 }
3149
3150 if (bb_vinfo)
3151 {
3152 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3153 stop_bb_analysis = true;
3154 continue;
3155 }
3156
3157 if (gather)
3158 free_data_ref (dr);
3159 return false;
3160 }
3161
3162 if (TREE_CODE (DR_REF (dr)) == COMPONENT_REF
3163 && DECL_BIT_FIELD (TREE_OPERAND (DR_REF (dr), 1)))
3164 {
3165 if (dump_enabled_p ())
3166 {
3167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3168 "not vectorized: statement is bitfield "
3169 "access ");
3170 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3171 }
3172
3173 if (bb_vinfo)
3174 {
3175 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3176 stop_bb_analysis = true;
3177 continue;
3178 }
3179
3180 if (gather)
3181 free_data_ref (dr);
3182 return false;
3183 }
3184
3185 base = unshare_expr (DR_BASE_ADDRESS (dr));
3186 offset = unshare_expr (DR_OFFSET (dr));
3187 init = unshare_expr (DR_INIT (dr));
3188
3189 if (is_gimple_call (stmt))
3190 {
3191 if (dump_enabled_p ())
3192 {
3193 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3194 "not vectorized: dr in a call ");
3195 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3196 }
3197
3198 if (bb_vinfo)
3199 {
3200 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3201 stop_bb_analysis = true;
3202 continue;
3203 }
3204
3205 if (gather)
3206 free_data_ref (dr);
3207 return false;
3208 }
3209
3210 /* Update DR field in stmt_vec_info struct. */
3211
3212 /* If the dataref is in an inner-loop of the loop that is considered
3213 for vectorization, we also want to analyze the access relative to
3214 the outer-loop (DR contains information only relative to the
3215 inner-most enclosing loop). We do that by building a reference to the
3216 first location accessed by the inner-loop, and analyze it relative to
3217 the outer-loop. */
3218 if (loop && nested_in_vect_loop_p (loop, stmt))
3219 {
3220 tree outer_step, outer_base, outer_init;
3221 HOST_WIDE_INT pbitsize, pbitpos;
3222 tree poffset;
3223 enum machine_mode pmode;
3224 int punsignedp, pvolatilep;
3225 affine_iv base_iv, offset_iv;
3226 tree dinit;
3227
3228 /* Build a reference to the first location accessed by the
3229 inner-loop: *(BASE+INIT). (The first location is actually
3230 BASE+INIT+OFFSET, but we add OFFSET separately later). */
3231 tree inner_base = build_fold_indirect_ref
3232 (fold_build_pointer_plus (base, init));
3233
3234 if (dump_enabled_p ())
3235 {
3236 dump_printf_loc (MSG_NOTE, vect_location,
3237 "analyze in outer-loop: ");
3238 dump_generic_expr (MSG_NOTE, TDF_SLIM, inner_base);
3239 }
3240
3241 outer_base = get_inner_reference (inner_base, &pbitsize, &pbitpos,
3242 &poffset, &pmode, &punsignedp, &pvolatilep, false);
3243 gcc_assert (outer_base != NULL_TREE);
3244
3245 if (pbitpos % BITS_PER_UNIT != 0)
3246 {
3247 if (dump_enabled_p ())
3248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3249 "failed: bit offset alignment.\n");
3250 return false;
3251 }
3252
3253 outer_base = build_fold_addr_expr (outer_base);
3254 if (!simple_iv (loop, loop_containing_stmt (stmt), outer_base,
3255 &base_iv, false))
3256 {
3257 if (dump_enabled_p ())
3258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3259 "failed: evolution of base is not affine.\n");
3260 return false;
3261 }
3262
3263 if (offset)
3264 {
3265 if (poffset)
3266 poffset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset,
3267 poffset);
3268 else
3269 poffset = offset;
3270 }
3271
3272 if (!poffset)
3273 {
3274 offset_iv.base = ssize_int (0);
3275 offset_iv.step = ssize_int (0);
3276 }
3277 else if (!simple_iv (loop, loop_containing_stmt (stmt), poffset,
3278 &offset_iv, false))
3279 {
3280 if (dump_enabled_p ())
3281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3282 "evolution of offset is not affine.\n");
3283 return false;
3284 }
3285
3286 outer_init = ssize_int (pbitpos / BITS_PER_UNIT);
3287 split_constant_offset (base_iv.base, &base_iv.base, &dinit);
3288 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3289 split_constant_offset (offset_iv.base, &offset_iv.base, &dinit);
3290 outer_init = size_binop (PLUS_EXPR, outer_init, dinit);
3291
3292 outer_step = size_binop (PLUS_EXPR,
3293 fold_convert (ssizetype, base_iv.step),
3294 fold_convert (ssizetype, offset_iv.step));
3295
3296 STMT_VINFO_DR_STEP (stmt_info) = outer_step;
3297 /* FIXME: Use canonicalize_base_object_address (base_iv.base); */
3298 STMT_VINFO_DR_BASE_ADDRESS (stmt_info) = base_iv.base;
3299 STMT_VINFO_DR_INIT (stmt_info) = outer_init;
3300 STMT_VINFO_DR_OFFSET (stmt_info) =
3301 fold_convert (ssizetype, offset_iv.base);
3302 STMT_VINFO_DR_ALIGNED_TO (stmt_info) =
3303 size_int (highest_pow2_factor (offset_iv.base));
3304
3305 if (dump_enabled_p ())
3306 {
3307 dump_printf_loc (MSG_NOTE, vect_location,
3308 "\touter base_address: ");
3309 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3310 STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3311 dump_printf (MSG_NOTE, "\n\touter offset from base address: ");
3312 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3313 STMT_VINFO_DR_OFFSET (stmt_info));
3314 dump_printf (MSG_NOTE,
3315 "\n\touter constant offset from base address: ");
3316 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3317 STMT_VINFO_DR_INIT (stmt_info));
3318 dump_printf (MSG_NOTE, "\n\touter step: ");
3319 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3320 STMT_VINFO_DR_STEP (stmt_info));
3321 dump_printf (MSG_NOTE, "\n\touter aligned to: ");
3322 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3323 STMT_VINFO_DR_ALIGNED_TO (stmt_info));
3324 }
3325 }
3326
3327 if (STMT_VINFO_DATA_REF (stmt_info))
3328 {
3329 if (dump_enabled_p ())
3330 {
3331 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3332 "not vectorized: more than one data ref "
3333 "in stmt: ");
3334 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3335 }
3336
3337 if (bb_vinfo)
3338 {
3339 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3340 stop_bb_analysis = true;
3341 continue;
3342 }
3343
3344 if (gather)
3345 free_data_ref (dr);
3346 return false;
3347 }
3348
3349 STMT_VINFO_DATA_REF (stmt_info) = dr;
3350
3351 /* Set vectype for STMT. */
3352 scalar_type = TREE_TYPE (DR_REF (dr));
3353 STMT_VINFO_VECTYPE (stmt_info) =
3354 get_vectype_for_scalar_type (scalar_type);
3355 if (!STMT_VINFO_VECTYPE (stmt_info))
3356 {
3357 if (dump_enabled_p ())
3358 {
3359 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3360 "not vectorized: no vectype for stmt: ");
3361 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3362 dump_printf (MSG_MISSED_OPTIMIZATION, " scalar_type: ");
3363 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_DETAILS,
3364 scalar_type);
3365 }
3366
3367 if (bb_vinfo)
3368 {
3369 /* Mark the statement as not vectorizable. */
3370 STMT_VINFO_VECTORIZABLE (stmt_info) = false;
3371 stop_bb_analysis = true;
3372 continue;
3373 }
3374
3375 if (gather)
3376 {
3377 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3378 free_data_ref (dr);
3379 }
3380 return false;
3381 }
3382
3383 /* Adjust the minimal vectorization factor according to the
3384 vector type. */
3385 vf = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
3386 if (vf > *min_vf)
3387 *min_vf = vf;
3388
3389 if (gather)
3390 {
3391 unsigned int j, k, n;
3392 struct data_reference *olddr
3393 = VEC_index (data_reference_p, datarefs, i);
3394 VEC (ddr_p, heap) *ddrs = LOOP_VINFO_DDRS (loop_vinfo);
3395 struct data_dependence_relation *ddr, *newddr;
3396 bool bad = false;
3397 tree off;
3398 VEC (loop_p, heap) *nest = LOOP_VINFO_LOOP_NEST (loop_vinfo);
3399
3400 gather = 0 != vect_check_gather (stmt, loop_vinfo, NULL, &off, NULL);
3401 if (gather
3402 && get_vectype_for_scalar_type (TREE_TYPE (off)) == NULL_TREE)
3403 gather = false;
3404 if (!gather)
3405 {
3406 STMT_VINFO_DATA_REF (stmt_info) = NULL;
3407 free_data_ref (dr);
3408 if (dump_enabled_p ())
3409 {
3410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3411 "not vectorized: not suitable for gather "
3412 "load ");
3413 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3414 }
3415 return false;
3416 }
3417
3418 n = VEC_length (data_reference_p, datarefs) - 1;
3419 for (j = 0, k = i - 1; j < i; j++)
3420 {
3421 ddr = VEC_index (ddr_p, ddrs, k);
3422 gcc_assert (DDR_B (ddr) == olddr);
3423 newddr = initialize_data_dependence_relation (DDR_A (ddr), dr,
3424 nest);
3425 VEC_replace (ddr_p, ddrs, k, newddr);
3426 free_dependence_relation (ddr);
3427 if (!bad
3428 && DR_IS_WRITE (DDR_A (newddr))
3429 && DDR_ARE_DEPENDENT (newddr) != chrec_known)
3430 bad = true;
3431 k += --n;
3432 }
3433
3434 k++;
3435 n = k + VEC_length (data_reference_p, datarefs) - i - 1;
3436 for (; k < n; k++)
3437 {
3438 ddr = VEC_index (ddr_p, ddrs, k);
3439 gcc_assert (DDR_A (ddr) == olddr);
3440 newddr = initialize_data_dependence_relation (dr, DDR_B (ddr),
3441 nest);
3442 VEC_replace (ddr_p, ddrs, k, newddr);
3443 free_dependence_relation (ddr);
3444 if (!bad
3445 && DR_IS_WRITE (DDR_B (newddr))
3446 && DDR_ARE_DEPENDENT (newddr) != chrec_known)
3447 bad = true;
3448 }
3449
3450 k = VEC_length (ddr_p, ddrs)
3451 - VEC_length (data_reference_p, datarefs) + i;
3452 ddr = VEC_index (ddr_p, ddrs, k);
3453 gcc_assert (DDR_A (ddr) == olddr && DDR_B (ddr) == olddr);
3454 newddr = initialize_data_dependence_relation (dr, dr, nest);
3455 VEC_replace (ddr_p, ddrs, k, newddr);
3456 free_dependence_relation (ddr);
3457 VEC_replace (data_reference_p, datarefs, i, dr);
3458
3459 if (bad)
3460 {
3461 if (dump_enabled_p ())
3462 {
3463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3464 "not vectorized: data dependence conflict"
3465 " prevents gather load");
3466 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3467 }
3468 return false;
3469 }
3470
3471 STMT_VINFO_GATHER_P (stmt_info) = true;
3472 }
3473 else if (loop_vinfo
3474 && TREE_CODE (DR_STEP (dr)) != INTEGER_CST)
3475 {
3476 bool strided_load = false;
3477 if (!nested_in_vect_loop_p (loop, stmt))
3478 strided_load
3479 = vect_check_strided_load (stmt, loop_vinfo, NULL, NULL);
3480 if (!strided_load)
3481 {
3482 if (dump_enabled_p ())
3483 {
3484 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3485 "not vectorized: not suitable for strided "
3486 "load ");
3487 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
3488 }
3489 return false;
3490 }
3491 STMT_VINFO_STRIDE_LOAD_P (stmt_info) = true;
3492 }
3493 }
3494
3495 return true;
3496 }
3497
3498
3499 /* Function vect_get_new_vect_var.
3500
3501 Returns a name for a new variable. The current naming scheme uses the
3502 prefix "vect_", "stmp_" or "vect_p" (depending on the value of VAR_KIND)
3503 for vectorizer-generated variables, prepending it to NAME if
3504 provided. */
3505
3506 tree
3507 vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name)
3508 {
3509 const char *prefix;
3510 tree new_vect_var;
3511
3512 switch (var_kind)
3513 {
3514 case vect_simple_var:
3515 prefix = "vect_";
3516 break;
3517 case vect_scalar_var:
3518 prefix = "stmp_";
3519 break;
3520 case vect_pointer_var:
3521 prefix = "vect_p";
3522 break;
3523 default:
3524 gcc_unreachable ();
3525 }
3526
3527 if (name)
3528 {
3529 char* tmp = concat (prefix, name, NULL);
3530 new_vect_var = create_tmp_reg (type, tmp);
3531 free (tmp);
3532 }
3533 else
3534 new_vect_var = create_tmp_reg (type, prefix);
3535
3536 return new_vect_var;
3537 }
3538
3539
3540 /* Function vect_create_addr_base_for_vector_ref.
3541
3542 Create an expression that computes the address of the first memory location
3543 that will be accessed for a data reference.
3544
3545 Input:
3546 STMT: The statement containing the data reference.
3547 NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list.
3548 OFFSET: Optional. If supplied, it will be added to the initial address.
3549 LOOP: Specifies relative to which loop-nest the address should be computed.
3550 For example, when the dataref is in an inner-loop nested in an
3551 outer-loop that is now being vectorized, LOOP can be either the
3552 outer-loop, or the inner-loop. The first memory location accessed
3553 by the following dataref ('in' points to short):
3554
3555 for (i=0; i<N; i++)
3556 for (j=0; j<M; j++)
3557 s += in[i+j]
3558
3559 is as follows:
3560 if LOOP=i_loop: &in (relative to i_loop)
3561 if LOOP=j_loop: &in+i*2B (relative to j_loop)
3562
3563 Output:
3564 1. Return an SSA_NAME whose value is the address of the memory location of
3565 the first vector of the data reference.
3566 2. If new_stmt_list is not NULL_TREE after return, the caller must insert
3567 these statement(s), which define the returned SSA_NAME.
3568
3569 FORNOW: We are only handling array accesses with step 1. */
3570
3571 tree
3572 vect_create_addr_base_for_vector_ref (gimple stmt,
3573 gimple_seq *new_stmt_list,
3574 tree offset,
3575 struct loop *loop)
3576 {
3577 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3578 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3579 tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr));
3580 tree base_name;
3581 tree data_ref_base_var;
3582 tree vec_stmt;
3583 tree addr_base, addr_expr;
3584 tree dest;
3585 gimple_seq seq = NULL;
3586 tree base_offset = unshare_expr (DR_OFFSET (dr));
3587 tree init = unshare_expr (DR_INIT (dr));
3588 tree vect_ptr_type;
3589 tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr)));
3590 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3591 tree base;
3592
3593 if (loop_vinfo && loop && loop != (gimple_bb (stmt))->loop_father)
3594 {
3595 struct loop *outer_loop = LOOP_VINFO_LOOP (loop_vinfo);
3596
3597 gcc_assert (nested_in_vect_loop_p (outer_loop, stmt));
3598
3599 data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info));
3600 base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info));
3601 init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info));
3602 }
3603
3604 if (loop_vinfo)
3605 base_name = build_fold_indirect_ref (data_ref_base);
3606 else
3607 {
3608 base_offset = ssize_int (0);
3609 init = ssize_int (0);
3610 base_name = build_fold_indirect_ref (unshare_expr (DR_REF (dr)));
3611 }
3612
3613 data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp");
3614 data_ref_base = force_gimple_operand (data_ref_base, &seq, true,
3615 data_ref_base_var);
3616 gimple_seq_add_seq (new_stmt_list, seq);
3617
3618 /* Create base_offset */
3619 base_offset = size_binop (PLUS_EXPR,
3620 fold_convert (sizetype, base_offset),
3621 fold_convert (sizetype, init));
3622 dest = create_tmp_var (sizetype, "base_off");
3623 base_offset = force_gimple_operand (base_offset, &seq, true, dest);
3624 gimple_seq_add_seq (new_stmt_list, seq);
3625
3626 if (offset)
3627 {
3628 tree tmp = create_tmp_var (sizetype, "offset");
3629
3630 offset = fold_build2 (MULT_EXPR, sizetype,
3631 fold_convert (sizetype, offset), step);
3632 base_offset = fold_build2 (PLUS_EXPR, sizetype,
3633 base_offset, offset);
3634 base_offset = force_gimple_operand (base_offset, &seq, false, tmp);
3635 gimple_seq_add_seq (new_stmt_list, seq);
3636 }
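/* In the loop case BASE_OFFSET now holds DR_OFFSET + DR_INIT (both forced
   to zero above for basic-block vectorization), plus OFFSET scaled by the
   element size when an extra OFFSET was requested.  */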
3637
3638 /* base + base_offset */
3639 if (loop_vinfo)
3640 addr_base = fold_build_pointer_plus (data_ref_base, base_offset);
3641 else
3642 {
3643 addr_base = build1 (ADDR_EXPR,
3644 build_pointer_type (TREE_TYPE (DR_REF (dr))),
3645 unshare_expr (DR_REF (dr)));
3646 }
3647
3648 vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info));
3649 base = get_base_address (DR_REF (dr));
3650 if (base
3651 && TREE_CODE (base) == MEM_REF)
3652 vect_ptr_type
3653 = build_qualified_type (vect_ptr_type,
3654 TYPE_QUALS (TREE_TYPE (TREE_OPERAND (base, 0))));
3655
3656 vec_stmt = fold_convert (vect_ptr_type, addr_base);
3657 addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var,
3658 get_name (base_name));
3659 vec_stmt = force_gimple_operand (vec_stmt, &seq, false, addr_expr);
3660 gimple_seq_add_seq (new_stmt_list, seq);
3661
3662 if (DR_PTR_INFO (dr)
3663 && TREE_CODE (vec_stmt) == SSA_NAME)
3664 {
3665 duplicate_ssa_name_ptr_info (vec_stmt, DR_PTR_INFO (dr));
3666 if (offset)
3667 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (vec_stmt));
3668 }
3669
3670 if (dump_enabled_p ())
3671 {
3672 dump_printf_loc (MSG_NOTE, vect_location, "created ");
3673 dump_generic_expr (MSG_NOTE, TDF_SLIM, vec_stmt);
3674 }
3675
3676 return vec_stmt;
3677 }
3678
3679
3680 /* Function vect_create_data_ref_ptr.
3681
3682 Create a new pointer-to-AGGR_TYPE variable (ap), that points to the first
3683 location accessed in the loop by STMT, along with the def-use update
3684 chain to appropriately advance the pointer through the loop iterations.
3685 Also set aliasing information for the pointer. This pointer is used by
3686 the callers to this function to create a memory reference expression for
3687 vector load/store access.
3688
3689 Input:
3690 1. STMT: a stmt that references memory. Expected to be of the form
3691 GIMPLE_ASSIGN <name, data-ref> or
3692 GIMPLE_ASSIGN <data-ref, name>.
3693 2. AGGR_TYPE: the type of the reference, which should be either a vector
3694 or an array.
3695 3. AT_LOOP: the loop where the vector memref is to be created.
3696 4. OFFSET (optional): an offset to be added to the initial address accessed
3697 by the data-ref in STMT.
3698 5. BSI: location where the new stmts are to be placed if there is no loop.
3699 6. ONLY_INIT: indicate whether ap is to be updated in the loop, or remain
3700 pointing to the initial address.
3701
3702 Output:
3703 1. Declare a new ptr to vector_type, and have it point to the base of the
3704 data reference (initial address accessed by the data reference).
3705 For example, for vector of type V8HI, the following code is generated:
3706
3707 v8hi *ap;
3708 ap = (v8hi *)initial_address;
3709
3710 if OFFSET is not supplied:
3711 initial_address = &a[init];
3712 if OFFSET is supplied:
3713 initial_address = &a[init + OFFSET];
3714
3715 Return the initial_address in INITIAL_ADDRESS.
3716
3717 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also
3718 update the pointer in each iteration of the loop.
3719
3720 Return the increment stmt that updates the pointer in PTR_INCR.
3721
3722 3. Set INV_P to true if the access pattern of the data reference in the
3723 vectorized loop is invariant. Set it to false otherwise.
3724
3725 4. Return the pointer. */
3726
3727 tree
3728 vect_create_data_ref_ptr (gimple stmt, tree aggr_type, struct loop *at_loop,
3729 tree offset, tree *initial_address,
3730 gimple_stmt_iterator *gsi, gimple *ptr_incr,
3731 bool only_init, bool *inv_p)
3732 {
3733 tree base_name;
3734 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3735 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3736 struct loop *loop = NULL;
3737 bool nested_in_vect_loop = false;
3738 struct loop *containing_loop = NULL;
3739 tree aggr_ptr_type;
3740 tree aggr_ptr;
3741 tree new_temp;
3742 gimple vec_stmt;
3743 gimple_seq new_stmt_list = NULL;
3744 edge pe = NULL;
3745 basic_block new_bb;
3746 tree aggr_ptr_init;
3747 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
3748 tree aptr;
3749 gimple_stmt_iterator incr_gsi;
3750 bool insert_after;
3751 bool negative;
3752 tree indx_before_incr, indx_after_incr;
3753 gimple incr;
3754 tree step;
3755 bb_vec_info bb_vinfo = STMT_VINFO_BB_VINFO (stmt_info);
3756 tree base;
3757
3758 gcc_assert (TREE_CODE (aggr_type) == ARRAY_TYPE
3759 || TREE_CODE (aggr_type) == VECTOR_TYPE);
3760
3761 if (loop_vinfo)
3762 {
3763 loop = LOOP_VINFO_LOOP (loop_vinfo);
3764 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
3765 containing_loop = (gimple_bb (stmt))->loop_father;
3766 pe = loop_preheader_edge (loop);
3767 }
3768 else
3769 {
3770 gcc_assert (bb_vinfo);
3771 only_init = true;
3772 *ptr_incr = NULL;
3773 }
3774
3775 /* Check the step (evolution) of the load in LOOP, and record
3776 whether it's invariant. */
3777 if (nested_in_vect_loop)
3778 step = STMT_VINFO_DR_STEP (stmt_info);
3779 else
3780 step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info));
3781
3782 if (tree_int_cst_compare (step, size_zero_node) == 0)
3783 *inv_p = true;
3784 else
3785 *inv_p = false;
3786 negative = tree_int_cst_compare (step, size_zero_node) < 0;
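/* E.g., an access to a[N-i] in LOOP has a negative DR_STEP; NEGATIVE is
   then true and step (3) below advances the pointer by a negated step.  */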
3787
3788 /* Create an expression for the first address accessed by this load
3789 in LOOP. */
3790 base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr)));
3791
3792 if (dump_enabled_p ())
3793 {
3794 tree data_ref_base = base_name;
3795 dump_printf_loc (MSG_NOTE, vect_location,
3796 "create %s-pointer variable to type: ",
3797 tree_code_name[(int) TREE_CODE (aggr_type)]);
3798 dump_generic_expr (MSG_NOTE, TDF_SLIM, aggr_type);
3799 if (TREE_CODE (data_ref_base) == VAR_DECL
3800 || TREE_CODE (data_ref_base) == ARRAY_REF)
3801 dump_printf (MSG_NOTE, " vectorizing an array ref: ");
3802 else if (TREE_CODE (data_ref_base) == COMPONENT_REF)
3803 dump_printf (MSG_NOTE, " vectorizing a record based array ref: ");
3804 else if (TREE_CODE (data_ref_base) == SSA_NAME)
3805 dump_printf (MSG_NOTE, " vectorizing a pointer ref: ");
3806 dump_generic_expr (MSG_NOTE, TDF_SLIM, base_name);
3807 }
3808
3809 /* (1) Create the new aggregate-pointer variable. */
3810 aggr_ptr_type = build_pointer_type (aggr_type);
3811 base = get_base_address (DR_REF (dr));
3812 if (base
3813 && TREE_CODE (base) == MEM_REF)
3814 aggr_ptr_type
3815 = build_qualified_type (aggr_ptr_type,
3816 TYPE_QUALS (TREE_TYPE (TREE_OPERAND (base, 0))));
3817 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var,
3818 get_name (base_name));
3819
3820 /* Vector and array types inherit the alias set of their component
3821 type by default so we need to use a ref-all pointer if the data
3822 reference does not conflict with the created aggregated data
3823 reference because it is not addressable. */
3824 if (!alias_sets_conflict_p (get_deref_alias_set (aggr_ptr),
3825 get_alias_set (DR_REF (dr))))
3826 {
3827 aggr_ptr_type
3828 = build_pointer_type_for_mode (aggr_type,
3829 TYPE_MODE (aggr_ptr_type), true);
3830 aggr_ptr = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var,
3831 get_name (base_name));
3832 }
3833
3834 /* Likewise for any of the data references in the stmt group. */
3835 else if (STMT_VINFO_GROUP_SIZE (stmt_info) > 1)
3836 {
3837 gimple orig_stmt = STMT_VINFO_GROUP_FIRST_ELEMENT (stmt_info);
3838 do
3839 {
3840 tree lhs = gimple_assign_lhs (orig_stmt);
3841 if (!alias_sets_conflict_p (get_deref_alias_set (aggr_ptr),
3842 get_alias_set (lhs)))
3843 {
3844 aggr_ptr_type
3845 = build_pointer_type_for_mode (aggr_type,
3846 TYPE_MODE (aggr_ptr_type), true);
3847 aggr_ptr
3848 = vect_get_new_vect_var (aggr_ptr_type, vect_pointer_var,
3849 get_name (base_name));
3850 break;
3851 }
3852
3853 orig_stmt = STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo_for_stmt (orig_stmt));
3854 }
3855 while (orig_stmt);
3856 }
3857
3858 /* Note: If the dataref is in an inner-loop nested in LOOP, and we are
3859 vectorizing LOOP (i.e., outer-loop vectorization), we need to create two
3860 def-use update cycles for the pointer: one relative to the outer-loop
3861 (LOOP), which is what steps (3) and (4) below do. The other is relative
3862 to the inner-loop (which is the inner-most loop containing the dataref),
3863 and this is done by step (5) below.
3864
3865 When vectorizing inner-most loops, the vectorized loop (LOOP) is also the
3866 inner-most loop, and so steps (3),(4) work the same, and step (5) is
3867 redundant. Steps (3),(4) create the following:
3868
3869 vp0 = &base_addr;
3870 LOOP: vp1 = phi(vp0,vp2)
3871 ...
3872 ...
3873 vp2 = vp1 + step
3874 goto LOOP
3875
3876 If there is an inner-loop nested in LOOP, then step (5) will also be
3877 applied, and an additional update in the inner-loop will be created:
3878
3879 vp0 = &base_addr;
3880 LOOP: vp1 = phi(vp0,vp2)
3881 ...
3882 inner: vp3 = phi(vp1,vp4)
3883 vp4 = vp3 + inner_step
3884 if () goto inner
3885 ...
3886 vp2 = vp1 + step
3887 if () goto LOOP */
3888
3889 /* (2) Calculate the initial address of the aggregate-pointer, and set
3890 the aggregate-pointer to point to it before the loop. */
3891
3892 /* Create: &(base[init_val+offset]) in the loop preheader. */
3893
3894 new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list,
3895 offset, loop);
3896 if (new_stmt_list)
3897 {
3898 if (pe)
3899 {
3900 new_bb = gsi_insert_seq_on_edge_immediate (pe, new_stmt_list);
3901 gcc_assert (!new_bb);
3902 }
3903 else
3904 gsi_insert_seq_before (gsi, new_stmt_list, GSI_SAME_STMT);
3905 }
3906
3907 *initial_address = new_temp;
3908
3909 /* Create: p = (aggr_type *) initial_base */
3910 if (TREE_CODE (new_temp) != SSA_NAME
3911 || !useless_type_conversion_p (aggr_ptr_type, TREE_TYPE (new_temp)))
3912 {
3913 vec_stmt = gimple_build_assign (aggr_ptr,
3914 fold_convert (aggr_ptr_type, new_temp));
3915 aggr_ptr_init = make_ssa_name (aggr_ptr, vec_stmt);
3916 /* Copy the points-to information if it exists. */
3917 if (DR_PTR_INFO (dr))
3918 duplicate_ssa_name_ptr_info (aggr_ptr_init, DR_PTR_INFO (dr));
3919 gimple_assign_set_lhs (vec_stmt, aggr_ptr_init);
3920 if (pe)
3921 {
3922 new_bb = gsi_insert_on_edge_immediate (pe, vec_stmt);
3923 gcc_assert (!new_bb);
3924 }
3925 else
3926 gsi_insert_before (gsi, vec_stmt, GSI_SAME_STMT);
3927 }
3928 else
3929 aggr_ptr_init = new_temp;
3930
3931 /* (3) Handle the updating of the aggregate-pointer inside the loop.
3932 This is needed when ONLY_INIT is false, and also when AT_LOOP is the
3933 inner-loop nested in LOOP (during outer-loop vectorization). */
3934
3935 /* No update in loop is required. */
3936 if (only_init && (!loop_vinfo || at_loop == loop))
3937 aptr = aggr_ptr_init;
3938 else
3939 {
3940 /* The step of the aggregate pointer is the type size. */
3941 tree step = TYPE_SIZE_UNIT (aggr_type);
3942 /* One exception to the above is when the scalar step of the load in
3943 LOOP is zero. In this case the step here is also zero. */
3944 if (*inv_p)
3945 step = size_zero_node;
3946 else if (negative)
3947 step = fold_build1 (NEGATE_EXPR, TREE_TYPE (step), step);
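/* E.g., for a V8HI aggregate the pointer is advanced by 16 bytes per
   vectorized iteration, by 0 bytes for an invariant access, and by
   -16 bytes for a negative-step access.  */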
3948
3949 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
3950
3951 create_iv (aggr_ptr_init,
3952 fold_convert (aggr_ptr_type, step),
3953 aggr_ptr, loop, &incr_gsi, insert_after,
3954 &indx_before_incr, &indx_after_incr);
3955 incr = gsi_stmt (incr_gsi);
3956 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3957
3958 /* Copy the points-to information if it exists. */
3959 if (DR_PTR_INFO (dr))
3960 {
3961 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3962 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3963 }
3964 if (ptr_incr)
3965 *ptr_incr = incr;
3966
3967 aptr = indx_before_incr;
3968 }
3969
3970 if (!nested_in_vect_loop || only_init)
3971 return aptr;
3972
3973
3974 /* (4) Handle the updating of the aggregate-pointer inside the inner-loop
3975 nested in LOOP, if it exists. */
3976
3977 gcc_assert (nested_in_vect_loop);
3978 if (!only_init)
3979 {
3980 standard_iv_increment_position (containing_loop, &incr_gsi,
3981 &insert_after);
3982 create_iv (aptr, fold_convert (aggr_ptr_type, DR_STEP (dr)), aggr_ptr,
3983 containing_loop, &incr_gsi, insert_after, &indx_before_incr,
3984 &indx_after_incr);
3985 incr = gsi_stmt (incr_gsi);
3986 set_vinfo_for_stmt (incr, new_stmt_vec_info (incr, loop_vinfo, NULL));
3987
3988 /* Copy the points-to information if it exists. */
3989 if (DR_PTR_INFO (dr))
3990 {
3991 duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr));
3992 duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr));
3993 }
3994 if (ptr_incr)
3995 *ptr_incr = incr;
3996
3997 return indx_before_incr;
3998 }
3999 else
4000 gcc_unreachable ();
4001 }
4002
4003
4004 /* Function bump_vector_ptr
4005
4006 Increment a pointer (to a vector type) by vector-size. If requested,
4007 i.e. if PTR_INCR is given, then also connect the new increment stmt
4008 to the existing def-use update-chain of the pointer, by modifying
4009 the PTR_INCR as illustrated below:
4010
4011 The pointer def-use update-chain before this function:
4012 DATAREF_PTR = phi (p_0, p_2)
4013 ....
4014 PTR_INCR: p_2 = DATAREF_PTR + step
4015
4016 The pointer def-use update-chain after this function:
4017 DATAREF_PTR = phi (p_0, p_2)
4018 ....
4019 NEW_DATAREF_PTR = DATAREF_PTR + BUMP
4020 ....
4021 PTR_INCR: p_2 = NEW_DATAREF_PTR + step
4022
4023 Input:
4024 DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated
4025 in the loop.
4026 PTR_INCR - optional. The stmt that updates the pointer in each iteration of
4027 the loop. The increment amount across iterations is expected
4028 to be vector_size.
4029 BSI - location where the new update stmt is to be placed.
4030 STMT - the original scalar memory-access stmt that is being vectorized.
4031 BUMP - optional. The offset by which to bump the pointer. If not given,
4032 the offset is assumed to be vector_size.
4033
4034 Output: Return NEW_DATAREF_PTR as illustrated above.
4035
4036 */
4037
4038 tree
4039 bump_vector_ptr (tree dataref_ptr, gimple ptr_incr, gimple_stmt_iterator *gsi,
4040 gimple stmt, tree bump)
4041 {
4042 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4043 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4044 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4045 tree update = TYPE_SIZE_UNIT (vectype);
4046 gimple incr_stmt;
4047 ssa_op_iter iter;
4048 use_operand_p use_p;
4049 tree new_dataref_ptr;
4050
4051 if (bump)
4052 update = bump;
4053
4054 new_dataref_ptr = copy_ssa_name (dataref_ptr, NULL);
4055 incr_stmt = gimple_build_assign_with_ops (POINTER_PLUS_EXPR, new_dataref_ptr,
4056 dataref_ptr, update);
4057 vect_finish_stmt_generation (stmt, incr_stmt, gsi);
4058
4059 /* Copy the points-to information if it exists. */
4060 if (DR_PTR_INFO (dr))
4061 {
4062 duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr));
4063 mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (new_dataref_ptr));
4064 }
4065
4066 if (!ptr_incr)
4067 return new_dataref_ptr;
4068
4069 /* Update the vector-pointer's cross-iteration increment. */
4070 FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE)
4071 {
4072 tree use = USE_FROM_PTR (use_p);
4073
4074 if (use == dataref_ptr)
4075 SET_USE (use_p, new_dataref_ptr);
4076 else
4077 gcc_assert (tree_int_cst_compare (use, update) == 0);
4078 }
4079
4080 return new_dataref_ptr;
4081 }
4082
4083
4084 /* Function vect_create_destination_var.
4085
4086 Create a new temporary of type VECTYPE. */
4087
4088 tree
4089 vect_create_destination_var (tree scalar_dest, tree vectype)
4090 {
4091 tree vec_dest;
4092 const char *new_name;
4093 tree type;
4094 enum vect_var_kind kind;
4095
4096 kind = vectype ? vect_simple_var : vect_scalar_var;
4097 type = vectype ? vectype : TREE_TYPE (scalar_dest);
4098
4099 gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME);
4100
4101 new_name = get_name (scalar_dest);
4102 if (!new_name)
4103 new_name = "var_";
4104 vec_dest = vect_get_new_vect_var (type, kind, new_name);
4105
4106 return vec_dest;
4107 }
4108
4109 /* Function vect_grouped_store_supported.
4110
4111 Returns TRUE if interleave high and interleave low permutations
4112 are supported, and FALSE otherwise. */
4113
4114 bool
4115 vect_grouped_store_supported (tree vectype, unsigned HOST_WIDE_INT count)
4116 {
4117 enum machine_mode mode = TYPE_MODE (vectype);
4118
4119 /* vect_permute_store_chain requires the group size to be a power of two. */
4120 if (exact_log2 (count) == -1)
4121 {
4122 if (dump_enabled_p ())
4123 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4124 "the size of the group of accesses"
4125 " is not a power of 2");
4126 return false;
4127 }
4128
4129 /* Check that the permutation is supported. */
4130 if (VECTOR_MODE_P (mode))
4131 {
4132 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4133 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4134 for (i = 0; i < nelt / 2; i++)
4135 {
4136 sel[i * 2] = i;
4137 sel[i * 2 + 1] = i + nelt;
4138 }
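/* E.g., for nelt == 8 this builds the interleave-high selector
   { 0, 8, 1, 9, 2, 10, 3, 11 }; adding nelt/2 below yields the
   interleave-low selector { 4, 12, 5, 13, 6, 14, 7, 15 }.  */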
4139 if (can_vec_perm_p (mode, false, sel))
4140 {
4141 for (i = 0; i < nelt; i++)
4142 sel[i] += nelt / 2;
4143 if (can_vec_perm_p (mode, false, sel))
4144 return true;
4145 }
4146 }
4147
4148 if (dump_enabled_p ())
4149 dump_printf (MSG_MISSED_OPTIMIZATION,
4150 "interleave op not supported by target.");
4151 return false;
4152 }
4153
4154
4155 /* Return TRUE if vec_store_lanes is available for COUNT vectors of
4156 type VECTYPE. */
4157
4158 bool
4159 vect_store_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4160 {
4161 return vect_lanes_optab_supported_p ("vec_store_lanes",
4162 vec_store_lanes_optab,
4163 vectype, count);
4164 }
4165
4166
4167 /* Function vect_permute_store_chain.
4168
4169 Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be
4170 a power of 2, generate interleave_high/low stmts to reorder the data
4171 correctly for the stores. Return the final references for stores in
4172 RESULT_CHAIN.
4173
4174 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4175 The input is 4 vectors each containing 8 elements. We assign a number to
4176 each element, the input sequence is:
4177
4178 1st vec: 0 1 2 3 4 5 6 7
4179 2nd vec: 8 9 10 11 12 13 14 15
4180 3rd vec: 16 17 18 19 20 21 22 23
4181 4th vec: 24 25 26 27 28 29 30 31
4182
4183 The output sequence should be:
4184
4185 1st vec: 0 8 16 24 1 9 17 25
4186 2nd vec: 2 10 18 26 3 11 19 27
4187 3rd vec: 4 12 20 28 5 13 21 29
4188 4th vec: 6 14 22 30 7 15 23 31
4189
4190 i.e., we interleave the contents of the four vectors in their order.
4191
4192 We use interleave_high/low instructions to create such output. The input of
4193 each interleave_high/low operation is two vectors:
4194 1st vec 2nd vec
4195 0 1 2 3 4 5 6 7
4196 the even elements of the result vector are obtained left-to-right from the
4197 high/low elements of the first vector. The odd elements of the result are
4198 obtained left-to-right from the high/low elements of the second vector.
4199 The output of interleave_high will be: 0 4 1 5
4200 and of interleave_low: 2 6 3 7
4201
4202
4203 The permutation is done in log LENGTH stages. In each stage interleave_high
4204 and interleave_low stmts are created for each pair of vectors in DR_CHAIN,
4205 where the first argument is taken from the first half of DR_CHAIN and the
4206 second argument from its second half.
4207 In our example,
4208
4209 I1: interleave_high (1st vec, 3rd vec)
4210 I2: interleave_low (1st vec, 3rd vec)
4211 I3: interleave_high (2nd vec, 4th vec)
4212 I4: interleave_low (2nd vec, 4th vec)
4213
4214 The output for the first stage is:
4215
4216 I1: 0 16 1 17 2 18 3 19
4217 I2: 4 20 5 21 6 22 7 23
4218 I3: 8 24 9 25 10 26 11 27
4219 I4: 12 28 13 29 14 30 15 31
4220
4221 The output of the second stage, i.e. the final result is:
4222
4223 I1: 0 8 16 24 1 9 17 25
4224 I2: 2 10 18 26 3 11 19 27
4225 I3: 4 12 20 28 5 13 21 29
4226 I4: 6 14 22 30 7 15 23 31. */
4227
4228 void
4229 vect_permute_store_chain (VEC(tree,heap) *dr_chain,
4230 unsigned int length,
4231 gimple stmt,
4232 gimple_stmt_iterator *gsi,
4233 VEC(tree,heap) **result_chain)
4234 {
4235 tree vect1, vect2, high, low;
4236 gimple perm_stmt;
4237 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4238 tree perm_mask_low, perm_mask_high;
4239 unsigned int i, n;
4240 unsigned int j, nelt = TYPE_VECTOR_SUBPARTS (vectype);
4241 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4242
4243 *result_chain = VEC_copy (tree, heap, dr_chain);
4244
4245 for (i = 0, n = nelt / 2; i < n; i++)
4246 {
4247 sel[i * 2] = i;
4248 sel[i * 2 + 1] = i + nelt;
4249 }
4250 perm_mask_high = vect_gen_perm_mask (vectype, sel);
4251 gcc_assert (perm_mask_high != NULL);
4252
4253 for (i = 0; i < nelt; i++)
4254 sel[i] += nelt / 2;
4255 perm_mask_low = vect_gen_perm_mask (vectype, sel);
4256 gcc_assert (perm_mask_low != NULL);
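/* These are the same selectors checked in vect_grouped_store_supported:
   for nelt == 8, PERM_MASK_HIGH picks { 0, 8, 1, 9, 2, 10, 3, 11 } and
   PERM_MASK_LOW picks { 4, 12, 5, 13, 6, 14, 7, 15 } out of the
   concatenation of the two input vectors.  */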
4257
4258 for (i = 0, n = exact_log2 (length); i < n; i++)
4259 {
4260 for (j = 0; j < length/2; j++)
4261 {
4262 vect1 = VEC_index (tree, dr_chain, j);
4263 vect2 = VEC_index (tree, dr_chain, j+length/2);
4264
4265 /* Create interleaving stmt:
4266 high = VEC_PERM_EXPR <vect1, vect2, {0, nelt, 1, nelt+1, ...}> */
4267 high = make_temp_ssa_name (vectype, NULL, "vect_inter_high");
4268 perm_stmt
4269 = gimple_build_assign_with_ops (VEC_PERM_EXPR, high,
4270 vect1, vect2, perm_mask_high);
4271 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4272 VEC_replace (tree, *result_chain, 2*j, high);
4273
4274 /* Create interleaving stmt:
4275 low = VEC_PERM_EXPR <vect1, vect2, {nelt/2, nelt*3/2, nelt/2+1,
4276 nelt*3/2+1, ...}> */
4277 low = make_temp_ssa_name (vectype, NULL, "vect_inter_low");
4278 perm_stmt
4279 = gimple_build_assign_with_ops (VEC_PERM_EXPR, low,
4280 vect1, vect2, perm_mask_low);
4281 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4282 VEC_replace (tree, *result_chain, 2*j+1, low);
4283 }
4284 dr_chain = VEC_copy (tree, heap, *result_chain);
4285 }
4286 }
4287
4288 /* Function vect_setup_realignment
4289
4290 This function is called when vectorizing an unaligned load using
4291 the dr_explicit_realign[_optimized] scheme.
4292 This function generates the following code at the loop prolog:
4293
4294 p = initial_addr;
4295 x msq_init = *(floor(p)); # prolog load
4296 realignment_token = call target_builtin;
4297 loop:
4298 x msq = phi (msq_init, ---)
4299
4300 The stmts marked with x are generated only for the case of
4301 dr_explicit_realign_optimized.
4302
4303 The code above sets up a new (vector) pointer, pointing to the first
4304 location accessed by STMT, and a "floor-aligned" load using that pointer.
4305 It also generates code to compute the "realignment-token" (if the relevant
4306 target hook was defined), and creates a phi-node at the loop-header bb
4307 whose arguments are the result of the prolog-load (created by this
4308 function) and the result of a load that takes place in the loop (to be
4309 created by the caller to this function).
4310
4311 For the case of dr_explicit_realign_optimized:
4312 The caller to this function uses the phi-result (msq) to create the
4313 realignment code inside the loop, and sets up the missing phi argument,
4314 as follows:
4315 loop:
4316 msq = phi (msq_init, lsq)
4317 lsq = *(floor(p')); # load in loop
4318 result = realign_load (msq, lsq, realignment_token);
4319
4320 For the case of dr_explicit_realign:
4321 loop:
4322 msq = *(floor(p)); # load in loop
4323 p' = p + (VS-1);
4324 lsq = *(floor(p')); # load in loop
4325 result = realign_load (msq, lsq, realignment_token);
4326
4327 Input:
4328 STMT - (scalar) load stmt to be vectorized. This load accesses
4329 a memory location that may be unaligned.
4330 BSI - place where new code is to be inserted.
4331 ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes
4332 is used.
4333
4334 Output:
4335 REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load
4336 target hook, if defined.
4337 Return value - the result of the loop-header phi node. */
4338
4339 tree
4340 vect_setup_realignment (gimple stmt, gimple_stmt_iterator *gsi,
4341 tree *realignment_token,
4342 enum dr_alignment_support alignment_support_scheme,
4343 tree init_addr,
4344 struct loop **at_loop)
4345 {
4346 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4347 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4348 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4349 struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info);
4350 struct loop *loop = NULL;
4351 edge pe = NULL;
4352 tree scalar_dest = gimple_assign_lhs (stmt);
4353 tree vec_dest;
4354 gimple inc;
4355 tree ptr;
4356 tree data_ref;
4357 gimple new_stmt;
4358 basic_block new_bb;
4359 tree msq_init = NULL_TREE;
4360 tree new_temp;
4361 gimple phi_stmt;
4362 tree msq = NULL_TREE;
4363 gimple_seq stmts = NULL;
4364 bool inv_p;
4365 bool compute_in_loop = false;
4366 bool nested_in_vect_loop = false;
4367 struct loop *containing_loop = (gimple_bb (stmt))->loop_father;
4368 struct loop *loop_for_initial_load = NULL;
4369
4370 if (loop_vinfo)
4371 {
4372 loop = LOOP_VINFO_LOOP (loop_vinfo);
4373 nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt);
4374 }
4375
4376 gcc_assert (alignment_support_scheme == dr_explicit_realign
4377 || alignment_support_scheme == dr_explicit_realign_optimized);
4378
4379 /* We need to generate three things:
4380 1. the misalignment computation
4381 2. the extra vector load (for the optimized realignment scheme).
4382 3. the phi node for the two vectors from which the realignment is
4383 done (for the optimized realignment scheme). */
4384
4385 /* 1. Determine where to generate the misalignment computation.
4386
4387 If INIT_ADDR is NULL_TREE, this indicates that the misalignment
4388 calculation will be generated by this function, outside the loop (in the
4389 preheader). Otherwise, INIT_ADDR had already been computed for us by the
4390 caller, inside the loop.
4391
4392 Background: If the misalignment remains fixed throughout the iterations of
4393 the loop, then both realignment schemes are applicable, and also the
4394 misalignment computation can be done outside LOOP. This is because we are
4395 vectorizing LOOP, and so the memory accesses in LOOP advance in steps that
4396 are a multiple of VS (the Vector Size), and therefore the misalignment in
4397 different vectorized LOOP iterations is always the same.
4398 The problem arises only if the memory access is in an inner-loop nested
4399 inside LOOP, which is now being vectorized using outer-loop vectorization.
4400 This is the only case when the misalignment of the memory access may not
4401 remain fixed throughout the iterations of the inner-loop (as explained in
4402 detail in vect_supportable_dr_alignment). In this case, not only is the
4403 optimized realignment scheme not applicable, but also the misalignment
4404 computation (and generation of the realignment token that is passed to
4405 REALIGN_LOAD) have to be done inside the loop.
4406
4407 In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode
4408 or not, which in turn determines if the misalignment is computed inside
4409 the inner-loop, or outside LOOP. */
4410
4411 if (init_addr != NULL_TREE || !loop_vinfo)
4412 {
4413 compute_in_loop = true;
4414 gcc_assert (alignment_support_scheme == dr_explicit_realign);
4415 }
4416
4417
4418 /* 2. Determine where to generate the extra vector load.
4419
4420 For the optimized realignment scheme, instead of generating two vector
4421 loads in each iteration, we generate a single extra vector load in the
4422 preheader of the loop, and in each iteration reuse the result of the
4423 vector load from the previous iteration. In case the memory access is in
4424 an inner-loop nested inside LOOP, which is now being vectorized using
4425 outer-loop vectorization, we need to determine whether this initial vector
4426 load should be generated at the preheader of the inner-loop, or can be
4427 generated at the preheader of LOOP. If the memory access has no evolution
4428 in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has
4429 to be generated inside LOOP (in the preheader of the inner-loop). */
4430
4431 if (nested_in_vect_loop)
4432 {
4433 tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info);
4434 bool invariant_in_outerloop =
4435 (tree_int_cst_compare (outerloop_step, size_zero_node) == 0);
4436 loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner);
4437 }
4438 else
4439 loop_for_initial_load = loop;
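/* E.g., for an inner-loop access like in[i+j] the step relative to the
   outer i-loop is nonzero, so the initial load is emitted in the preheader
   of the inner loop; an access that is invariant in the outer loop (and any
   access when LOOP is inner-most) is loaded in the preheader of LOOP.  */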
4440 if (at_loop)
4441 *at_loop = loop_for_initial_load;
4442
4443 if (loop_for_initial_load)
4444 pe = loop_preheader_edge (loop_for_initial_load);
4445
4446 /* 3. For the case of the optimized realignment, create the first vector
4447 load at the loop preheader. */
4448
4449 if (alignment_support_scheme == dr_explicit_realign_optimized)
4450 {
4451 /* Create msq_init = *(floor(p1)) in the loop preheader */
4452
4453 gcc_assert (!compute_in_loop);
4454 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4455 ptr = vect_create_data_ref_ptr (stmt, vectype, loop_for_initial_load,
4456 NULL_TREE, &init_addr, NULL, &inc,
4457 true, &inv_p);
4458 new_temp = copy_ssa_name (ptr, NULL);
4459 new_stmt = gimple_build_assign_with_ops
4460 (BIT_AND_EXPR, new_temp, ptr,
4461 build_int_cst (TREE_TYPE (ptr),
4462 -(HOST_WIDE_INT)TYPE_ALIGN_UNIT (vectype)));
4463 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4464 gcc_assert (!new_bb);
4465 data_ref
4466 = build2 (MEM_REF, TREE_TYPE (vec_dest), new_temp,
4467 build_int_cst (reference_alias_ptr_type (DR_REF (dr)), 0));
4468 new_stmt = gimple_build_assign (vec_dest, data_ref);
4469 new_temp = make_ssa_name (vec_dest, new_stmt);
4470 gimple_assign_set_lhs (new_stmt, new_temp);
4471 if (pe)
4472 {
4473 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4474 gcc_assert (!new_bb);
4475 }
4476 else
4477 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4478
4479 msq_init = gimple_assign_lhs (new_stmt);
4480 }
4481
4482 /* 4. Create realignment token using a target builtin, if available.
4483 It is done either inside the containing loop, or before LOOP (as
4484 determined above). */
4485
4486 if (targetm.vectorize.builtin_mask_for_load)
4487 {
4488 tree builtin_decl;
4489
4490 /* Compute INIT_ADDR - the initial address accessed by this memref. */
4491 if (!init_addr)
4492 {
4493 /* Generate the INIT_ADDR computation outside LOOP. */
4494 init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts,
4495 NULL_TREE, loop);
4496 if (loop)
4497 {
4498 pe = loop_preheader_edge (loop);
4499 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
4500 gcc_assert (!new_bb);
4501 }
4502 else
4503 gsi_insert_seq_before (gsi, stmts, GSI_SAME_STMT);
4504 }
4505
4506 builtin_decl = targetm.vectorize.builtin_mask_for_load ();
4507 new_stmt = gimple_build_call (builtin_decl, 1, init_addr);
4508 vec_dest =
4509 vect_create_destination_var (scalar_dest,
4510 gimple_call_return_type (new_stmt));
4511 new_temp = make_ssa_name (vec_dest, new_stmt);
4512 gimple_call_set_lhs (new_stmt, new_temp);
4513
4514 if (compute_in_loop)
4515 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
4516 else
4517 {
4518 /* Generate the misalignment computation outside LOOP. */
4519 pe = loop_preheader_edge (loop);
4520 new_bb = gsi_insert_on_edge_immediate (pe, new_stmt);
4521 gcc_assert (!new_bb);
4522 }
4523
4524 *realignment_token = gimple_call_lhs (new_stmt);
4525
4526 /* The result of the CALL_EXPR to this builtin is determined from
4527 the value of the parameter and no global variables are touched,
4528 which makes the builtin a "const" function. Requiring the
4529 builtin to have the "const" attribute makes it unnecessary
4530 to call mark_call_clobbered. */
4531 gcc_assert (TREE_READONLY (builtin_decl));
4532 }
4533
4534 if (alignment_support_scheme == dr_explicit_realign)
4535 return msq;
4536
4537 gcc_assert (!compute_in_loop);
4538 gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized);
4539
4540
4541 /* 5. Create msq = phi <msq_init, lsq> in loop */
4542
4543 pe = loop_preheader_edge (containing_loop);
4544 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4545 msq = make_ssa_name (vec_dest, NULL);
4546 phi_stmt = create_phi_node (msq, containing_loop->header);
4547 add_phi_arg (phi_stmt, msq_init, pe, UNKNOWN_LOCATION);
4548
4549 return msq;
4550 }
4551
4552
4553 /* Function vect_grouped_load_supported.
4554
4555 Returns TRUE if even and odd permutations are supported,
4556 and FALSE otherwise. */
4557
4558 bool
4559 vect_grouped_load_supported (tree vectype, unsigned HOST_WIDE_INT count)
4560 {
4561 enum machine_mode mode = TYPE_MODE (vectype);
4562
4563 /* vect_permute_load_chain requires the group size to be a power of two. */
4564 if (exact_log2 (count) == -1)
4565 {
4566 if (dump_enabled_p ())
4567 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4568 "the size of the group of accesses"
4569 " is not a power of 2");
4570 return false;
4571 }
4572
4573 /* Check that the permutation is supported. */
4574 if (VECTOR_MODE_P (mode))
4575 {
4576 unsigned int i, nelt = GET_MODE_NUNITS (mode);
4577 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4578
4579 for (i = 0; i < nelt; i++)
4580 sel[i] = i * 2;
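/* E.g., for nelt == 4 this is the extract-even selector { 0, 2, 4, 6 },
   picking the even elements of the two concatenated input vectors; the
   selector built below, { 1, 3, 5, 7 }, extracts the odd elements.  */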
4581 if (can_vec_perm_p (mode, false, sel))
4582 {
4583 for (i = 0; i < nelt; i++)
4584 sel[i] = i * 2 + 1;
4585 if (can_vec_perm_p (mode, false, sel))
4586 return true;
4587 }
4588 }
4589
4590 if (dump_enabled_p ())
4591 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4592 "extract even/odd not supported by target");
4593 return false;
4594 }
4595
4596 /* Return TRUE if vec_load_lanes is available for COUNT vectors of
4597 type VECTYPE. */
4598
4599 bool
4600 vect_load_lanes_supported (tree vectype, unsigned HOST_WIDE_INT count)
4601 {
4602 return vect_lanes_optab_supported_p ("vec_load_lanes",
4603 vec_load_lanes_optab,
4604 vectype, count);
4605 }
4606
4607 /* Function vect_permute_load_chain.
4608
4609 Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be
4610 a power of 2, generate extract_even/odd stmts to reorder the input data
4611 correctly. Return the final references for loads in RESULT_CHAIN.
4612
4613 E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8.
4614 The input is 4 vectors each containing 8 elements. We assign a number to each
4615 element, the input sequence is:
4616
4617 1st vec: 0 1 2 3 4 5 6 7
4618 2nd vec: 8 9 10 11 12 13 14 15
4619 3rd vec: 16 17 18 19 20 21 22 23
4620 4th vec: 24 25 26 27 28 29 30 31
4621
4622 The output sequence should be:
4623
4624 1st vec: 0 4 8 12 16 20 24 28
4625 2nd vec: 1 5 9 13 17 21 25 29
4626 3rd vec: 2 6 10 14 18 22 26 30
4627 4th vec: 3 7 11 15 19 23 27 31
4628
4629 i.e., the first output vector should contain the first elements of each
4630 interleaving group, etc.
4631
4632 We use extract_even/odd instructions to create such output. The input of
4633 each extract_even/odd operation is two vectors
4634 1st vec 2nd vec
4635 0 1 2 3 4 5 6 7
4636
4637 and the output is the vector of extracted even/odd elements. The output of
4638 extract_even will be: 0 2 4 6
4639 and of extract_odd: 1 3 5 7
4640
4641
4642 The permutation is done in log LENGTH stages. In each stage extract_even
4643 and extract_odd stmts are created for each pair of vectors in DR_CHAIN in
4644 their order. In our example,
4645
4646 E1: extract_even (1st vec, 2nd vec)
4647 E2: extract_odd (1st vec, 2nd vec)
4648 E3: extract_even (3rd vec, 4th vec)
4649 E4: extract_odd (3rd vec, 4th vec)
4650
4651 The output for the first stage will be:
4652
4653 E1: 0 2 4 6 8 10 12 14
4654 E2: 1 3 5 7 9 11 13 15
4655 E3: 16 18 20 22 24 26 28 30
4656 E4: 17 19 21 23 25 27 29 31
4657
4658 In order to proceed and create the correct sequence for the next stage (or
4659 for the correct output, if the second stage is the last one, as in our
4660 example), we first put the output of extract_even operation and then the
4661 output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN).
4662 The input for the second stage is:
4663
4664 1st vec (E1): 0 2 4 6 8 10 12 14
4665 2nd vec (E3): 16 18 20 22 24 26 28 30
4666 3rd vec (E2): 1 3 5 7 9 11 13 15
4667 4th vec (E4): 17 19 21 23 25 27 29 31
4668
4669 The output of the second stage:
4670
4671 E1: 0 4 8 12 16 20 24 28
4672 E2: 2 6 10 14 18 22 26 30
4673 E3: 1 5 9 13 17 21 25 29
4674 E4: 3 7 11 15 19 23 27 31
4675
4676 And RESULT_CHAIN after reordering:
4677
4678 1st vec (E1): 0 4 8 12 16 20 24 28
4679 2nd vec (E3): 1 5 9 13 17 21 25 29
4680 3rd vec (E2): 2 6 10 14 18 22 26 30
4681 4th vec (E4): 3 7 11 15 19 23 27 31. */
4682
4683 static void
4684 vect_permute_load_chain (VEC(tree,heap) *dr_chain,
4685 unsigned int length,
4686 gimple stmt,
4687 gimple_stmt_iterator *gsi,
4688 VEC(tree,heap) **result_chain)
4689 {
4690 tree data_ref, first_vect, second_vect;
4691 tree perm_mask_even, perm_mask_odd;
4692 gimple perm_stmt;
4693 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt));
4694 unsigned int i, j, log_length = exact_log2 (length);
4695 unsigned nelt = TYPE_VECTOR_SUBPARTS (vectype);
4696 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
4697
4698 *result_chain = VEC_copy (tree, heap, dr_chain);
4699
4700 for (i = 0; i < nelt; ++i)
4701 sel[i] = i * 2;
4702 perm_mask_even = vect_gen_perm_mask (vectype, sel);
4703 gcc_assert (perm_mask_even != NULL);
4704
4705 for (i = 0; i < nelt; ++i)
4706 sel[i] = i * 2 + 1;
4707 perm_mask_odd = vect_gen_perm_mask (vectype, sel);
4708 gcc_assert (perm_mask_odd != NULL);
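/* E.g., for nelt == 8, PERM_MASK_EVEN is { 0, 2, 4, 6, 8, 10, 12, 14 }
   and PERM_MASK_ODD is { 1, 3, 5, 7, 9, 11, 13, 15 }, selecting the even
   and odd elements of the concatenation <first_vect, second_vect>.  */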
4709
4710 for (i = 0; i < log_length; i++)
4711 {
4712 for (j = 0; j < length; j += 2)
4713 {
4714 first_vect = VEC_index (tree, dr_chain, j);
4715 second_vect = VEC_index (tree, dr_chain, j+1);
4716
4717 /* data_ref = permute_even (first_data_ref, second_data_ref); */
4718 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_even");
4719 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4720 first_vect, second_vect,
4721 perm_mask_even);
4722 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4723 VEC_replace (tree, *result_chain, j/2, data_ref);
4724
4725 /* data_ref = permute_odd (first_data_ref, second_data_ref); */
4726 data_ref = make_temp_ssa_name (vectype, NULL, "vect_perm_odd");
4727 perm_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR, data_ref,
4728 first_vect, second_vect,
4729 perm_mask_odd);
4730 vect_finish_stmt_generation (stmt, perm_stmt, gsi);
4731 VEC_replace (tree, *result_chain, j/2+length/2, data_ref);
4732 }
4733 dr_chain = VEC_copy (tree, heap, *result_chain);
4734 }
4735 }
4736
4737
4738 /* Function vect_transform_grouped_load.
4739
4740 Given a chain of input interleaved data-refs (in DR_CHAIN), build statements
4741 to perform their permutation, and assign the resulting vectorized statements
4742 to the scalar statements.
4743 */
4744
4745 void
4746 vect_transform_grouped_load (gimple stmt, VEC(tree,heap) *dr_chain, int size,
4747 gimple_stmt_iterator *gsi)
4748 {
4749 VEC(tree,heap) *result_chain = NULL;
4750
4751 /* DR_CHAIN contains input data-refs that are a part of the interleaving.
4752 RESULT_CHAIN is the output of vect_permute_load_chain; it contains the
4753 permuted vectors that are ready for vector computation. */
4754 result_chain = VEC_alloc (tree, heap, size);
4755 vect_permute_load_chain (dr_chain, size, stmt, gsi, &result_chain);
4756 vect_record_grouped_load_vectors (stmt, result_chain);
4757 VEC_free (tree, heap, result_chain);
4758 }
4759
4760 /* RESULT_CHAIN contains the output of a group of grouped loads that were
4761 generated as part of the vectorization of STMT. Assign the statement
4762 for each vector to the associated scalar statement. */
4763
4764 void
4765 vect_record_grouped_load_vectors (gimple stmt, VEC(tree,heap) *result_chain)
4766 {
4767 gimple first_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt));
4768 gimple next_stmt, new_stmt;
4769 unsigned int i, gap_count;
4770 tree tmp_data_ref;
4771
4772 /* Put a permuted data-ref in the VECTORIZED_STMT field.
4773 Since we scan the chain starting from its first node, their order
4774 corresponds to the order of data-refs in RESULT_CHAIN. */
4775 next_stmt = first_stmt;
4776 gap_count = 1;
4777 FOR_EACH_VEC_ELT (tree, result_chain, i, tmp_data_ref)
4778 {
4779 if (!next_stmt)
4780 break;
4781
4782 /* Skip the gaps. Loads created for the gaps will be removed later by the
4783 dead code elimination pass. No need to check for the first stmt in
4784 the group, since it always exists.
4785 GROUP_GAP is the number of steps in elements from the previous
4786 access (if there is no gap GROUP_GAP is 1). We skip loads that
4787 correspond to the gaps. */
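/* E.g., if the scalar accesses in the group are a[0], a[2] and a[3],
   GROUP_GAP of the a[2] access is 2, and the load created for the missing
   a[1] element is skipped here (and later removed as dead).  */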
4788 if (next_stmt != first_stmt
4789 && gap_count < GROUP_GAP (vinfo_for_stmt (next_stmt)))
4790 {
4791 gap_count++;
4792 continue;
4793 }
4794
4795 while (next_stmt)
4796 {
4797 new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref);
4798 /* We assume that if VEC_STMT is not NULL, this is a case of multiple
4799 copies, and we put the new vector statement in the first available
4800 RELATED_STMT. */
4801 if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)))
4802 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt;
4803 else
4804 {
4805 if (!GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4806 {
4807 gimple prev_stmt =
4808 STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt));
4809 gimple rel_stmt =
4810 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt));
4811 while (rel_stmt)
4812 {
4813 prev_stmt = rel_stmt;
4814 rel_stmt =
4815 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt));
4816 }
4817
4818 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) =
4819 new_stmt;
4820 }
4821 }
4822
4823 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
4824 gap_count = 1;
4825 /* If NEXT_STMT accesses the same DR as the previous statement,
4826 put the same TMP_DATA_REF as its vectorized statement; otherwise
4827 get the next data-ref from RESULT_CHAIN. */
4828 if (!next_stmt || !GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt)))
4829 break;
4830 }
4831 }
4832 }
4833
4834 /* Function vect_can_force_dr_alignment_p.
4835 
4836 Returns whether the alignment of a DECL can be forced to be aligned
4837 on an ALIGNMENT-bit boundary. */
4838
4839 bool
4840 vect_can_force_dr_alignment_p (const_tree decl, unsigned int alignment)
4841 {
4842 if (TREE_CODE (decl) != VAR_DECL)
4843 return false;
4844
4845 /* We cannot change alignment of common or external symbols as another
4846 translation unit may contain a definition with lower alignment.
4847 The rules of common symbol linking mean that the definition
4848 will override the common symbol. */
4849 if (DECL_EXTERNAL (decl)
4850 || DECL_COMMON (decl))
4851 return false;
4852
4853 if (TREE_ASM_WRITTEN (decl))
4854 return false;
4855
4856 /* Do not override the alignment as specified by the ABI when the used
4857 attribute is set. */
4858 if (DECL_PRESERVE_P (decl))
4859 return false;
4860
4861 /* Do not override explicit alignment set by the user when an explicit
4862 section name is also used. This is a common idiom used by many
4863 software projects. */
4864 if (DECL_SECTION_NAME (decl) != NULL_TREE
4865 && !DECL_HAS_IMPLICIT_SECTION_NAME_P (decl))
4866 return false;
4867
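/* What remains is limited only by how much the storage can be aligned:
   static storage by the object file format (MAX_OFILE_ALIGNMENT),
   automatic storage by the stack (MAX_STACK_ALIGNMENT).  */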
4868 if (TREE_STATIC (decl))
4869 return (alignment <= MAX_OFILE_ALIGNMENT);
4870 else
4871 return (alignment <= MAX_STACK_ALIGNMENT);
4872 }
4873
4874
4875 /* Return whether the data reference DR is supported with respect to its
4876 alignment.
4877 If CHECK_ALIGNED_ACCESSES is TRUE, check if the access is supported even
4878 if it is aligned, i.e., check if it is possible to vectorize it with different
4879 alignment. */
4880
4881 enum dr_alignment_support
4882 vect_supportable_dr_alignment (struct data_reference *dr,
4883 bool check_aligned_accesses)
4884 {
4885 gimple stmt = DR_STMT (dr);
4886 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4887 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4888 enum machine_mode mode = TYPE_MODE (vectype);
4889 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4890 struct loop *vect_loop = NULL;
4891 bool nested_in_vect_loop = false;
4892
4893 if (aligned_access_p (dr) && !check_aligned_accesses)
4894 return dr_aligned;
4895
4896 if (loop_vinfo)
4897 {
4898 vect_loop = LOOP_VINFO_LOOP (loop_vinfo);
4899 nested_in_vect_loop = nested_in_vect_loop_p (vect_loop, stmt);
4900 }
4901
4902 /* Possibly unaligned access. */
4903
4904 /* We can choose between using the implicit realignment scheme (generating
4905 a misaligned_move stmt) and the explicit realignment scheme (generating
4906 aligned loads with a REALIGN_LOAD). There are two variants to the
4907 explicit realignment scheme: optimized, and unoptimized.
4908 We can optimize the realignment only if the step between consecutive
4909 vector loads is equal to the vector size. Since the vector memory
4910 accesses advance in steps of VS (Vector Size) in the vectorized loop, it
4911 is guaranteed that the misalignment amount remains the same throughout the
4912 execution of the vectorized loop. Therefore, we can create the
4913 "realignment token" (the permutation mask that is passed to REALIGN_LOAD)
4914 at the loop preheader.
4915
4916 However, in the case of outer-loop vectorization, when vectorizing a
4917 memory access in the inner-loop nested within the LOOP that is now being
4918 vectorized, while it is guaranteed that the misalignment of the
4919 vectorized memory access will remain the same in different outer-loop
4920 iterations, it is *not* guaranteed that is will remain the same throughout
4921 the execution of the inner-loop. This is because the inner-loop advances
4922 with the original scalar step (and not in steps of VS). If the inner-loop
4923 step happens to be a multiple of VS, then the misalignment remains fixed
4924 and we can use the optimized realignment scheme. For example:
4925
4926 for (i=0; i<N; i++)
4927 for (j=0; j<M; j++)
4928 s += a[i+j];
4929
4930 When vectorizing the i-loop in the above example, the step between
4931 consecutive vector loads is 1, and so the misalignment does not remain
4932 fixed across the execution of the inner-loop, and the realignment cannot
4933 be optimized (as illustrated in the following pseudo vectorized loop):
4934
4935 for (i=0; i<N; i+=4)
4936 for (j=0; j<M; j++){
4937 vs += vp[i+j]; // misalignment of &vp[i+j] is {0,1,2,3,0,1,2,3,...}
4938 // when j is {0,1,2,3,4,5,6,7,...} respectively.
4939 // (assuming that we start from an aligned address).
4940 }
4941
4942 We therefore have to use the unoptimized realignment scheme:
4943
4944 for (i=0; i<N; i+=4)
4945 for (j=k; j<M; j+=4)
4946 vs += vp[i+j]; // misalignment of &vp[i+j] is always k (assuming
4947 // that the misalignment of the initial address is
4948 // 0).
4949
4950 The loop can then be vectorized as follows:
4951
4952 for (k=0; k<4; k++){
4953 rt = get_realignment_token (&vp[k]);
4954 for (i=0; i<N; i+=4){
4955 v1 = vp[i+k];
4956 for (j=k; j<M; j+=4){
4957 v2 = vp[i+j+VS-1];
4958 va = REALIGN_LOAD <v1,v2,rt>;
4959 vs += va;
4960 v1 = v2;
4961 }
4962 }
4963 } */
4964
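/* Loads may additionally use the explicit realignment schemes described
   above; otherwise both loads and stores depend on the target's
   misaligned vector move support.  */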
4965 if (DR_IS_READ (dr))
4966 {
4967 bool is_packed = false;
4968 tree type = (TREE_TYPE (DR_REF (dr)));
4969
4970 if (optab_handler (vec_realign_load_optab, mode) != CODE_FOR_nothing
4971 && (!targetm.vectorize.builtin_mask_for_load
4972 || targetm.vectorize.builtin_mask_for_load ()))
4973 {
4974 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
4975 if ((nested_in_vect_loop
4976 && (TREE_INT_CST_LOW (DR_STEP (dr))
4977 != GET_MODE_SIZE (TYPE_MODE (vectype))))
4978 || !loop_vinfo)
4979 return dr_explicit_realign;
4980 else
4981 return dr_explicit_realign_optimized;
4982 }
4983 if (!known_alignment_for_access_p (dr))
4984 is_packed = not_size_aligned (DR_REF (dr));
4985
4986 if (targetm.vectorize.
4987 support_vector_misalignment (mode, type,
4988 DR_MISALIGNMENT (dr), is_packed))
4989 /* Can't software pipeline the loads, but can at least do them. */
4990 return dr_unaligned_supported;
4991 }
4992 else
4993 {
4994 bool is_packed = false;
4995 tree type = (TREE_TYPE (DR_REF (dr)));
4996
4997 if (!known_alignment_for_access_p (dr))
4998 is_packed = not_size_aligned (DR_REF (dr));
4999
5000 if (targetm.vectorize.
5001 support_vector_misalignment (mode, type,
5002 DR_MISALIGNMENT (dr), is_packed))
5003 return dr_unaligned_supported;
5004 }
5005
5006 /* Unsupported. */
5007 return dr_unaligned_unsupported;
5008 }