gimple-walk.h: New File.
[gcc.git] / gcc / tree-ssa-loop-prefetch.c
1 /* Array prefetching.
2 Copyright (C) 2005-2013 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "tree.h"
25 #include "tm_p.h"
26 #include "basic-block.h"
27 #include "tree-pretty-print.h"
28 #include "gimplify.h"
29 #include "gimple-iterator.h"
30 #include "gimple-ssa.h"
31 #include "tree-ssa-loop-ivopts.h"
32 #include "tree-ssa-loop-manip.h"
33 #include "tree-ssa-loop-niter.h"
34 #include "tree-ssa-loop.h"
35 #include "tree-into-ssa.h"
36 #include "cfgloop.h"
37 #include "tree-pass.h"
38 #include "insn-config.h"
39 #include "hashtab.h"
40 #include "tree-chrec.h"
41 #include "tree-scalar-evolution.h"
42 #include "diagnostic-core.h"
43 #include "params.h"
44 #include "langhooks.h"
45 #include "tree-inline.h"
46 #include "tree-data-ref.h"
47
48
49 /* FIXME: Needed for optabs, but this should all be moved to a TBD interface
50 between the GIMPLE and RTL worlds. */
51 #include "expr.h"
52 #include "optabs.h"
53 #include "recog.h"
54
55 /* This pass inserts prefetch instructions to optimize cache usage during
56 accesses to arrays in loops. It processes loops sequentially and:
57
58 1) Gathers all memory references in the single loop.
59 2) For each of the references it decides when it is profitable to prefetch
60 it. To do it, we evaluate the reuse among the accesses, and determines
61 two values: PREFETCH_BEFORE (meaning that it only makes sense to do
62 prefetching in the first PREFETCH_BEFORE iterations of the loop) and
63 PREFETCH_MOD (meaning that it only makes sense to prefetch in the
64 iterations of the loop that are zero modulo PREFETCH_MOD). For example
65 (assuming cache line size is 64 bytes, char has size 1 byte and there
66 is no hardware sequential prefetch):
67
68 char *a;
69 for (i = 0; i < max; i++)
70 {
71 a[255] = ...; (0)
72 a[i] = ...; (1)
73 a[i + 64] = ...; (2)
74 a[16*i] = ...; (3)
75 a[187*i] = ...; (4)
76 a[187*i + 50] = ...; (5)
77 }
78
79 (0) obviously has PREFETCH_BEFORE 1
80 (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
81 location 64 iterations before it, and PREFETCH_MOD 64 (since
82 it hits the same cache line otherwise).
83 (2) has PREFETCH_MOD 64
84 (3) has PREFETCH_MOD 4
85 (4) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since
86 the cache line accessed by (5) is the same with probability only
87 7/32.
88 (5) has PREFETCH_MOD 1 as well.
89
90 Additionally, we use data dependence analysis to determine for each
91 reference the distance till the first reuse; this information is used
92 to determine the temporality of the issued prefetch instruction.
93
94 3) We determine how much ahead we need to prefetch. The number of
95 iterations needed is time to fetch / time spent in one iteration of
96 the loop. The problem is that we do not know either of these values,
97 so we just make a heuristic guess based on a magic (possibly)
98 target-specific constant and size of the loop.
99
100 4) Determine which of the references we prefetch. We take into account
101 that there is a maximum number of simultaneous prefetches (provided
102 by machine description). We prefetch as many prefetches as possible
103 while still within this bound (starting with those with lowest
104 prefetch_mod, since they are responsible for most of the cache
105 misses).
106
107 5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
108 and PREFETCH_BEFORE requirements (within some bounds), and to avoid
109 prefetching nonaccessed memory.
110 TODO -- actually implement peeling.
111
112 6) We actually emit the prefetch instructions. ??? Perhaps emit the
113 prefetch instructions with guards in cases where 5) was not sufficient
114 to satisfy the constraints?
115
116 A cost model is implemented to determine whether or not prefetching is
117 profitable for a given loop. The cost model has three heuristics:
118
119 1. Function trip_count_to_ahead_ratio_too_small_p implements a
120 heuristic that determines whether or not the loop has too few
121 iterations (compared to ahead). Prefetching is not likely to be
122 beneficial if the trip count to ahead ratio is below a certain
123 minimum.
124
125 2. Function mem_ref_count_reasonable_p implements a heuristic that
126 determines whether the given loop has enough CPU ops that can be
127 overlapped with cache missing memory ops. If not, the loop
128 won't benefit from prefetching. In the implementation,
129 prefetching is not considered beneficial if the ratio between
130 the instruction count and the mem ref count is below a certain
131 minimum.
132
133 3. Function insn_to_prefetch_ratio_too_small_p implements a
134 heuristic that disables prefetching in a loop if the prefetching
135 cost is above a certain limit. The relative prefetching cost is
136 estimated by taking the ratio between the prefetch count and the
137 total intruction count (this models the I-cache cost).
138
139 The limits used in these heuristics are defined as parameters with
140 reasonable default values. Machine-specific default values will be
141 added later.
142
143 Some other TODO:
144 -- write and use more general reuse analysis (that could be also used
145 in other cache aimed loop optimizations)
146 -- make it behave sanely together with the prefetches given by user
147 (now we just ignore them; at the very least we should avoid
148 optimizing loops in that user put his own prefetches)
149 -- we assume cache line size alignment of arrays; this could be
150 improved. */
151
152 /* Magic constants follow. These should be replaced by machine specific
153 numbers. */
154
155 /* True if write can be prefetched by a read prefetch. */
156
157 #ifndef WRITE_CAN_USE_READ_PREFETCH
158 #define WRITE_CAN_USE_READ_PREFETCH 1
159 #endif
160
161 /* True if read can be prefetched by a write prefetch. */
162
163 #ifndef READ_CAN_USE_WRITE_PREFETCH
164 #define READ_CAN_USE_WRITE_PREFETCH 0
165 #endif
166
167 /* The size of the block loaded by a single prefetch. Usually, this is
168 the same as cache line size (at the moment, we only consider one level
169 of cache hierarchy). */
170
171 #ifndef PREFETCH_BLOCK
172 #define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
173 #endif
174
175 /* Do we have a forward hardware sequential prefetching? */
176
177 #ifndef HAVE_FORWARD_PREFETCH
178 #define HAVE_FORWARD_PREFETCH 0
179 #endif
180
181 /* Do we have a backward hardware sequential prefetching? */
182
183 #ifndef HAVE_BACKWARD_PREFETCH
184 #define HAVE_BACKWARD_PREFETCH 0
185 #endif
186
187 /* In some cases we are only able to determine that there is a certain
188 probability that the two accesses hit the same cache line. In this
189 case, we issue the prefetches for both of them if this probability
190 is less then (1000 - ACCEPTABLE_MISS_RATE) per thousand. */
191
192 #ifndef ACCEPTABLE_MISS_RATE
193 #define ACCEPTABLE_MISS_RATE 50
194 #endif
195
196 #ifndef HAVE_prefetch
197 #define HAVE_prefetch 0
198 #endif
199
200 #define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
201 #define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))
202
203 /* We consider a memory access nontemporal if it is not reused sooner than
204 after L2_CACHE_SIZE_BYTES of memory are accessed. However, we ignore
205 accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
206 so that we use nontemporal prefetches e.g. if single memory location
207 is accessed several times in a single iteration of the loop. */
208 #define NONTEMPORAL_FRACTION 16
209
210 /* In case we have to emit a memory fence instruction after the loop that
211 uses nontemporal stores, this defines the builtin to use. */
212
213 #ifndef FENCE_FOLLOWING_MOVNT
214 #define FENCE_FOLLOWING_MOVNT NULL_TREE
215 #endif
216
217 /* It is not profitable to prefetch when the trip count is not at
218 least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
219 For example, in a loop with a prefetch ahead distance of 10,
220 supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
221 profitable to prefetch when the trip count is greater or equal to
222 40. In that case, 30 out of the 40 iterations will benefit from
223 prefetching. */
224
225 #ifndef TRIP_COUNT_TO_AHEAD_RATIO
226 #define TRIP_COUNT_TO_AHEAD_RATIO 4
227 #endif
228
229 /* The group of references between that reuse may occur. */
230
231 struct mem_ref_group
232 {
233 tree base; /* Base of the reference. */
234 tree step; /* Step of the reference. */
235 struct mem_ref *refs; /* References in the group. */
236 struct mem_ref_group *next; /* Next group of references. */
237 };
238
239 /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */
240
241 #define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0)
242
243 /* Do not generate a prefetch if the unroll factor is significantly less
244 than what is required by the prefetch. This is to avoid redundant
245 prefetches. For example, when prefetch_mod is 16 and unroll_factor is
246 2, prefetching requires unrolling the loop 16 times, but
247 the loop is actually unrolled twice. In this case (ratio = 8),
248 prefetching is not likely to be beneficial. */
249
250 #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
251 #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4
252 #endif
253
254 /* Some of the prefetch computations have quadratic complexity. We want to
255 avoid huge compile times and, therefore, want to limit the amount of
256 memory references per loop where we consider prefetching. */
257
258 #ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP
259 #define PREFETCH_MAX_MEM_REFS_PER_LOOP 200
260 #endif
261
262 /* The memory reference. */
263
264 struct mem_ref
265 {
266 gimple stmt; /* Statement in that the reference appears. */
267 tree mem; /* The reference. */
268 HOST_WIDE_INT delta; /* Constant offset of the reference. */
269 struct mem_ref_group *group; /* The group of references it belongs to. */
270 unsigned HOST_WIDE_INT prefetch_mod;
271 /* Prefetch only each PREFETCH_MOD-th
272 iteration. */
273 unsigned HOST_WIDE_INT prefetch_before;
274 /* Prefetch only first PREFETCH_BEFORE
275 iterations. */
276 unsigned reuse_distance; /* The amount of data accessed before the first
277 reuse of this value. */
278 struct mem_ref *next; /* The next reference in the group. */
279 unsigned write_p : 1; /* Is it a write? */
280 unsigned independent_p : 1; /* True if the reference is independent on
281 all other references inside the loop. */
282 unsigned issue_prefetch_p : 1; /* Should we really issue the prefetch? */
283 unsigned storent_p : 1; /* True if we changed the store to a
284 nontemporal one. */
285 };
286
287 /* Dumps information about memory reference */
288 static void
289 dump_mem_details (FILE *file, tree base, tree step,
290 HOST_WIDE_INT delta, bool write_p)
291 {
292 fprintf (file, "(base ");
293 print_generic_expr (file, base, TDF_SLIM);
294 fprintf (file, ", step ");
295 if (cst_and_fits_in_hwi (step))
296 fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (step));
297 else
298 print_generic_expr (file, step, TDF_TREE);
299 fprintf (file, ")\n");
300 fprintf (file, " delta ");
301 fprintf (file, HOST_WIDE_INT_PRINT_DEC, delta);
302 fprintf (file, "\n");
303 fprintf (file, " %s\n", write_p ? "write" : "read");
304 fprintf (file, "\n");
305 }
306
307 /* Dumps information about reference REF to FILE. */
308
309 static void
310 dump_mem_ref (FILE *file, struct mem_ref *ref)
311 {
312 fprintf (file, "Reference %p:\n", (void *) ref);
313
314 fprintf (file, " group %p ", (void *) ref->group);
315
316 dump_mem_details (file, ref->group->base, ref->group->step, ref->delta,
317 ref->write_p);
318 }
319
320 /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
321 exist. */
322
323 static struct mem_ref_group *
324 find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
325 {
326 struct mem_ref_group *group;
327
328 for (; *groups; groups = &(*groups)->next)
329 {
330 if (operand_equal_p ((*groups)->step, step, 0)
331 && operand_equal_p ((*groups)->base, base, 0))
332 return *groups;
333
334 /* If step is an integer constant, keep the list of groups sorted
335 by decreasing step. */
336 if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
337 && int_cst_value ((*groups)->step) < int_cst_value (step))
338 break;
339 }
340
341 group = XNEW (struct mem_ref_group);
342 group->base = base;
343 group->step = step;
344 group->refs = NULL;
345 group->next = *groups;
346 *groups = group;
347
348 return group;
349 }
350
351 /* Records a memory reference MEM in GROUP with offset DELTA and write status
352 WRITE_P. The reference occurs in statement STMT. */
353
354 static void
355 record_ref (struct mem_ref_group *group, gimple stmt, tree mem,
356 HOST_WIDE_INT delta, bool write_p)
357 {
358 struct mem_ref **aref;
359
360 /* Do not record the same address twice. */
361 for (aref = &group->refs; *aref; aref = &(*aref)->next)
362 {
363 /* It does not have to be possible for write reference to reuse the read
364 prefetch, or vice versa. */
365 if (!WRITE_CAN_USE_READ_PREFETCH
366 && write_p
367 && !(*aref)->write_p)
368 continue;
369 if (!READ_CAN_USE_WRITE_PREFETCH
370 && !write_p
371 && (*aref)->write_p)
372 continue;
373
374 if ((*aref)->delta == delta)
375 return;
376 }
377
378 (*aref) = XNEW (struct mem_ref);
379 (*aref)->stmt = stmt;
380 (*aref)->mem = mem;
381 (*aref)->delta = delta;
382 (*aref)->write_p = write_p;
383 (*aref)->prefetch_before = PREFETCH_ALL;
384 (*aref)->prefetch_mod = 1;
385 (*aref)->reuse_distance = 0;
386 (*aref)->issue_prefetch_p = false;
387 (*aref)->group = group;
388 (*aref)->next = NULL;
389 (*aref)->independent_p = false;
390 (*aref)->storent_p = false;
391
392 if (dump_file && (dump_flags & TDF_DETAILS))
393 dump_mem_ref (dump_file, *aref);
394 }
395
396 /* Release memory references in GROUPS. */
397
398 static void
399 release_mem_refs (struct mem_ref_group *groups)
400 {
401 struct mem_ref_group *next_g;
402 struct mem_ref *ref, *next_r;
403
404 for (; groups; groups = next_g)
405 {
406 next_g = groups->next;
407 for (ref = groups->refs; ref; ref = next_r)
408 {
409 next_r = ref->next;
410 free (ref);
411 }
412 free (groups);
413 }
414 }
415
416 /* A structure used to pass arguments to idx_analyze_ref. */
417
418 struct ar_data
419 {
420 struct loop *loop; /* Loop of the reference. */
421 gimple stmt; /* Statement of the reference. */
422 tree *step; /* Step of the memory reference. */
423 HOST_WIDE_INT *delta; /* Offset of the memory reference. */
424 };
425
426 /* Analyzes a single INDEX of a memory reference to obtain information
427 described at analyze_ref. Callback for for_each_index. */
428
429 static bool
430 idx_analyze_ref (tree base, tree *index, void *data)
431 {
432 struct ar_data *ar_data = (struct ar_data *) data;
433 tree ibase, step, stepsize;
434 HOST_WIDE_INT idelta = 0, imult = 1;
435 affine_iv iv;
436
437 if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
438 *index, &iv, true))
439 return false;
440 ibase = iv.base;
441 step = iv.step;
442
443 if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
444 && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
445 {
446 idelta = int_cst_value (TREE_OPERAND (ibase, 1));
447 ibase = TREE_OPERAND (ibase, 0);
448 }
449 if (cst_and_fits_in_hwi (ibase))
450 {
451 idelta += int_cst_value (ibase);
452 ibase = build_int_cst (TREE_TYPE (ibase), 0);
453 }
454
455 if (TREE_CODE (base) == ARRAY_REF)
456 {
457 stepsize = array_ref_element_size (base);
458 if (!cst_and_fits_in_hwi (stepsize))
459 return false;
460 imult = int_cst_value (stepsize);
461 step = fold_build2 (MULT_EXPR, sizetype,
462 fold_convert (sizetype, step),
463 fold_convert (sizetype, stepsize));
464 idelta *= imult;
465 }
466
467 if (*ar_data->step == NULL_TREE)
468 *ar_data->step = step;
469 else
470 *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
471 fold_convert (sizetype, *ar_data->step),
472 fold_convert (sizetype, step));
473 *ar_data->delta += idelta;
474 *index = ibase;
475
476 return true;
477 }
478
479 /* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and
480 STEP are integer constants and iter is number of iterations of LOOP. The
481 reference occurs in statement STMT. Strips nonaddressable component
482 references from REF_P. */
483
484 static bool
485 analyze_ref (struct loop *loop, tree *ref_p, tree *base,
486 tree *step, HOST_WIDE_INT *delta,
487 gimple stmt)
488 {
489 struct ar_data ar_data;
490 tree off;
491 HOST_WIDE_INT bit_offset;
492 tree ref = *ref_p;
493
494 *step = NULL_TREE;
495 *delta = 0;
496
497 /* First strip off the component references. Ignore bitfields.
498 Also strip off the real and imagine parts of a complex, so that
499 they can have the same base. */
500 if (TREE_CODE (ref) == REALPART_EXPR
501 || TREE_CODE (ref) == IMAGPART_EXPR
502 || (TREE_CODE (ref) == COMPONENT_REF
503 && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1))))
504 {
505 if (TREE_CODE (ref) == IMAGPART_EXPR)
506 *delta += int_size_in_bytes (TREE_TYPE (ref));
507 ref = TREE_OPERAND (ref, 0);
508 }
509
510 *ref_p = ref;
511
512 for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
513 {
514 off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
515 bit_offset = TREE_INT_CST_LOW (off);
516 gcc_assert (bit_offset % BITS_PER_UNIT == 0);
517
518 *delta += bit_offset / BITS_PER_UNIT;
519 }
520
521 *base = unshare_expr (ref);
522 ar_data.loop = loop;
523 ar_data.stmt = stmt;
524 ar_data.step = step;
525 ar_data.delta = delta;
526 return for_each_index (base, idx_analyze_ref, &ar_data);
527 }
528
529 /* Record a memory reference REF to the list REFS. The reference occurs in
530 LOOP in statement STMT and it is write if WRITE_P. Returns true if the
531 reference was recorded, false otherwise. */
532
533 static bool
534 gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
535 tree ref, bool write_p, gimple stmt)
536 {
537 tree base, step;
538 HOST_WIDE_INT delta;
539 struct mem_ref_group *agrp;
540
541 if (get_base_address (ref) == NULL)
542 return false;
543
544 if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
545 return false;
546 /* If analyze_ref fails the default is a NULL_TREE. We can stop here. */
547 if (step == NULL_TREE)
548 return false;
549
550 /* Stop if the address of BASE could not be taken. */
551 if (may_be_nonaddressable_p (base))
552 return false;
553
554 /* Limit non-constant step prefetching only to the innermost loops and
555 only when the step is loop invariant in the entire loop nest. */
556 if (!cst_and_fits_in_hwi (step))
557 {
558 if (loop->inner != NULL)
559 {
560 if (dump_file && (dump_flags & TDF_DETAILS))
561 {
562 fprintf (dump_file, "Memory expression %p\n",(void *) ref );
563 print_generic_expr (dump_file, ref, TDF_TREE);
564 fprintf (dump_file,":");
565 dump_mem_details (dump_file, base, step, delta, write_p);
566 fprintf (dump_file,
567 "Ignoring %p, non-constant step prefetching is "
568 "limited to inner most loops \n",
569 (void *) ref);
570 }
571 return false;
572 }
573 else
574 {
575 if (!expr_invariant_in_loop_p (loop_outermost (loop), step))
576 {
577 if (dump_file && (dump_flags & TDF_DETAILS))
578 {
579 fprintf (dump_file, "Memory expression %p\n",(void *) ref );
580 print_generic_expr (dump_file, ref, TDF_TREE);
581 fprintf (dump_file,":");
582 dump_mem_details (dump_file, base, step, delta, write_p);
583 fprintf (dump_file,
584 "Not prefetching, ignoring %p due to "
585 "loop variant step\n",
586 (void *) ref);
587 }
588 return false;
589 }
590 }
591 }
592
593 /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
594 are integer constants. */
595 agrp = find_or_create_group (refs, base, step);
596 record_ref (agrp, stmt, ref, delta, write_p);
597
598 return true;
599 }
600
601 /* Record the suitable memory references in LOOP. NO_OTHER_REFS is set to
602 true if there are no other memory references inside the loop. */
603
604 static struct mem_ref_group *
605 gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
606 {
607 basic_block *body = get_loop_body_in_dom_order (loop);
608 basic_block bb;
609 unsigned i;
610 gimple_stmt_iterator bsi;
611 gimple stmt;
612 tree lhs, rhs;
613 struct mem_ref_group *refs = NULL;
614
615 *no_other_refs = true;
616 *ref_count = 0;
617
618 /* Scan the loop body in order, so that the former references precede the
619 later ones. */
620 for (i = 0; i < loop->num_nodes; i++)
621 {
622 bb = body[i];
623 if (bb->loop_father != loop)
624 continue;
625
626 for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
627 {
628 stmt = gsi_stmt (bsi);
629
630 if (gimple_code (stmt) != GIMPLE_ASSIGN)
631 {
632 if (gimple_vuse (stmt)
633 || (is_gimple_call (stmt)
634 && !(gimple_call_flags (stmt) & ECF_CONST)))
635 *no_other_refs = false;
636 continue;
637 }
638
639 lhs = gimple_assign_lhs (stmt);
640 rhs = gimple_assign_rhs1 (stmt);
641
642 if (REFERENCE_CLASS_P (rhs))
643 {
644 *no_other_refs &= gather_memory_references_ref (loop, &refs,
645 rhs, false, stmt);
646 *ref_count += 1;
647 }
648 if (REFERENCE_CLASS_P (lhs))
649 {
650 *no_other_refs &= gather_memory_references_ref (loop, &refs,
651 lhs, true, stmt);
652 *ref_count += 1;
653 }
654 }
655 }
656 free (body);
657
658 return refs;
659 }
660
661 /* Prune the prefetch candidate REF using the self-reuse. */
662
663 static void
664 prune_ref_by_self_reuse (struct mem_ref *ref)
665 {
666 HOST_WIDE_INT step;
667 bool backward;
668
669 /* If the step size is non constant, we cannot calculate prefetch_mod. */
670 if (!cst_and_fits_in_hwi (ref->group->step))
671 return;
672
673 step = int_cst_value (ref->group->step);
674
675 backward = step < 0;
676
677 if (step == 0)
678 {
679 /* Prefetch references to invariant address just once. */
680 ref->prefetch_before = 1;
681 return;
682 }
683
684 if (backward)
685 step = -step;
686
687 if (step > PREFETCH_BLOCK)
688 return;
689
690 if ((backward && HAVE_BACKWARD_PREFETCH)
691 || (!backward && HAVE_FORWARD_PREFETCH))
692 {
693 ref->prefetch_before = 1;
694 return;
695 }
696
697 ref->prefetch_mod = PREFETCH_BLOCK / step;
698 }
699
700 /* Divides X by BY, rounding down. */
701
702 static HOST_WIDE_INT
703 ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
704 {
705 gcc_assert (by > 0);
706
707 if (x >= 0)
708 return x / by;
709 else
710 return (x + by - 1) / by;
711 }
712
713 /* Given a CACHE_LINE_SIZE and two inductive memory references
714 with a common STEP greater than CACHE_LINE_SIZE and an address
715 difference DELTA, compute the probability that they will fall
716 in different cache lines. Return true if the computed miss rate
717 is not greater than the ACCEPTABLE_MISS_RATE. DISTINCT_ITERS is the
718 number of distinct iterations after which the pattern repeats itself.
719 ALIGN_UNIT is the unit of alignment in bytes. */
720
721 static bool
722 is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size,
723 HOST_WIDE_INT step, HOST_WIDE_INT delta,
724 unsigned HOST_WIDE_INT distinct_iters,
725 int align_unit)
726 {
727 unsigned align, iter;
728 int total_positions, miss_positions, max_allowed_miss_positions;
729 int address1, address2, cache_line1, cache_line2;
730
731 /* It always misses if delta is greater than or equal to the cache
732 line size. */
733 if (delta >= (HOST_WIDE_INT) cache_line_size)
734 return false;
735
736 miss_positions = 0;
737 total_positions = (cache_line_size / align_unit) * distinct_iters;
738 max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000;
739
740 /* Iterate through all possible alignments of the first
741 memory reference within its cache line. */
742 for (align = 0; align < cache_line_size; align += align_unit)
743
744 /* Iterate through all distinct iterations. */
745 for (iter = 0; iter < distinct_iters; iter++)
746 {
747 address1 = align + step * iter;
748 address2 = address1 + delta;
749 cache_line1 = address1 / cache_line_size;
750 cache_line2 = address2 / cache_line_size;
751 if (cache_line1 != cache_line2)
752 {
753 miss_positions += 1;
754 if (miss_positions > max_allowed_miss_positions)
755 return false;
756 }
757 }
758 return true;
759 }
760
761 /* Prune the prefetch candidate REF using the reuse with BY.
762 If BY_IS_BEFORE is true, BY is before REF in the loop. */
763
764 static void
765 prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
766 bool by_is_before)
767 {
768 HOST_WIDE_INT step;
769 bool backward;
770 HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
771 HOST_WIDE_INT delta = delta_b - delta_r;
772 HOST_WIDE_INT hit_from;
773 unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
774 HOST_WIDE_INT reduced_step;
775 unsigned HOST_WIDE_INT reduced_prefetch_block;
776 tree ref_type;
777 int align_unit;
778
779 /* If the step is non constant we cannot calculate prefetch_before. */
780 if (!cst_and_fits_in_hwi (ref->group->step)) {
781 return;
782 }
783
784 step = int_cst_value (ref->group->step);
785
786 backward = step < 0;
787
788
789 if (delta == 0)
790 {
791 /* If the references has the same address, only prefetch the
792 former. */
793 if (by_is_before)
794 ref->prefetch_before = 0;
795
796 return;
797 }
798
799 if (!step)
800 {
801 /* If the reference addresses are invariant and fall into the
802 same cache line, prefetch just the first one. */
803 if (!by_is_before)
804 return;
805
806 if (ddown (ref->delta, PREFETCH_BLOCK)
807 != ddown (by->delta, PREFETCH_BLOCK))
808 return;
809
810 ref->prefetch_before = 0;
811 return;
812 }
813
814 /* Only prune the reference that is behind in the array. */
815 if (backward)
816 {
817 if (delta > 0)
818 return;
819
820 /* Transform the data so that we may assume that the accesses
821 are forward. */
822 delta = - delta;
823 step = -step;
824 delta_r = PREFETCH_BLOCK - 1 - delta_r;
825 delta_b = PREFETCH_BLOCK - 1 - delta_b;
826 }
827 else
828 {
829 if (delta < 0)
830 return;
831 }
832
833 /* Check whether the two references are likely to hit the same cache
834 line, and how distant the iterations in that it occurs are from
835 each other. */
836
837 if (step <= PREFETCH_BLOCK)
838 {
839 /* The accesses are sure to meet. Let us check when. */
840 hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
841 prefetch_before = (hit_from - delta_r + step - 1) / step;
842
843 /* Do not reduce prefetch_before if we meet beyond cache size. */
844 if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step))
845 prefetch_before = PREFETCH_ALL;
846 if (prefetch_before < ref->prefetch_before)
847 ref->prefetch_before = prefetch_before;
848
849 return;
850 }
851
852 /* A more complicated case with step > prefetch_block. First reduce
853 the ratio between the step and the cache line size to its simplest
854 terms. The resulting denominator will then represent the number of
855 distinct iterations after which each address will go back to its
856 initial location within the cache line. This computation assumes
857 that PREFETCH_BLOCK is a power of two. */
858 prefetch_block = PREFETCH_BLOCK;
859 reduced_prefetch_block = prefetch_block;
860 reduced_step = step;
861 while ((reduced_step & 1) == 0
862 && reduced_prefetch_block > 1)
863 {
864 reduced_step >>= 1;
865 reduced_prefetch_block >>= 1;
866 }
867
868 prefetch_before = delta / step;
869 delta %= step;
870 ref_type = TREE_TYPE (ref->mem);
871 align_unit = TYPE_ALIGN (ref_type) / 8;
872 if (is_miss_rate_acceptable (prefetch_block, step, delta,
873 reduced_prefetch_block, align_unit))
874 {
875 /* Do not reduce prefetch_before if we meet beyond cache size. */
876 if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
877 prefetch_before = PREFETCH_ALL;
878 if (prefetch_before < ref->prefetch_before)
879 ref->prefetch_before = prefetch_before;
880
881 return;
882 }
883
884 /* Try also the following iteration. */
885 prefetch_before++;
886 delta = step - delta;
887 if (is_miss_rate_acceptable (prefetch_block, step, delta,
888 reduced_prefetch_block, align_unit))
889 {
890 if (prefetch_before < ref->prefetch_before)
891 ref->prefetch_before = prefetch_before;
892
893 return;
894 }
895
896 /* The ref probably does not reuse by. */
897 return;
898 }
899
900 /* Prune the prefetch candidate REF using the reuses with other references
901 in REFS. */
902
903 static void
904 prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
905 {
906 struct mem_ref *prune_by;
907 bool before = true;
908
909 prune_ref_by_self_reuse (ref);
910
911 for (prune_by = refs; prune_by; prune_by = prune_by->next)
912 {
913 if (prune_by == ref)
914 {
915 before = false;
916 continue;
917 }
918
919 if (!WRITE_CAN_USE_READ_PREFETCH
920 && ref->write_p
921 && !prune_by->write_p)
922 continue;
923 if (!READ_CAN_USE_WRITE_PREFETCH
924 && !ref->write_p
925 && prune_by->write_p)
926 continue;
927
928 prune_ref_by_group_reuse (ref, prune_by, before);
929 }
930 }
931
932 /* Prune the prefetch candidates in GROUP using the reuse analysis. */
933
934 static void
935 prune_group_by_reuse (struct mem_ref_group *group)
936 {
937 struct mem_ref *ref_pruned;
938
939 for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
940 {
941 prune_ref_by_reuse (ref_pruned, group->refs);
942
943 if (dump_file && (dump_flags & TDF_DETAILS))
944 {
945 fprintf (dump_file, "Reference %p:", (void *) ref_pruned);
946
947 if (ref_pruned->prefetch_before == PREFETCH_ALL
948 && ref_pruned->prefetch_mod == 1)
949 fprintf (dump_file, " no restrictions");
950 else if (ref_pruned->prefetch_before == 0)
951 fprintf (dump_file, " do not prefetch");
952 else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
953 fprintf (dump_file, " prefetch once");
954 else
955 {
956 if (ref_pruned->prefetch_before != PREFETCH_ALL)
957 {
958 fprintf (dump_file, " prefetch before ");
959 fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
960 ref_pruned->prefetch_before);
961 }
962 if (ref_pruned->prefetch_mod != 1)
963 {
964 fprintf (dump_file, " prefetch mod ");
965 fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
966 ref_pruned->prefetch_mod);
967 }
968 }
969 fprintf (dump_file, "\n");
970 }
971 }
972 }
973
974 /* Prune the list of prefetch candidates GROUPS using the reuse analysis. */
975
976 static void
977 prune_by_reuse (struct mem_ref_group *groups)
978 {
979 for (; groups; groups = groups->next)
980 prune_group_by_reuse (groups);
981 }
982
983 /* Returns true if we should issue prefetch for REF. */
984
985 static bool
986 should_issue_prefetch_p (struct mem_ref *ref)
987 {
988 /* For now do not issue prefetches for only first few of the
989 iterations. */
990 if (ref->prefetch_before != PREFETCH_ALL)
991 {
992 if (dump_file && (dump_flags & TDF_DETAILS))
993 fprintf (dump_file, "Ignoring %p due to prefetch_before\n",
994 (void *) ref);
995 return false;
996 }
997
998 /* Do not prefetch nontemporal stores. */
999 if (ref->storent_p)
1000 {
1001 if (dump_file && (dump_flags & TDF_DETAILS))
1002 fprintf (dump_file, "Ignoring nontemporal store %p\n", (void *) ref);
1003 return false;
1004 }
1005
1006 return true;
1007 }
1008
1009 /* Decide which of the prefetch candidates in GROUPS to prefetch.
1010 AHEAD is the number of iterations to prefetch ahead (which corresponds
1011 to the number of simultaneous instances of one prefetch running at a
1012 time). UNROLL_FACTOR is the factor by that the loop is going to be
1013 unrolled. Returns true if there is anything to prefetch. */
1014
1015 static bool
1016 schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
1017 unsigned ahead)
1018 {
1019 unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
1020 unsigned slots_per_prefetch;
1021 struct mem_ref *ref;
1022 bool any = false;
1023
1024 /* At most SIMULTANEOUS_PREFETCHES should be running at the same time. */
1025 remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
1026
1027 /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
1028 AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration,
1029 it will need a prefetch slot. */
1030 slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
1031 if (dump_file && (dump_flags & TDF_DETAILS))
1032 fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
1033 slots_per_prefetch);
1034
1035 /* For now we just take memory references one by one and issue
1036 prefetches for as many as possible. The groups are sorted
1037 starting with the largest step, since the references with
1038 large step are more likely to cause many cache misses. */
1039
1040 for (; groups; groups = groups->next)
1041 for (ref = groups->refs; ref; ref = ref->next)
1042 {
1043 if (!should_issue_prefetch_p (ref))
1044 continue;
1045
1046 /* The loop is far from being sufficiently unrolled for this
1047 prefetch. Do not generate prefetch to avoid many redudant
1048 prefetches. */
1049 if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
1050 continue;
1051
1052 /* If we need to prefetch the reference each PREFETCH_MOD iterations,
1053 and we unroll the loop UNROLL_FACTOR times, we need to insert
1054 ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
1055 iteration. */
1056 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1057 / ref->prefetch_mod);
1058 prefetch_slots = n_prefetches * slots_per_prefetch;
1059
1060 /* If more than half of the prefetches would be lost anyway, do not
1061 issue the prefetch. */
1062 if (2 * remaining_prefetch_slots < prefetch_slots)
1063 continue;
1064
1065 ref->issue_prefetch_p = true;
1066
1067 if (remaining_prefetch_slots <= prefetch_slots)
1068 return true;
1069 remaining_prefetch_slots -= prefetch_slots;
1070 any = true;
1071 }
1072
1073 return any;
1074 }
1075
1076 /* Return TRUE if no prefetch is going to be generated in the given
1077 GROUPS. */
1078
1079 static bool
1080 nothing_to_prefetch_p (struct mem_ref_group *groups)
1081 {
1082 struct mem_ref *ref;
1083
1084 for (; groups; groups = groups->next)
1085 for (ref = groups->refs; ref; ref = ref->next)
1086 if (should_issue_prefetch_p (ref))
1087 return false;
1088
1089 return true;
1090 }
1091
1092 /* Estimate the number of prefetches in the given GROUPS.
1093 UNROLL_FACTOR is the factor by which LOOP was unrolled. */
1094
1095 static int
1096 estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor)
1097 {
1098 struct mem_ref *ref;
1099 unsigned n_prefetches;
1100 int prefetch_count = 0;
1101
1102 for (; groups; groups = groups->next)
1103 for (ref = groups->refs; ref; ref = ref->next)
1104 if (should_issue_prefetch_p (ref))
1105 {
1106 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1107 / ref->prefetch_mod);
1108 prefetch_count += n_prefetches;
1109 }
1110
1111 return prefetch_count;
1112 }
1113
1114 /* Issue prefetches for the reference REF into loop as decided before.
1115 HEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR
1116 is the factor by which LOOP was unrolled. */
1117
1118 static void
1119 issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
1120 {
1121 HOST_WIDE_INT delta;
1122 tree addr, addr_base, write_p, local, forward;
1123 gimple prefetch;
1124 gimple_stmt_iterator bsi;
1125 unsigned n_prefetches, ap;
1126 bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
1127
1128 if (dump_file && (dump_flags & TDF_DETAILS))
1129 fprintf (dump_file, "Issued%s prefetch for %p.\n",
1130 nontemporal ? " nontemporal" : "",
1131 (void *) ref);
1132
1133 bsi = gsi_for_stmt (ref->stmt);
1134
1135 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1136 / ref->prefetch_mod);
1137 addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
1138 addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
1139 true, NULL, true, GSI_SAME_STMT);
1140 write_p = ref->write_p ? integer_one_node : integer_zero_node;
1141 local = nontemporal ? integer_zero_node : integer_three_node;
1142
1143 for (ap = 0; ap < n_prefetches; ap++)
1144 {
1145 if (cst_and_fits_in_hwi (ref->group->step))
1146 {
1147 /* Determine the address to prefetch. */
1148 delta = (ahead + ap * ref->prefetch_mod) *
1149 int_cst_value (ref->group->step);
1150 addr = fold_build_pointer_plus_hwi (addr_base, delta);
1151 addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL,
1152 true, GSI_SAME_STMT);
1153 }
1154 else
1155 {
1156 /* The step size is non-constant but loop-invariant. We use the
1157 heuristic to simply prefetch ahead iterations ahead. */
1158 forward = fold_build2 (MULT_EXPR, sizetype,
1159 fold_convert (sizetype, ref->group->step),
1160 fold_convert (sizetype, size_int (ahead)));
1161 addr = fold_build_pointer_plus (addr_base, forward);
1162 addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1163 NULL, true, GSI_SAME_STMT);
1164 }
1165 /* Create the prefetch instruction. */
1166 prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH),
1167 3, addr, write_p, local);
1168 gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
1169 }
1170 }
1171
1172 /* Issue prefetches for the references in GROUPS into loop as decided before.
1173 HEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR is the
1174 factor by that LOOP was unrolled. */
1175
1176 static void
1177 issue_prefetches (struct mem_ref_group *groups,
1178 unsigned unroll_factor, unsigned ahead)
1179 {
1180 struct mem_ref *ref;
1181
1182 for (; groups; groups = groups->next)
1183 for (ref = groups->refs; ref; ref = ref->next)
1184 if (ref->issue_prefetch_p)
1185 issue_prefetch_ref (ref, unroll_factor, ahead);
1186 }
1187
1188 /* Returns true if REF is a memory write for that a nontemporal store insn
1189 can be used. */
1190
1191 static bool
1192 nontemporal_store_p (struct mem_ref *ref)
1193 {
1194 enum machine_mode mode;
1195 enum insn_code code;
1196
1197 /* REF must be a write that is not reused. We require it to be independent
1198 on all other memory references in the loop, as the nontemporal stores may
1199 be reordered with respect to other memory references. */
1200 if (!ref->write_p
1201 || !ref->independent_p
1202 || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
1203 return false;
1204
1205 /* Check that we have the storent instruction for the mode. */
1206 mode = TYPE_MODE (TREE_TYPE (ref->mem));
1207 if (mode == BLKmode)
1208 return false;
1209
1210 code = optab_handler (storent_optab, mode);
1211 return code != CODE_FOR_nothing;
1212 }
1213
1214 /* If REF is a nontemporal store, we mark the corresponding modify statement
1215 and return true. Otherwise, we return false. */
1216
1217 static bool
1218 mark_nontemporal_store (struct mem_ref *ref)
1219 {
1220 if (!nontemporal_store_p (ref))
1221 return false;
1222
1223 if (dump_file && (dump_flags & TDF_DETAILS))
1224 fprintf (dump_file, "Marked reference %p as a nontemporal store.\n",
1225 (void *) ref);
1226
1227 gimple_assign_set_nontemporal_move (ref->stmt, true);
1228 ref->storent_p = true;
1229
1230 return true;
1231 }
1232
1233 /* Issue a memory fence instruction after LOOP. */
1234
1235 static void
1236 emit_mfence_after_loop (struct loop *loop)
1237 {
1238 vec<edge> exits = get_loop_exit_edges (loop);
1239 edge exit;
1240 gimple call;
1241 gimple_stmt_iterator bsi;
1242 unsigned i;
1243
1244 FOR_EACH_VEC_ELT (exits, i, exit)
1245 {
1246 call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
1247
1248 if (!single_pred_p (exit->dest)
1249 /* If possible, we prefer not to insert the fence on other paths
1250 in cfg. */
1251 && !(exit->flags & EDGE_ABNORMAL))
1252 split_loop_exit_edge (exit);
1253 bsi = gsi_after_labels (exit->dest);
1254
1255 gsi_insert_before (&bsi, call, GSI_NEW_STMT);
1256 }
1257
1258 exits.release ();
1259 update_ssa (TODO_update_ssa_only_virtuals);
1260 }
1261
1262 /* Returns true if we can use storent in loop, false otherwise. */
1263
1264 static bool
1265 may_use_storent_in_loop_p (struct loop *loop)
1266 {
1267 bool ret = true;
1268
1269 if (loop->inner != NULL)
1270 return false;
1271
1272 /* If we must issue a mfence insn after using storent, check that there
1273 is a suitable place for it at each of the loop exits. */
1274 if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
1275 {
1276 vec<edge> exits = get_loop_exit_edges (loop);
1277 unsigned i;
1278 edge exit;
1279
1280 FOR_EACH_VEC_ELT (exits, i, exit)
1281 if ((exit->flags & EDGE_ABNORMAL)
1282 && exit->dest == EXIT_BLOCK_PTR)
1283 ret = false;
1284
1285 exits.release ();
1286 }
1287
1288 return ret;
1289 }
1290
1291 /* Marks nontemporal stores in LOOP. GROUPS contains the description of memory
1292 references in the loop. */
1293
1294 static void
1295 mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
1296 {
1297 struct mem_ref *ref;
1298 bool any = false;
1299
1300 if (!may_use_storent_in_loop_p (loop))
1301 return;
1302
1303 for (; groups; groups = groups->next)
1304 for (ref = groups->refs; ref; ref = ref->next)
1305 any |= mark_nontemporal_store (ref);
1306
1307 if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
1308 emit_mfence_after_loop (loop);
1309 }
1310
1311 /* Determines whether we can profitably unroll LOOP FACTOR times, and if
1312 this is the case, fill in DESC by the description of number of
1313 iterations. */
1314
1315 static bool
1316 should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
1317 unsigned factor)
1318 {
1319 if (!can_unroll_loop_p (loop, factor, desc))
1320 return false;
1321
1322 /* We only consider loops without control flow for unrolling. This is not
1323 a hard restriction -- tree_unroll_loop works with arbitrary loops
1324 as well; but the unrolling/prefetching is usually more profitable for
1325 loops consisting of a single basic block, and we want to limit the
1326 code growth. */
1327 if (loop->num_nodes > 2)
1328 return false;
1329
1330 return true;
1331 }
1332
1333 /* Determine the coefficient by that unroll LOOP, from the information
1334 contained in the list of memory references REFS. Description of
1335 umber of iterations of LOOP is stored to DESC. NINSNS is the number of
1336 insns of the LOOP. EST_NITER is the estimated number of iterations of
1337 the loop, or -1 if no estimate is available. */
1338
1339 static unsigned
1340 determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
1341 unsigned ninsns, struct tree_niter_desc *desc,
1342 HOST_WIDE_INT est_niter)
1343 {
1344 unsigned upper_bound;
1345 unsigned nfactor, factor, mod_constraint;
1346 struct mem_ref_group *agp;
1347 struct mem_ref *ref;
1348
1349 /* First check whether the loop is not too large to unroll. We ignore
1350 PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1351 from unrolling them enough to make exactly one cache line covered by each
1352 iteration. Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1353 us from unrolling the loops too many times in cases where we only expect
1354 gains from better scheduling and decreasing loop overhead, which is not
1355 the case here. */
1356 upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
1357
1358 /* If we unrolled the loop more times than it iterates, the unrolled version
1359 of the loop would be never entered. */
1360 if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
1361 upper_bound = est_niter;
1362
1363 if (upper_bound <= 1)
1364 return 1;
1365
1366 /* Choose the factor so that we may prefetch each cache just once,
1367 but bound the unrolling by UPPER_BOUND. */
1368 factor = 1;
1369 for (agp = refs; agp; agp = agp->next)
1370 for (ref = agp->refs; ref; ref = ref->next)
1371 if (should_issue_prefetch_p (ref))
1372 {
1373 mod_constraint = ref->prefetch_mod;
1374 nfactor = least_common_multiple (mod_constraint, factor);
1375 if (nfactor <= upper_bound)
1376 factor = nfactor;
1377 }
1378
1379 if (!should_unroll_loop_p (loop, desc, factor))
1380 return 1;
1381
1382 return factor;
1383 }
1384
1385 /* Returns the total volume of the memory references REFS, taking into account
1386 reuses in the innermost loop and cache line size. TODO -- we should also
1387 take into account reuses across the iterations of the loops in the loop
1388 nest. */
1389
1390 static unsigned
1391 volume_of_references (struct mem_ref_group *refs)
1392 {
1393 unsigned volume = 0;
1394 struct mem_ref_group *gr;
1395 struct mem_ref *ref;
1396
1397 for (gr = refs; gr; gr = gr->next)
1398 for (ref = gr->refs; ref; ref = ref->next)
1399 {
1400 /* Almost always reuses another value? */
1401 if (ref->prefetch_before != PREFETCH_ALL)
1402 continue;
1403
1404 /* If several iterations access the same cache line, use the size of
1405 the line divided by this number. Otherwise, a cache line is
1406 accessed in each iteration. TODO -- in the latter case, we should
1407 take the size of the reference into account, rounding it up on cache
1408 line size multiple. */
1409 volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
1410 }
1411 return volume;
1412 }
1413
1414 /* Returns the volume of memory references accessed across VEC iterations of
1415 loops, whose sizes are described in the LOOP_SIZES array. N is the number
1416 of the loops in the nest (length of VEC and LOOP_SIZES vectors). */
1417
1418 static unsigned
1419 volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
1420 {
1421 unsigned i;
1422
1423 for (i = 0; i < n; i++)
1424 if (vec[i] != 0)
1425 break;
1426
1427 if (i == n)
1428 return 0;
1429
1430 gcc_assert (vec[i] > 0);
1431
1432 /* We ignore the parts of the distance vector in subloops, since usually
1433 the numbers of iterations are much smaller. */
1434 return loop_sizes[i] * vec[i];
1435 }
1436
1437 /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDE
1438 at the position corresponding to the loop of the step. N is the depth
1439 of the considered loop nest, and, LOOP is its innermost loop. */
1440
1441 static void
1442 add_subscript_strides (tree access_fn, unsigned stride,
1443 HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
1444 {
1445 struct loop *aloop;
1446 tree step;
1447 HOST_WIDE_INT astep;
1448 unsigned min_depth = loop_depth (loop) - n;
1449
1450 while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
1451 {
1452 aloop = get_chrec_loop (access_fn);
1453 step = CHREC_RIGHT (access_fn);
1454 access_fn = CHREC_LEFT (access_fn);
1455
1456 if ((unsigned) loop_depth (aloop) <= min_depth)
1457 continue;
1458
1459 if (host_integerp (step, 0))
1460 astep = tree_low_cst (step, 0);
1461 else
1462 astep = L1_CACHE_LINE_SIZE;
1463
1464 strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
1465
1466 }
1467 }
1468
1469 /* Returns the volume of memory references accessed between two consecutive
1470 self-reuses of the reference DR. We consider the subscripts of DR in N
1471 loops, and LOOP_SIZES contains the volumes of accesses in each of the
1472 loops. LOOP is the innermost loop of the current loop nest. */
1473
1474 static unsigned
1475 self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1476 struct loop *loop)
1477 {
1478 tree stride, access_fn;
1479 HOST_WIDE_INT *strides, astride;
1480 vec<tree> access_fns;
1481 tree ref = DR_REF (dr);
1482 unsigned i, ret = ~0u;
1483
1484 /* In the following example:
1485
1486 for (i = 0; i < N; i++)
1487 for (j = 0; j < N; j++)
1488 use (a[j][i]);
1489 the same cache line is accessed each N steps (except if the change from
1490 i to i + 1 crosses the boundary of the cache line). Thus, for self-reuse,
1491 we cannot rely purely on the results of the data dependence analysis.
1492
1493 Instead, we compute the stride of the reference in each loop, and consider
1494 the innermost loop in that the stride is less than cache size. */
1495
1496 strides = XCNEWVEC (HOST_WIDE_INT, n);
1497 access_fns = DR_ACCESS_FNS (dr);
1498
1499 FOR_EACH_VEC_ELT (access_fns, i, access_fn)
1500 {
1501 /* Keep track of the reference corresponding to the subscript, so that we
1502 know its stride. */
1503 while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1504 ref = TREE_OPERAND (ref, 0);
1505
1506 if (TREE_CODE (ref) == ARRAY_REF)
1507 {
1508 stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1509 if (host_integerp (stride, 1))
1510 astride = tree_low_cst (stride, 1);
1511 else
1512 astride = L1_CACHE_LINE_SIZE;
1513
1514 ref = TREE_OPERAND (ref, 0);
1515 }
1516 else
1517 astride = 1;
1518
1519 add_subscript_strides (access_fn, astride, strides, n, loop);
1520 }
1521
1522 for (i = n; i-- > 0; )
1523 {
1524 unsigned HOST_WIDE_INT s;
1525
1526 s = strides[i] < 0 ? -strides[i] : strides[i];
1527
1528 if (s < (unsigned) L1_CACHE_LINE_SIZE
1529 && (loop_sizes[i]
1530 > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
1531 {
1532 ret = loop_sizes[i];
1533 break;
1534 }
1535 }
1536
1537 free (strides);
1538 return ret;
1539 }
1540
1541 /* Determines the distance till the first reuse of each reference in REFS
1542 in the loop nest of LOOP. NO_OTHER_REFS is true if there are no other
1543 memory references in the loop. Return false if the analysis fails. */
1544
1545 static bool
1546 determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
1547 bool no_other_refs)
1548 {
1549 struct loop *nest, *aloop;
1550 vec<data_reference_p> datarefs = vNULL;
1551 vec<ddr_p> dependences = vNULL;
1552 struct mem_ref_group *gr;
1553 struct mem_ref *ref, *refb;
1554 vec<loop_p> vloops = vNULL;
1555 unsigned *loop_data_size;
1556 unsigned i, j, n;
1557 unsigned volume, dist, adist;
1558 HOST_WIDE_INT vol;
1559 data_reference_p dr;
1560 ddr_p dep;
1561
1562 if (loop->inner)
1563 return true;
1564
1565 /* Find the outermost loop of the loop nest of loop (we require that
1566 there are no sibling loops inside the nest). */
1567 nest = loop;
1568 while (1)
1569 {
1570 aloop = loop_outer (nest);
1571
1572 if (aloop == current_loops->tree_root
1573 || aloop->inner->next)
1574 break;
1575
1576 nest = aloop;
1577 }
1578
1579 /* For each loop, determine the amount of data accessed in each iteration.
1580 We use this to estimate whether the reference is evicted from the
1581 cache before its reuse. */
1582 find_loop_nest (nest, &vloops);
1583 n = vloops.length ();
1584 loop_data_size = XNEWVEC (unsigned, n);
1585 volume = volume_of_references (refs);
1586 i = n;
1587 while (i-- != 0)
1588 {
1589 loop_data_size[i] = volume;
1590 /* Bound the volume by the L2 cache size, since above this bound,
1591 all dependence distances are equivalent. */
1592 if (volume > L2_CACHE_SIZE_BYTES)
1593 continue;
1594
1595 aloop = vloops[i];
1596 vol = estimated_stmt_executions_int (aloop);
1597 if (vol == -1)
1598 vol = expected_loop_iterations (aloop);
1599 volume *= vol;
1600 }
1601
1602 /* Prepare the references in the form suitable for data dependence
1603 analysis. We ignore unanalyzable data references (the results
1604 are used just as a heuristics to estimate temporality of the
1605 references, hence we do not need to worry about correctness). */
1606 for (gr = refs; gr; gr = gr->next)
1607 for (ref = gr->refs; ref; ref = ref->next)
1608 {
1609 dr = create_data_ref (nest, loop_containing_stmt (ref->stmt),
1610 ref->mem, ref->stmt, !ref->write_p);
1611
1612 if (dr)
1613 {
1614 ref->reuse_distance = volume;
1615 dr->aux = ref;
1616 datarefs.safe_push (dr);
1617 }
1618 else
1619 no_other_refs = false;
1620 }
1621
1622 FOR_EACH_VEC_ELT (datarefs, i, dr)
1623 {
1624 dist = self_reuse_distance (dr, loop_data_size, n, loop);
1625 ref = (struct mem_ref *) dr->aux;
1626 if (ref->reuse_distance > dist)
1627 ref->reuse_distance = dist;
1628
1629 if (no_other_refs)
1630 ref->independent_p = true;
1631 }
1632
1633 if (!compute_all_dependences (datarefs, &dependences, vloops, true))
1634 return false;
1635
1636 FOR_EACH_VEC_ELT (dependences, i, dep)
1637 {
1638 if (DDR_ARE_DEPENDENT (dep) == chrec_known)
1639 continue;
1640
1641 ref = (struct mem_ref *) DDR_A (dep)->aux;
1642 refb = (struct mem_ref *) DDR_B (dep)->aux;
1643
1644 if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1645 || DDR_NUM_DIST_VECTS (dep) == 0)
1646 {
1647 /* If the dependence cannot be analyzed, assume that there might be
1648 a reuse. */
1649 dist = 0;
1650
1651 ref->independent_p = false;
1652 refb->independent_p = false;
1653 }
1654 else
1655 {
1656 /* The distance vectors are normalized to be always lexicographically
1657 positive, hence we cannot tell just from them whether DDR_A comes
1658 before DDR_B or vice versa. However, it is not important,
1659 anyway -- if DDR_A is close to DDR_B, then it is either reused in
1660 DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
1661 in cache (and marking it as nontemporal would not affect
1662 anything). */
1663
1664 dist = volume;
1665 for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
1666 {
1667 adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
1668 loop_data_size, n);
1669
1670 /* If this is a dependence in the innermost loop (i.e., the
1671 distances in all superloops are zero) and it is not
1672 the trivial self-dependence with distance zero, record that
1673 the references are not completely independent. */
1674 if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1675 && (ref != refb
1676 || DDR_DIST_VECT (dep, j)[n-1] != 0))
1677 {
1678 ref->independent_p = false;
1679 refb->independent_p = false;
1680 }
1681
1682 /* Ignore accesses closer than
1683 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1684 so that we use nontemporal prefetches e.g. if single memory
1685 location is accessed several times in a single iteration of
1686 the loop. */
1687 if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
1688 continue;
1689
1690 if (adist < dist)
1691 dist = adist;
1692 }
1693 }
1694
1695 if (ref->reuse_distance > dist)
1696 ref->reuse_distance = dist;
1697 if (refb->reuse_distance > dist)
1698 refb->reuse_distance = dist;
1699 }
1700
1701 free_dependence_relations (dependences);
1702 free_data_refs (datarefs);
1703 free (loop_data_size);
1704
1705 if (dump_file && (dump_flags & TDF_DETAILS))
1706 {
1707 fprintf (dump_file, "Reuse distances:\n");
1708 for (gr = refs; gr; gr = gr->next)
1709 for (ref = gr->refs; ref; ref = ref->next)
1710 fprintf (dump_file, " ref %p distance %u\n",
1711 (void *) ref, ref->reuse_distance);
1712 }
1713
1714 return true;
1715 }
1716
1717 /* Determine whether or not the trip count to ahead ratio is too small based
1718 on prefitablility consideration.
1719 AHEAD: the iteration ahead distance,
1720 EST_NITER: the estimated trip count. */
1721
1722 static bool
1723 trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter)
1724 {
1725 /* Assume trip count to ahead ratio is big enough if the trip count could not
1726 be estimated at compile time. */
1727 if (est_niter < 0)
1728 return false;
1729
1730 if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
1731 {
1732 if (dump_file && (dump_flags & TDF_DETAILS))
1733 fprintf (dump_file,
1734 "Not prefetching -- loop estimated to roll only %d times\n",
1735 (int) est_niter);
1736 return true;
1737 }
1738
1739 return false;
1740 }
1741
1742 /* Determine whether or not the number of memory references in the loop is
1743 reasonable based on the profitablity and compilation time considerations.
1744 NINSNS: estimated number of instructions in the loop,
1745 MEM_REF_COUNT: total number of memory references in the loop. */
1746
1747 static bool
1748 mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count)
1749 {
1750 int insn_to_mem_ratio;
1751
1752 if (mem_ref_count == 0)
1753 return false;
1754
1755 /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis
1756 (compute_all_dependences) have high costs based on quadratic complexity.
1757 To avoid huge compilation time, we give up prefetching if mem_ref_count
1758 is too large. */
1759 if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP)
1760 return false;
1761
1762 /* Prefetching improves performance by overlapping cache missing
1763 memory accesses with CPU operations. If the loop does not have
1764 enough CPU operations to overlap with memory operations, prefetching
1765 won't give a significant benefit. One approximate way of checking
1766 this is to require the ratio of instructions to memory references to
1767 be above a certain limit. This approximation works well in practice.
1768 TODO: Implement a more precise computation by estimating the time
1769 for each CPU or memory op in the loop. Time estimates for memory ops
1770 should account for cache misses. */
1771 insn_to_mem_ratio = ninsns / mem_ref_count;
1772
1773 if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
1774 {
1775 if (dump_file && (dump_flags & TDF_DETAILS))
1776 fprintf (dump_file,
1777 "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1778 insn_to_mem_ratio);
1779 return false;
1780 }
1781
1782 return true;
1783 }
1784
1785 /* Determine whether or not the instruction to prefetch ratio in the loop is
1786 too small based on the profitablity consideration.
1787 NINSNS: estimated number of instructions in the loop,
1788 PREFETCH_COUNT: an estimate of the number of prefetches,
1789 UNROLL_FACTOR: the factor to unroll the loop if prefetching. */
1790
1791 static bool
1792 insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count,
1793 unsigned unroll_factor)
1794 {
1795 int insn_to_prefetch_ratio;
1796
1797 /* Prefetching most likely causes performance degradation when the instruction
1798 to prefetch ratio is too small. Too many prefetch instructions in a loop
1799 may reduce the I-cache performance.
1800 (unroll_factor * ninsns) is used to estimate the number of instructions in
1801 the unrolled loop. This implementation is a bit simplistic -- the number
1802 of issued prefetch instructions is also affected by unrolling. So,
1803 prefetch_mod and the unroll factor should be taken into account when
1804 determining prefetch_count. Also, the number of insns of the unrolled
1805 loop will usually be significantly smaller than the number of insns of the
1806 original loop * unroll_factor (at least the induction variable increases
1807 and the exit branches will get eliminated), so it might be better to use
1808 tree_estimate_loop_size + estimated_unrolled_size. */
1809 insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
1810 if (insn_to_prefetch_ratio < MIN_INSN_TO_PREFETCH_RATIO)
1811 {
1812 if (dump_file && (dump_flags & TDF_DETAILS))
1813 fprintf (dump_file,
1814 "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
1815 insn_to_prefetch_ratio);
1816 return true;
1817 }
1818
1819 return false;
1820 }
1821
1822
1823 /* Issue prefetch instructions for array references in LOOP. Returns
1824 true if the LOOP was unrolled. */
1825
1826 static bool
1827 loop_prefetch_arrays (struct loop *loop)
1828 {
1829 struct mem_ref_group *refs;
1830 unsigned ahead, ninsns, time, unroll_factor;
1831 HOST_WIDE_INT est_niter;
1832 struct tree_niter_desc desc;
1833 bool unrolled = false, no_other_refs;
1834 unsigned prefetch_count;
1835 unsigned mem_ref_count;
1836
1837 if (optimize_loop_nest_for_size_p (loop))
1838 {
1839 if (dump_file && (dump_flags & TDF_DETAILS))
1840 fprintf (dump_file, " ignored (cold area)\n");
1841 return false;
1842 }
1843
1844 /* FIXME: the time should be weighted by the probabilities of the blocks in
1845 the loop body. */
1846 time = tree_num_loop_insns (loop, &eni_time_weights);
1847 if (time == 0)
1848 return false;
1849
1850 ahead = (PREFETCH_LATENCY + time - 1) / time;
1851 est_niter = estimated_stmt_executions_int (loop);
1852 if (est_niter == -1)
1853 est_niter = max_stmt_executions_int (loop);
1854
1855 /* Prefetching is not likely to be profitable if the trip count to ahead
1856 ratio is too small. */
1857 if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter))
1858 return false;
1859
1860 ninsns = tree_num_loop_insns (loop, &eni_size_weights);
1861
1862 /* Step 1: gather the memory references. */
1863 refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
1864
1865 /* Give up prefetching if the number of memory references in the
1866 loop is not reasonable based on profitablity and compilation time
1867 considerations. */
1868 if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count))
1869 goto fail;
1870
1871 /* Step 2: estimate the reuse effects. */
1872 prune_by_reuse (refs);
1873
1874 if (nothing_to_prefetch_p (refs))
1875 goto fail;
1876
1877 if (!determine_loop_nest_reuse (loop, refs, no_other_refs))
1878 goto fail;
1879
1880 /* Step 3: determine unroll factor. */
1881 unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
1882 est_niter);
1883
1884 /* Estimate prefetch count for the unrolled loop. */
1885 prefetch_count = estimate_prefetch_count (refs, unroll_factor);
1886 if (prefetch_count == 0)
1887 goto fail;
1888
1889 if (dump_file && (dump_flags & TDF_DETAILS))
1890 fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
1891 HOST_WIDE_INT_PRINT_DEC "\n"
1892 "insn count %d, mem ref count %d, prefetch count %d\n",
1893 ahead, unroll_factor, est_niter,
1894 ninsns, mem_ref_count, prefetch_count);
1895
1896 /* Prefetching is not likely to be profitable if the instruction to prefetch
1897 ratio is too small. */
1898 if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count,
1899 unroll_factor))
1900 goto fail;
1901
1902 mark_nontemporal_stores (loop, refs);
1903
1904 /* Step 4: what to prefetch? */
1905 if (!schedule_prefetches (refs, unroll_factor, ahead))
1906 goto fail;
1907
1908 /* Step 5: unroll the loop. TODO -- peeling of first and last few
1909 iterations so that we do not issue superfluous prefetches. */
1910 if (unroll_factor != 1)
1911 {
1912 tree_unroll_loop (loop, unroll_factor,
1913 single_dom_exit (loop), &desc);
1914 unrolled = true;
1915 }
1916
1917 /* Step 6: issue the prefetches. */
1918 issue_prefetches (refs, unroll_factor, ahead);
1919
1920 fail:
1921 release_mem_refs (refs);
1922 return unrolled;
1923 }
1924
1925 /* Issue prefetch instructions for array references in loops. */
1926
1927 unsigned int
1928 tree_ssa_prefetch_arrays (void)
1929 {
1930 loop_iterator li;
1931 struct loop *loop;
1932 bool unrolled = false;
1933 int todo_flags = 0;
1934
1935 if (!HAVE_prefetch
1936 /* It is possible to ask compiler for say -mtune=i486 -march=pentium4.
1937 -mtune=i486 causes us having PREFETCH_BLOCK 0, since this is part
1938 of processor costs and i486 does not have prefetch, but
1939 -march=pentium4 causes HAVE_prefetch to be true. Ugh. */
1940 || PREFETCH_BLOCK == 0)
1941 return 0;
1942
1943 if (dump_file && (dump_flags & TDF_DETAILS))
1944 {
1945 fprintf (dump_file, "Prefetching parameters:\n");
1946 fprintf (dump_file, " simultaneous prefetches: %d\n",
1947 SIMULTANEOUS_PREFETCHES);
1948 fprintf (dump_file, " prefetch latency: %d\n", PREFETCH_LATENCY);
1949 fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK);
1950 fprintf (dump_file, " L1 cache size: %d lines, %d kB\n",
1951 L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
1952 fprintf (dump_file, " L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
1953 fprintf (dump_file, " L2 cache size: %d kB\n", L2_CACHE_SIZE);
1954 fprintf (dump_file, " min insn-to-prefetch ratio: %d \n",
1955 MIN_INSN_TO_PREFETCH_RATIO);
1956 fprintf (dump_file, " min insn-to-mem ratio: %d \n",
1957 PREFETCH_MIN_INSN_TO_MEM_RATIO);
1958 fprintf (dump_file, "\n");
1959 }
1960
1961 initialize_original_copy_tables ();
1962
1963 if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
1964 {
1965 tree type = build_function_type_list (void_type_node,
1966 const_ptr_type_node, NULL_TREE);
1967 tree decl = add_builtin_function ("__builtin_prefetch", type,
1968 BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
1969 NULL, NULL_TREE);
1970 DECL_IS_NOVOPS (decl) = true;
1971 set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
1972 }
1973
1974 /* We assume that size of cache line is a power of two, so verify this
1975 here. */
1976 gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);
1977
1978 FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
1979 {
1980 if (dump_file && (dump_flags & TDF_DETAILS))
1981 fprintf (dump_file, "Processing loop %d:\n", loop->num);
1982
1983 unrolled |= loop_prefetch_arrays (loop);
1984
1985 if (dump_file && (dump_flags & TDF_DETAILS))
1986 fprintf (dump_file, "\n\n");
1987 }
1988
1989 if (unrolled)
1990 {
1991 scev_reset ();
1992 todo_flags |= TODO_cleanup_cfg;
1993 }
1994
1995 free_original_copy_tables ();
1996 return todo_flags;
1997 }
1998
1999 /* Prefetching. */
2000
2001 static unsigned int
2002 tree_ssa_loop_prefetch (void)
2003 {
2004 if (number_of_loops (cfun) <= 1)
2005 return 0;
2006
2007 return tree_ssa_prefetch_arrays ();
2008 }
2009
2010 static bool
2011 gate_tree_ssa_loop_prefetch (void)
2012 {
2013 return flag_prefetch_loop_arrays > 0;
2014 }
2015
2016 namespace {
2017
2018 const pass_data pass_data_loop_prefetch =
2019 {
2020 GIMPLE_PASS, /* type */
2021 "aprefetch", /* name */
2022 OPTGROUP_LOOP, /* optinfo_flags */
2023 true, /* has_gate */
2024 true, /* has_execute */
2025 TV_TREE_PREFETCH, /* tv_id */
2026 ( PROP_cfg | PROP_ssa ), /* properties_required */
2027 0, /* properties_provided */
2028 0, /* properties_destroyed */
2029 0, /* todo_flags_start */
2030 0, /* todo_flags_finish */
2031 };
2032
2033 class pass_loop_prefetch : public gimple_opt_pass
2034 {
2035 public:
2036 pass_loop_prefetch (gcc::context *ctxt)
2037 : gimple_opt_pass (pass_data_loop_prefetch, ctxt)
2038 {}
2039
2040 /* opt_pass methods: */
2041 bool gate () { return gate_tree_ssa_loop_prefetch (); }
2042 unsigned int execute () { return tree_ssa_loop_prefetch (); }
2043
2044 }; // class pass_loop_prefetch
2045
2046 } // anon namespace
2047
2048 gimple_opt_pass *
2049 make_pass_loop_prefetch (gcc::context *ctxt)
2050 {
2051 return new pass_loop_prefetch (ctxt);
2052 }
2053
2054