1 /* Array prefetching.
2 Copyright (C) 2005 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 2, or (at your option) any
9 later version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING. If not, write to the Free
18 Software Foundation, 59 Temple Place - Suite 330, Boston, MA
19 02111-1307, USA. */
20
21 #include "config.h"
22 #include "system.h"
23 #include "coretypes.h"
24 #include "tm.h"
25 #include "tree.h"
26 #include "rtl.h"
27 #include "tm_p.h"
28 #include "hard-reg-set.h"
29 #include "basic-block.h"
30 #include "output.h"
31 #include "diagnostic.h"
32 #include "tree-flow.h"
33 #include "tree-dump.h"
34 #include "timevar.h"
35 #include "cfgloop.h"
36 #include "varray.h"
37 #include "expr.h"
38 #include "tree-pass.h"
39 #include "ggc.h"
40 #include "insn-config.h"
41 #include "recog.h"
42 #include "hashtab.h"
43 #include "tree-chrec.h"
44 #include "tree-scalar-evolution.h"
45 #include "toplev.h"
46 #include "params.h"
47 #include "langhooks.h"
48 #include "tree-inline.h"
49
50 /* This pass inserts prefetch instructions to optimize cache usage during
51 accesses to arrays in loops. It processes loops sequentially and:
52
53      1) Gathers all memory references in the loop.
54      2) For each of the references it decides when it is profitable to prefetch
55         it.  To do this, we evaluate the reuse among the accesses, and determine
56 two values: PREFETCH_BEFORE (meaning that it only makes sense to do
57 prefetching in the first PREFETCH_BEFORE iterations of the loop) and
58 PREFETCH_MOD (meaning that it only makes sense to prefetch in the
59 iterations of the loop that are zero modulo PREFETCH_MOD). For example
60 (assuming cache line size is 64 bytes, char has size 1 byte and there
61 is no hardware sequential prefetch):
62
63 char *a;
64 for (i = 0; i < max; i++)
65 {
66 a[255] = ...; (0)
67 a[i] = ...; (1)
68 a[i + 64] = ...; (2)
69 a[16*i] = ...; (3)
70 a[187*i] = ...; (4)
71 a[187*i + 50] = ...; (5)
72 }
73
74 (0) obviously has PREFETCH_BEFORE 1
75 (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
76 location 64 iterations before it, and PREFETCH_MOD 64 (since
77 it hits the same cache line otherwise).
78 (2) has PREFETCH_MOD 64
79 (3) has PREFETCH_MOD 4
80       (4) has PREFETCH_MOD 1.  We do not set PREFETCH_BEFORE here, since the
81           cache line accessed by (4) is the same as the one accessed by (5)
82           only with probability 7/32 (the addresses differ by 50 bytes, so
             they share a 64-byte line for only 14 of the 64 possible alignments).
83 (5) has PREFETCH_MOD 1 as well.
84
85 3) We determine how much ahead we need to prefetch. The number of
86 iterations needed is time to fetch / time spent in one iteration of
87 the loop. The problem is that we do not know either of these values,
88 so we just make a heuristic guess based on a magic (possibly)
89 target-specific constant and size of the loop.
90
91 4) Determine which of the references we prefetch. We take into account
92 that there is a maximum number of simultaneous prefetches (provided
93         by the machine description).  We issue as many prefetches as possible
94         while still staying within this bound (starting with those with the lowest
95 prefetch_mod, since they are responsible for most of the cache
96 misses).
97
98 5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
99 and PREFETCH_BEFORE requirements (within some bounds), and to avoid
100 prefetching nonaccessed memory.
101 TODO -- actually implement peeling.
102
103 6) We actually emit the prefetch instructions. ??? Perhaps emit the
104 prefetch instructions with guards in cases where 5) was not sufficient
105 to satisfy the constraints?
106
107     Some other TODO:
108        -- write and use more general reuse analysis (that could also be used
109           in other cache-oriented loop optimizations)
110        -- make it behave sanely together with the prefetches given by the user
111           (now we just ignore them; at the very least we should avoid
112           optimizing loops in which the user put his own prefetches)
113        -- we assume that arrays are aligned to the cache line size; this could
114           be improved.  */
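
/* To illustrate the intended result of steps 3-6 on the example above
   (the numbers are only illustrative): if AHEAD is determined to be 4 and
   the loop ends up being unrolled 4 times, reference (3), with PREFETCH_MOD 4,
   needs a single prefetch per unrolled body, roughly

       __builtin_prefetch (&a[16 * (i + 4)], 1);

   while references (1) and (2), with PREFETCH_MOD 64, only need one prefetch
   per 64 original iterations.  The exact addresses are computed in
   issue_prefetch_ref below.  */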
115
116 /* Magic constants follow. These should be replaced by machine specific
117 numbers. */
118
119 /* True if a write can be prefetched by a read prefetch.  */
120
121 #ifndef WRITE_CAN_USE_READ_PREFETCH
122 #define WRITE_CAN_USE_READ_PREFETCH 1
123 #endif
124
125 /* True if a read can be prefetched by a write prefetch.  */
126
127 #ifndef READ_CAN_USE_WRITE_PREFETCH
128 #define READ_CAN_USE_WRITE_PREFETCH 0
129 #endif
130
131 /* The size of the block loaded by a single prefetch. Usually, this is
132 the same as cache line size (at the moment, we only consider one level
133 of cache hierarchy). */
134
135 #ifndef PREFETCH_BLOCK
136 #define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
137 #endif
138
139 /* Do we have forward hardware sequential prefetching?  */
140
141 #ifndef HAVE_FORWARD_PREFETCH
142 #define HAVE_FORWARD_PREFETCH 0
143 #endif
144
145 /* Do we have backward hardware sequential prefetching?  */
146
147 #ifndef HAVE_BACKWARD_PREFETCH
148 #define HAVE_BACKWARD_PREFETCH 0
149 #endif
150
151 /* In some cases we are only able to determine that there is a certain
152    probability that the two accesses hit the same cache line.  In this
153    case, we issue the prefetches for both of them if this probability
154    is less than (1000 - ACCEPTABLE_MISS_RATE) per mille.  */
155
156 #ifndef ACCEPTABLE_MISS_RATE
157 #define ACCEPTABLE_MISS_RATE 50
158 #endif
159
160 #ifndef HAVE_prefetch
161 #define HAVE_prefetch 0
162 #endif
163
164 /* A group of references among which reuse may occur.  */
165
166 struct mem_ref_group
167 {
168 tree base; /* Base of the reference. */
169 HOST_WIDE_INT step; /* Step of the reference. */
170 struct mem_ref *refs; /* References in the group. */
171 struct mem_ref_group *next; /* Next group of references. */
172 };
173
174 /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */
175
176 #define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0)
177
178 /* The memory reference. */
179
180 struct mem_ref
181 {
182   tree stmt;                    /* Statement in which the reference appears.  */
183 tree mem; /* The reference. */
184 HOST_WIDE_INT delta; /* Constant offset of the reference. */
185 bool write_p; /* Is it a write? */
186 struct mem_ref_group *group; /* The group of references it belongs to. */
187 unsigned HOST_WIDE_INT prefetch_mod;
188 /* Prefetch only each PREFETCH_MOD-th
189 iteration. */
190 unsigned HOST_WIDE_INT prefetch_before;
191 /* Prefetch only first PREFETCH_BEFORE
192 iterations. */
193 bool issue_prefetch_p; /* Should we really issue the prefetch? */
194 struct mem_ref *next; /* The next reference in the group. */
195 };
196
197 /* Dumps information about reference REF to FILE. */
198
199 static void
200 dump_mem_ref (FILE *file, struct mem_ref *ref)
201 {
202 fprintf (file, "Reference %p:\n", (void *) ref);
203
204 fprintf (file, " group %p (base ", (void *) ref->group);
205 print_generic_expr (file, ref->group->base, TDF_SLIM);
206 fprintf (file, ", step ");
207 fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->group->step);
208 fprintf (file, ")\n");
209
210 fprintf (file, " delta ");
211 fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
212 fprintf (file, "\n");
213
214 fprintf (file, " %s\n", ref->write_p ? "write" : "read");
215
216 fprintf (file, "\n");
217 }
218
219 /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
220 exist. */
221
222 static struct mem_ref_group *
223 find_or_create_group (struct mem_ref_group **groups, tree base,
224 HOST_WIDE_INT step)
225 {
226 struct mem_ref_group *group;
227
228 for (; *groups; groups = &(*groups)->next)
229 {
230 if ((*groups)->step == step
231 && operand_equal_p ((*groups)->base, base, 0))
232 return *groups;
233
234 /* Keep the list of groups sorted by decreasing step. */
235 if ((*groups)->step < step)
236 break;
237 }
238
239 group = xcalloc (1, sizeof (struct mem_ref_group));
240 group->base = base;
241 group->step = step;
242 group->refs = NULL;
243 group->next = *groups;
244 *groups = group;
245
246 return group;
247 }
248
249 /* Records a memory reference MEM in GROUP with offset DELTA and write status
250 WRITE_P. The reference occurs in statement STMT. */
251
252 static void
253 record_ref (struct mem_ref_group *group, tree stmt, tree mem,
254 HOST_WIDE_INT delta, bool write_p)
255 {
256 struct mem_ref **aref;
257
258 /* Do not record the same address twice. */
259 for (aref = &group->refs; *aref; aref = &(*aref)->next)
260 {
261       /* A write reference may not be able to reuse a read prefetch, or vice
262          versa; in that case, do not consider the existing reference a
             duplicate.  */
263 if (!WRITE_CAN_USE_READ_PREFETCH
264 && write_p
265 && !(*aref)->write_p)
266 continue;
267 if (!READ_CAN_USE_WRITE_PREFETCH
268 && !write_p
269 && (*aref)->write_p)
270 continue;
271
272 if ((*aref)->delta == delta)
273 return;
274 }
275
276 (*aref) = xcalloc (1, sizeof (struct mem_ref));
277 (*aref)->stmt = stmt;
278 (*aref)->mem = mem;
279 (*aref)->delta = delta;
280 (*aref)->write_p = write_p;
281 (*aref)->prefetch_before = PREFETCH_ALL;
282 (*aref)->prefetch_mod = 1;
283 (*aref)->issue_prefetch_p = false;
284 (*aref)->group = group;
285 (*aref)->next = NULL;
286
287 if (dump_file && (dump_flags & TDF_DETAILS))
288 dump_mem_ref (dump_file, *aref);
289 }
290
291 /* Release memory references in GROUPS. */
292
293 static void
294 release_mem_refs (struct mem_ref_group *groups)
295 {
296 struct mem_ref_group *next_g;
297 struct mem_ref *ref, *next_r;
298
299 for (; groups; groups = next_g)
300 {
301 next_g = groups->next;
302 for (ref = groups->refs; ref; ref = next_r)
303 {
304 next_r = ref->next;
305 free (ref);
306 }
307 free (groups);
308 }
309 }
310
311 /* A structure used to pass arguments to idx_analyze_ref. */
312
313 struct ar_data
314 {
315 struct loop *loop; /* Loop of the reference. */
316 tree stmt; /* Statement of the reference. */
317 HOST_WIDE_INT *step; /* Step of the memory reference. */
318 HOST_WIDE_INT *delta; /* Offset of the memory reference. */
319 };
320
321 /* Analyzes a single INDEX of a memory reference to obtain information
322    described in analyze_ref.  Callback for for_each_index.  */
323
324 static bool
325 idx_analyze_ref (tree base, tree *index, void *data)
326 {
327 struct ar_data *ar_data = data;
328 tree ibase, step, stepsize;
329 HOST_WIDE_INT istep, idelta = 0, imult = 1;
330 affine_iv iv;
331
332 if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
333 || TREE_CODE (base) == ALIGN_INDIRECT_REF)
334 return false;
335
336 if (!simple_iv (ar_data->loop, ar_data->stmt, *index, &iv, false))
337 return false;
338 ibase = iv.base;
339 step = iv.step;
340
341 if (!cst_and_fits_in_hwi (step))
342 return false;
343 istep = int_cst_value (step);
344
345 if (TREE_CODE (ibase) == PLUS_EXPR
346 && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
347 {
348 idelta = int_cst_value (TREE_OPERAND (ibase, 1));
349 ibase = TREE_OPERAND (ibase, 0);
350 }
351 if (cst_and_fits_in_hwi (ibase))
352 {
353 idelta += int_cst_value (ibase);
354 ibase = build_int_cst (TREE_TYPE (ibase), 0);
355 }
356
357 if (TREE_CODE (base) == ARRAY_REF)
358 {
359 stepsize = array_ref_element_size (base);
360 if (!cst_and_fits_in_hwi (stepsize))
361 return false;
362 imult = int_cst_value (stepsize);
363
364 istep *= imult;
365 idelta *= imult;
366 }
367
368 *ar_data->step += istep;
369 *ar_data->delta += idelta;
370 *index = ibase;
371
372 return true;
373 }
374
375 /* Tries to express REF_P in the shape &BASE + STEP * iter + DELTA, where DELTA
376    and STEP are integer constants and iter is the number of iterations of LOOP.
377    The reference occurs in statement STMT.  Strips nonaddressable component
378    references from REF_P.  */
379
380 static bool
381 analyze_ref (struct loop *loop, tree *ref_p, tree *base,
382 HOST_WIDE_INT *step, HOST_WIDE_INT *delta,
383 tree stmt)
384 {
385 struct ar_data ar_data;
386 tree off;
387 HOST_WIDE_INT bit_offset;
388 tree ref = *ref_p;
389
390 *step = 0;
391 *delta = 0;
392
393 /* First strip off the component references. Ignore bitfields. */
394 if (TREE_CODE (ref) == COMPONENT_REF
395 && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))
396 ref = TREE_OPERAND (ref, 0);
397
398 *ref_p = ref;
399
400 for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
401 {
402 off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
403 bit_offset = TREE_INT_CST_LOW (off);
404 gcc_assert (bit_offset % BITS_PER_UNIT == 0);
405
406 *delta += bit_offset / BITS_PER_UNIT;
407 }
408
409 *base = unshare_expr (ref);
410 ar_data.loop = loop;
411 ar_data.stmt = stmt;
412 ar_data.step = step;
413 ar_data.delta = delta;
414 return for_each_index (base, idx_analyze_ref, &ar_data);
415 }
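
/* A hypothetical example of the representation computed above: for an array
   "int x[100]" accessed as x[3 * i + 2], where i is a loop induction variable
   starting at zero and increasing by one, the index has base 2 and step 3;
   idx_analyze_ref multiplies both by the element size 4, so the reference is
   represented as &x[0] + 12 * iter + 8 (STEP 12, DELTA 8).  */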
416
417 /* Record a memory reference REF to the list REFS. The reference occurs in
418    LOOP in statement STMT, and it is a write if WRITE_P is true.  */
419
420 static void
421 gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
422 tree ref, bool write_p, tree stmt)
423 {
424 tree base;
425 HOST_WIDE_INT step, delta;
426 struct mem_ref_group *agrp;
427
428 if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
429 return;
430
431 /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
432 are integer constants. */
433 agrp = find_or_create_group (refs, base, step);
434 record_ref (agrp, stmt, ref, delta, write_p);
435 }
436
437 /* Record the suitable memory references in LOOP. */
438
439 static struct mem_ref_group *
440 gather_memory_references (struct loop *loop)
441 {
442 basic_block *body = get_loop_body_in_dom_order (loop);
443 basic_block bb;
444 unsigned i;
445 block_stmt_iterator bsi;
446 tree stmt, lhs, rhs;
447 struct mem_ref_group *refs = NULL;
448
449   /* Scan the loop body in dominance order, so that references that occur
450      earlier in the body precede the later ones.  */
451 for (i = 0; i < loop->num_nodes; i++)
452 {
453 bb = body[i];
454 if (bb->loop_father != loop)
455 continue;
456
457 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
458 {
459 stmt = bsi_stmt (bsi);
460 if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
461 continue;
462
463 lhs = GIMPLE_STMT_OPERAND (stmt, 0);
464 rhs = GIMPLE_STMT_OPERAND (stmt, 1);
465
466 if (REFERENCE_CLASS_P (rhs))
467 gather_memory_references_ref (loop, &refs, rhs, false, stmt);
468 if (REFERENCE_CLASS_P (lhs))
469 gather_memory_references_ref (loop, &refs, lhs, true, stmt);
470 }
471 }
472 free (body);
473
474 return refs;
475 }
476
477 /* Prune the prefetch candidate REF using self-reuse.  */
478
479 static void
480 prune_ref_by_self_reuse (struct mem_ref *ref)
481 {
482 HOST_WIDE_INT step = ref->group->step;
483 bool backward = step < 0;
484
485 if (step == 0)
486 {
487       /* Prefetch references to an invariant address just once.  */
488 ref->prefetch_before = 1;
489 return;
490 }
491
492 if (backward)
493 step = -step;
494
495 if (step > PREFETCH_BLOCK)
496 return;
497
498 if ((backward && HAVE_BACKWARD_PREFETCH)
499 || (!backward && HAVE_FORWARD_PREFETCH))
500 {
501 ref->prefetch_before = 1;
502 return;
503 }
504
505 ref->prefetch_mod = PREFETCH_BLOCK / step;
506 }
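
/* For instance, with a 64-byte PREFETCH_BLOCK this matches the examples in
   the comment at the top of the file: a[i] (step 1) gets PREFETCH_MOD 64,
   a[16 * i] (step 16) gets PREFETCH_MOD 4, and a[187 * i] (step 187 >
   PREFETCH_BLOCK) is not restricted here at all.  */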
507
508 /* Divides X by BY, rounding down. */
509
510 static HOST_WIDE_INT
511 ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
512 {
513 gcc_assert (by > 0);
514
515 if (x >= 0)
516 return x / by;
517 else
518 return (x + by - 1) / by;
519 }
520
521 /* Prune the prefetch candidate REF using the reuse with BY.
522 If BY_IS_BEFORE is true, BY is before REF in the loop. */
523
524 static void
525 prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
526 bool by_is_before)
527 {
528 HOST_WIDE_INT step = ref->group->step;
529 bool backward = step < 0;
530 HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
531 HOST_WIDE_INT delta = delta_b - delta_r;
532 HOST_WIDE_INT hit_from;
533 unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
534
535 if (delta == 0)
536 {
537       /* If the references have the same address, prefetch only the
538          former.  */
539 if (by_is_before)
540 ref->prefetch_before = 0;
541
542 return;
543 }
544
545 if (!step)
546 {
547 /* If the reference addresses are invariant and fall into the
548 same cache line, prefetch just the first one. */
549 if (!by_is_before)
550 return;
551
552 if (ddown (ref->delta, PREFETCH_BLOCK)
553 != ddown (by->delta, PREFETCH_BLOCK))
554 return;
555
556 ref->prefetch_before = 0;
557 return;
558 }
559
560 /* Only prune the reference that is behind in the array. */
561 if (backward)
562 {
563 if (delta > 0)
564 return;
565
566 /* Transform the data so that we may assume that the accesses
567 are forward. */
568 delta = - delta;
569 step = -step;
570 delta_r = PREFETCH_BLOCK - 1 - delta_r;
571 delta_b = PREFETCH_BLOCK - 1 - delta_b;
572 }
573 else
574 {
575 if (delta < 0)
576 return;
577 }
578
579   /* Check whether the two references are likely to hit the same cache
580      line, and how far apart the iterations in which this occurs are from
581      each other.  */
582
583 if (step <= PREFETCH_BLOCK)
584 {
585 /* The accesses are sure to meet. Let us check when. */
586 hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
587 prefetch_before = (hit_from - delta_r + step - 1) / step;
588
589 if (prefetch_before < ref->prefetch_before)
590 ref->prefetch_before = prefetch_before;
591
592 return;
593 }
594
595   /* A more complicated case.  First let us ensure that the cache line size
596      and the step are coprime (here we assume that PREFETCH_BLOCK is a power
597      of two).  */
598 prefetch_block = PREFETCH_BLOCK;
599 while ((step & 1) == 0
600 && prefetch_block > 1)
601 {
602 step >>= 1;
603 prefetch_block >>= 1;
604 delta >>= 1;
605 }
606
607 /* Now step > prefetch_block, and step and prefetch_block are coprime.
608 Determine the probability that the accesses hit the same cache line. */
609
610 prefetch_before = delta / step;
611 delta %= step;
612 if ((unsigned HOST_WIDE_INT) delta
613 <= (prefetch_block * ACCEPTABLE_MISS_RATE / 1000))
614 {
615 if (prefetch_before < ref->prefetch_before)
616 ref->prefetch_before = prefetch_before;
617
618 return;
619 }
620
621 /* Try also the following iteration. */
622 prefetch_before++;
623 delta = step - delta;
624 if ((unsigned HOST_WIDE_INT) delta
625 <= (prefetch_block * ACCEPTABLE_MISS_RATE / 1000))
626 {
627 if (prefetch_before < ref->prefetch_before)
628 ref->prefetch_before = prefetch_before;
629
630 return;
631 }
632
633   /* REF probably does not reuse BY.  */
634 return;
635 }
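
/* As a worked example of the last branch above, consider pruning reference
   (4) from the comment at the top of the file against (5): step = 187,
   delta = 50, PREFETCH_BLOCK = 64.  Since 187 is odd, no reduction happens;
   prefetch_before becomes 50 / 187 = 0 and delta becomes 50, which exceeds
   the acceptable distance 64 * 50 / 1000 = 3; the next attempt gives
   delta = 187 - 50 = 137, also too large.  Hence no PREFETCH_BEFORE is set,
   in agreement with the 7/32 probability mentioned in that comment.  */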
636
637 /* Prune the prefetch candidate REF using the reuses with other references
638 in REFS. */
639
640 static void
641 prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
642 {
643 struct mem_ref *prune_by;
644 bool before = true;
645
646 prune_ref_by_self_reuse (ref);
647
648 for (prune_by = refs; prune_by; prune_by = prune_by->next)
649 {
650 if (prune_by == ref)
651 {
652 before = false;
653 continue;
654 }
655
656 if (!WRITE_CAN_USE_READ_PREFETCH
657 && ref->write_p
658 && !prune_by->write_p)
659 continue;
660 if (!READ_CAN_USE_WRITE_PREFETCH
661 && !ref->write_p
662 && prune_by->write_p)
663 continue;
664
665 prune_ref_by_group_reuse (ref, prune_by, before);
666 }
667 }
668
669 /* Prune the prefetch candidates in GROUP using the reuse analysis. */
670
671 static void
672 prune_group_by_reuse (struct mem_ref_group *group)
673 {
674 struct mem_ref *ref_pruned;
675
676 for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
677 {
678 prune_ref_by_reuse (ref_pruned, group->refs);
679
680 if (dump_file && (dump_flags & TDF_DETAILS))
681 {
682 fprintf (dump_file, "Reference %p:", (void *) ref_pruned);
683
684 if (ref_pruned->prefetch_before == PREFETCH_ALL
685 && ref_pruned->prefetch_mod == 1)
686 fprintf (dump_file, " no restrictions");
687 else if (ref_pruned->prefetch_before == 0)
688 fprintf (dump_file, " do not prefetch");
689 else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
690 fprintf (dump_file, " prefetch once");
691 else
692 {
693 if (ref_pruned->prefetch_before != PREFETCH_ALL)
694 {
695 fprintf (dump_file, " prefetch before ");
696 fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
697 ref_pruned->prefetch_before);
698 }
699 if (ref_pruned->prefetch_mod != 1)
700 {
701 fprintf (dump_file, " prefetch mod ");
702 fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
703 ref_pruned->prefetch_mod);
704 }
705 }
706 fprintf (dump_file, "\n");
707 }
708 }
709 }
710
711 /* Prune the list of prefetch candidates GROUPS using the reuse analysis. */
712
713 static void
714 prune_by_reuse (struct mem_ref_group *groups)
715 {
716 for (; groups; groups = groups->next)
717 prune_group_by_reuse (groups);
718 }
719
720 /* Returns true if we should issue prefetch for REF. */
721
722 static bool
723 should_issue_prefetch_p (struct mem_ref *ref)
724 {
725   /* For now, do not issue prefetches for references that would only be
726      useful in the first few iterations of the loop.  */
727 if (ref->prefetch_before != PREFETCH_ALL)
728 return false;
729
730 return true;
731 }
732
733 /* Decide which of the prefetch candidates in GROUPS to prefetch.
734 AHEAD is the number of iterations to prefetch ahead (which corresponds
735 to the number of simultaneous instances of one prefetch running at a
736    time).  UNROLL_FACTOR is the factor by which the loop is going to be
737 unrolled. Returns true if there is anything to prefetch. */
738
739 static bool
740 schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
741 unsigned ahead)
742 {
743 unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
744 unsigned slots_per_prefetch;
745 struct mem_ref *ref;
746 bool any = false;
747
748 /* At most SIMULTANEOUS_PREFETCHES should be running at the same time. */
749 remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
750
751 /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
752 AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration,
753 it will need a prefetch slot. */
754 slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
755 if (dump_file && (dump_flags & TDF_DETAILS))
756 fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
757 slots_per_prefetch);
758
759 /* For now we just take memory references one by one and issue
760 prefetches for as many as possible. The groups are sorted
761 starting with the largest step, since the references with
762 large step are more likely to cause many cache misses. */
763
764 for (; groups; groups = groups->next)
765 for (ref = groups->refs; ref; ref = ref->next)
766 {
767 if (!should_issue_prefetch_p (ref))
768 continue;
769
770 /* If we need to prefetch the reference each PREFETCH_MOD iterations,
771 and we unroll the loop UNROLL_FACTOR times, we need to insert
772 ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
773 iteration. */
774 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
775 / ref->prefetch_mod);
776 prefetch_slots = n_prefetches * slots_per_prefetch;
777
778 /* If more than half of the prefetches would be lost anyway, do not
779 issue the prefetch. */
780 if (2 * remaining_prefetch_slots < prefetch_slots)
781 continue;
782
783 ref->issue_prefetch_p = true;
784
785 if (remaining_prefetch_slots <= prefetch_slots)
786 return true;
787 remaining_prefetch_slots -= prefetch_slots;
788 any = true;
789 }
790
791 return any;
792 }
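
/* A small numeric illustration (the parameter values are made up): with
   SIMULTANEOUS_PREFETCHES = 6, AHEAD = 8 and UNROLL_FACTOR = 4, each prefetch
   instruction takes (8 + 2) / 4 = 2 slots.  A reference with PREFETCH_MOD 1
   then needs 4 prefetch instructions per unrolled body, i.e. 8 slots; since
   2 * 6 >= 8 it is scheduled, and as 6 <= 8 the slot budget is exhausted and
   scheduling stops there.  */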
793
794 /* Determine whether there is any reference suitable for prefetching
795 in GROUPS. */
796
797 static bool
798 anything_to_prefetch_p (struct mem_ref_group *groups)
799 {
800 struct mem_ref *ref;
801
802 for (; groups; groups = groups->next)
803 for (ref = groups->refs; ref; ref = ref->next)
804 if (should_issue_prefetch_p (ref))
805 return true;
806
807 return false;
808 }
809
810 /* Issue prefetches for the reference REF into the loop as decided before.
811    AHEAD is the number of iterations to prefetch ahead.  UNROLL_FACTOR
812    is the factor by which the loop was unrolled.  */
813
814 static void
815 issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
816 {
817 HOST_WIDE_INT delta;
818 tree addr, addr_base, prefetch, write_p;
819 block_stmt_iterator bsi;
820 unsigned n_prefetches, ap;
821
822 if (dump_file && (dump_flags & TDF_DETAILS))
823 fprintf (dump_file, "Issued prefetch for %p.\n", (void *) ref);
824
825 bsi = bsi_for_stmt (ref->stmt);
826
827 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
828 / ref->prefetch_mod);
829 addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
830 addr_base = force_gimple_operand_bsi (&bsi, unshare_expr (addr_base), true, NULL);
831 write_p = ref->write_p ? integer_one_node : integer_zero_node;
832
833 for (ap = 0; ap < n_prefetches; ap++)
834 {
835 /* Determine the address to prefetch. */
836 delta = (ahead + ap * ref->prefetch_mod) * ref->group->step;
837 addr = fold_build2 (PLUS_EXPR, ptr_type_node,
838 addr_base, build_int_cst (ptr_type_node, delta));
839 addr = force_gimple_operand_bsi (&bsi, unshare_expr (addr), true, NULL);
840
841 /* Create the prefetch instruction. */
842 prefetch = build_call_expr (built_in_decls[BUILT_IN_PREFETCH],
843 2, addr, write_p);
844 bsi_insert_before (&bsi, prefetch, BSI_SAME_STMT);
845 }
846 }
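
/* For illustration (with invented numbers): a reference with group step 8,
   PREFETCH_MOD 1, UNROLL_FACTOR 4 and AHEAD 10 gets n_prefetches = 4, and the
   loop above emits __builtin_prefetch calls for the current address plus
   80, 88, 96 and 104 bytes.  */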
847
848 /* Issue prefetches for the references in GROUPS into the loop as decided
849    before.  AHEAD is the number of iterations to prefetch ahead.
850    UNROLL_FACTOR is the factor by which the loop was unrolled.  */
851
852 static void
853 issue_prefetches (struct mem_ref_group *groups,
854 unsigned unroll_factor, unsigned ahead)
855 {
856 struct mem_ref *ref;
857
858 for (; groups; groups = groups->next)
859 for (ref = groups->refs; ref; ref = ref->next)
860 if (ref->issue_prefetch_p)
861 issue_prefetch_ref (ref, unroll_factor, ahead);
862 }
863
864 /* Determines whether we can profitably unroll LOOP FACTOR times, and if
865    this is the case, fills in DESC with the description of the number of
866    iterations.  */
867
868 static bool
869 should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
870 unsigned factor)
871 {
872 if (!can_unroll_loop_p (loop, factor, desc))
873 return false;
874
875 /* We only consider loops without control flow for unrolling. This is not
876 a hard restriction -- tree_unroll_loop works with arbitrary loops
877 as well; but the unrolling/prefetching is usually more profitable for
878 loops consisting of a single basic block, and we want to limit the
879 code growth. */
880 if (loop->num_nodes > 2)
881 return false;
882
883 return true;
884 }
885
886 /* Determine the factor by which to unroll LOOP, from the information
887    contained in the list of memory references REFS.  The description of the
888    number of iterations of LOOP is stored to DESC.  NINSNS is the number of
889    insns of the LOOP.  EST_NITER is the estimated number of iterations of
890    the loop, or -1 if no estimate is available.  */
891
892 static unsigned
893 determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
894 unsigned ninsns, struct tree_niter_desc *desc,
895 HOST_WIDE_INT est_niter)
896 {
897 unsigned upper_bound;
898 unsigned nfactor, factor, mod_constraint;
899 struct mem_ref_group *agp;
900 struct mem_ref *ref;
901
902   /* First check whether the loop is not too large to unroll.  We ignore
903      PARAM_MAX_UNROLL_TIMES, because for small loops it prevented us
904      from unrolling them enough for each iteration to cover exactly one
905      cache line.  Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
906      us from unrolling loops too many times in cases where we only expect
907      gains from better scheduling and decreased loop overhead, which is not
908      the case here.  */
909 upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
910
911   /* If we unrolled the loop more times than it iterates, the unrolled version
912      of the loop would never be entered.  */
913 if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
914 upper_bound = est_niter;
915
916 if (upper_bound <= 1)
917 return 1;
918
919   /* Choose the factor so that we may prefetch each cache line just once,
920      but bound the unrolling by UPPER_BOUND.  */
921 factor = 1;
922 for (agp = refs; agp; agp = agp->next)
923 for (ref = agp->refs; ref; ref = ref->next)
924 if (should_issue_prefetch_p (ref))
925 {
926 mod_constraint = ref->prefetch_mod;
927 nfactor = least_common_multiple (mod_constraint, factor);
928 if (nfactor <= upper_bound)
929 factor = nfactor;
930 }
931
932 if (!should_unroll_loop_p (loop, desc, factor))
933 return 1;
934
935 return factor;
936 }
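
/* For illustration (with made-up numbers): if the references to be prefetched
   have PREFETCH_MOD 4 and 64 and UPPER_BOUND is 16, the loop above first sets
   FACTOR to lcm (1, 4) = 4 and then rejects lcm (4, 64) = 64 because it
   exceeds UPPER_BOUND, so the loop is unrolled 4 times.  */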
937
938 /* Issue prefetch instructions for array references in LOOP. Returns
939 true if the LOOP was unrolled. */
940
941 static bool
942 loop_prefetch_arrays (struct loop *loop)
943 {
944 struct mem_ref_group *refs;
945 unsigned ahead, ninsns, time, unroll_factor;
946 HOST_WIDE_INT est_niter;
947 struct tree_niter_desc desc;
948 bool unrolled = false;
949
950 /* Step 1: gather the memory references. */
951 refs = gather_memory_references (loop);
952
953 /* Step 2: estimate the reuse effects. */
954 prune_by_reuse (refs);
955
956 if (!anything_to_prefetch_p (refs))
957 goto fail;
958
959 /* Step 3: determine the ahead and unroll factor. */
960
961 /* FIXME: the time should be weighted by the probabilities of the blocks in
962 the loop body. */
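  /* For instance (with hypothetical costs), a PREFETCH_LATENCY of 200 and an
     estimated body execution time of 30 make the computation below yield
     AHEAD = (200 + 30 - 1) / 30 = 7, i.e. data is prefetched about seven
     iterations in advance.  */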
963 time = tree_num_loop_insns (loop, &eni_time_weights);
964 ahead = (PREFETCH_LATENCY + time - 1) / time;
965 est_niter = estimated_loop_iterations_int (loop, false);
966
967 /* The prefetches will run for AHEAD iterations of the original loop. Unless
968 the loop rolls at least AHEAD times, prefetching the references does not
969 make sense. */
970 if (est_niter >= 0 && est_niter <= (HOST_WIDE_INT) ahead)
971 goto fail;
972
973 ninsns = tree_num_loop_insns (loop, &eni_size_weights);
974 unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
975 est_niter);
976 if (dump_file && (dump_flags & TDF_DETAILS))
977 fprintf (dump_file, "Ahead %d, unroll factor %d\n", ahead, unroll_factor);
978
979 /* Step 4: what to prefetch? */
980 if (!schedule_prefetches (refs, unroll_factor, ahead))
981 goto fail;
982
983 /* Step 5: unroll the loop. TODO -- peeling of first and last few
984 iterations so that we do not issue superfluous prefetches. */
985 if (unroll_factor != 1)
986 {
987 tree_unroll_loop (loop, unroll_factor,
988 single_dom_exit (loop), &desc);
989 unrolled = true;
990 }
991
992 /* Step 6: issue the prefetches. */
993 issue_prefetches (refs, unroll_factor, ahead);
994
995 fail:
996 release_mem_refs (refs);
997 return unrolled;
998 }
999
1000 /* Issue prefetch instructions for array references in loops. */
1001
1002 unsigned int
1003 tree_ssa_prefetch_arrays (void)
1004 {
1005 loop_iterator li;
1006 struct loop *loop;
1007 bool unrolled = false;
1008 int todo_flags = 0;
1009
1010 if (!HAVE_prefetch
1011       /* It is possible to ask the compiler for, say, -mtune=i486 -march=pentium4.
1012          -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
1013          of the processor costs and i486 does not have prefetch, but
1014          -march=pentium4 causes HAVE_prefetch to be true.  Ugh.  */
1015 || PREFETCH_BLOCK == 0)
1016 return 0;
1017
1018 if (dump_file && (dump_flags & TDF_DETAILS))
1019 {
1020 fprintf (dump_file, "Prefetching parameters:\n");
1021 fprintf (dump_file, " simultaneous prefetches: %d\n",
1022 SIMULTANEOUS_PREFETCHES);
1023 fprintf (dump_file, " prefetch latency: %d\n", PREFETCH_LATENCY);
1024 fprintf (dump_file, " L1 cache size: %d (%d bytes)\n",
1025 L1_CACHE_SIZE, L1_CACHE_SIZE * L1_CACHE_LINE_SIZE);
1026 fprintf (dump_file, " L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
1027 fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK);
1028 fprintf (dump_file, "\n");
1029 }
1030
1031 initialize_original_copy_tables ();
1032
1033 if (!built_in_decls[BUILT_IN_PREFETCH])
1034 {
1035 tree type = build_function_type (void_type_node,
1036 tree_cons (NULL_TREE,
1037 const_ptr_type_node,
1038 NULL_TREE));
1039 tree decl = add_builtin_function ("__builtin_prefetch", type,
1040 BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
1041 NULL, NULL_TREE);
1042 DECL_IS_NOVOPS (decl) = true;
1043 built_in_decls[BUILT_IN_PREFETCH] = decl;
1044 }
1045
1046   /* We assume that the cache line size is a power of two, so verify this
1047      here.  */
1048 gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);
1049
1050 FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
1051 {
1052 if (dump_file && (dump_flags & TDF_DETAILS))
1053 fprintf (dump_file, "Processing loop %d:\n", loop->num);
1054
1055 unrolled |= loop_prefetch_arrays (loop);
1056
1057 if (dump_file && (dump_flags & TDF_DETAILS))
1058 fprintf (dump_file, "\n\n");
1059 }
1060
1061 if (unrolled)
1062 {
1063 scev_reset ();
1064 todo_flags |= TODO_cleanup_cfg;
1065 }
1066
1067 free_original_copy_tables ();
1068 return todo_flags;
1069 }