Account for loop unrolling in the insn-to-prefetch ratio heuristic.
[gcc.git] / gcc / tree-ssa-loop-prefetch.c
1 /* Array prefetching.
2 Copyright (C) 2005, 2007, 2008 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "tree.h"
25 #include "rtl.h"
26 #include "tm_p.h"
27 #include "hard-reg-set.h"
28 #include "basic-block.h"
29 #include "output.h"
30 #include "diagnostic.h"
31 #include "tree-flow.h"
32 #include "tree-dump.h"
33 #include "timevar.h"
34 #include "cfgloop.h"
35 #include "expr.h"
36 #include "tree-pass.h"
37 #include "ggc.h"
38 #include "insn-config.h"
39 #include "recog.h"
40 #include "hashtab.h"
41 #include "tree-chrec.h"
42 #include "tree-scalar-evolution.h"
43 #include "toplev.h"
44 #include "params.h"
45 #include "langhooks.h"
46 #include "tree-inline.h"
47 #include "tree-data-ref.h"
48 #include "optabs.h"
49
50 /* This pass inserts prefetch instructions to optimize cache usage during
51 accesses to arrays in loops. It processes loops sequentially and:
52
53 1) Gathers all memory references in the loop.
54 2) For each of the references it decides when it is profitable to prefetch
55 it. To do so, we evaluate the reuse among the accesses, and determine
56 two values: PREFETCH_BEFORE (meaning that it only makes sense to do
57 prefetching in the first PREFETCH_BEFORE iterations of the loop) and
58 PREFETCH_MOD (meaning that it only makes sense to prefetch in the
59 iterations of the loop that are zero modulo PREFETCH_MOD). For example
60 (assuming cache line size is 64 bytes, char has size 1 byte and there
61 is no hardware sequential prefetch):
62
63 char *a;
64 for (i = 0; i < max; i++)
65 {
66 a[255] = ...; (0)
67 a[i] = ...; (1)
68 a[i + 64] = ...; (2)
69 a[16*i] = ...; (3)
70 a[187*i] = ...; (4)
71 a[187*i + 50] = ...; (5)
72 }
73
74 (0) obviously has PREFETCH_BEFORE 1
75 (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
76 location 64 iterations before it, and PREFETCH_MOD 64 (since
77 it hits the same cache line otherwise).
78 (2) has PREFETCH_MOD 64
79 (3) has PREFETCH_MOD 4
80 (4) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since
81 the cache line accessed by (4) is the same as the one accessed by (5)
82 with probability only 7/32.
83 (5) has PREFETCH_MOD 1 as well.
84
85 Additionally, we use data dependence analysis to determine for each
86 reference the distance till the first reuse; this information is used
87 to determine the temporality of the issued prefetch instruction.
88
89 3) We determine how much ahead we need to prefetch. The number of
90 iterations needed is time to fetch / time spent in one iteration of
91 the loop. The problem is that we do not know either of these values,
92 so we just make a heuristic guess based on a magic (possibly)
93 target-specific constant and size of the loop.
94
95 4) Determine which of the references we prefetch. We take into account
96 that there is a maximum number of simultaneous prefetches (provided
97 by machine description). We issue as many prefetches as possible
98 while still within this bound (starting with those with lowest
99 prefetch_mod, since they are responsible for most of the cache
100 misses).
101
102 5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
103 and PREFETCH_BEFORE requirements (within some bounds), and to avoid
104 prefetching nonaccessed memory.
105 TODO -- actually implement peeling.
106
107 6) We actually emit the prefetch instructions. ??? Perhaps emit the
108 prefetch instructions with guards in cases where 5) was not sufficient
109 to satisfy the constraints?
110
111 The function is_loop_prefetching_profitable() implements a cost model
112 to determine if prefetching is profitable for a given loop. The cost
113 model has two heuristics:
114 1. A heuristic that determines whether the given loop has enough CPU
115 ops that can be overlapped with cache-missing memory ops.
116 If not, the loop won't benefit from prefetching. This is implemented
117 by requiring the ratio between the instruction count and the mem ref
118 count to be above a certain minimum.
119 2. A heuristic that disables prefetching in a loop with an unknown trip
120 count if the prefetching cost is above a certain limit. The relative
121 prefetching cost is estimated by taking the ratio between the
122 prefetch count and the total instruction count (this models the I-cache
123 cost).
124 The limits used in these heuristics are defined as parameters with
125 reasonable default values. Machine-specific default values will be
126 added later.
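
   As an illustration (the numbers are made up, not target defaults): a loop
   with ninsns = 12, mem_ref_count = 4, prefetch_count = 3 and an unroll
   factor of 4 has insn_to_mem_ratio = 12 / 4 = 3 for heuristic 1, and, if
   its trip count is unknown, insn_to_prefetch_ratio = (4 * 12) / 3 = 16 for
   heuristic 2; each ratio is then compared against its parameter.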
127
128 Some other TODO:
129 -- write and use more general reuse analysis (that could be also used
130 in other cache-aimed loop optimizations)
131 -- make it behave sanely together with the prefetches given by the user
132 (now we just ignore them; at the very least we should avoid
133 optimizing loops in which the user put his own prefetches)
134 -- we assume cache line size alignment of arrays; this could be
135 improved. */
136
137 /* Magic constants follow. These should be replaced by machine-specific
138 numbers. */
139
140 /* True if a write can be prefetched by a read prefetch. */
141
142 #ifndef WRITE_CAN_USE_READ_PREFETCH
143 #define WRITE_CAN_USE_READ_PREFETCH 1
144 #endif
145
146 /* True if a read can be prefetched by a write prefetch. */
147
148 #ifndef READ_CAN_USE_WRITE_PREFETCH
149 #define READ_CAN_USE_WRITE_PREFETCH 0
150 #endif
151
152 /* The size of the block loaded by a single prefetch. Usually, this is
153 the same as cache line size (at the moment, we only consider one level
154 of cache hierarchy). */
155
156 #ifndef PREFETCH_BLOCK
157 #define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
158 #endif
159
160 /* Do we have forward hardware sequential prefetching? */
161
162 #ifndef HAVE_FORWARD_PREFETCH
163 #define HAVE_FORWARD_PREFETCH 0
164 #endif
165
166 /* Do we have backward hardware sequential prefetching? */
167
168 #ifndef HAVE_BACKWARD_PREFETCH
169 #define HAVE_BACKWARD_PREFETCH 0
170 #endif
171
172 /* In some cases we are only able to determine that there is a certain
173 probability that the two accesses hit the same cache line. In this
174 case, we issue the prefetches for both of them if this probability
175 is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand. */
176
177 #ifndef ACCEPTABLE_MISS_RATE
178 #define ACCEPTABLE_MISS_RATE 50
179 #endif
180
181 #ifndef HAVE_prefetch
182 #define HAVE_prefetch 0
183 #endif
184
185 #define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
186 #define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))
187
188 /* We consider a memory access nontemporal if it is not reused sooner than
189 after L2_CACHE_SIZE_BYTES of memory are accessed. However, we ignore
190 accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
191 so that we use nontemporal prefetches e.g. if a single memory location
192 is accessed several times in a single iteration of the loop. */
193 #define NONTEMPORAL_FRACTION 16
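
/* For example (cache sizes are target parameters; these values are only
   illustrative): with a 32 kB L1 cache and a 512 kB L2 cache, a reference is
   considered nontemporal if at least 512 kB of data are accessed before its
   reuse, unless the reuse happens within 32 kB / 16 = 2 kB.  */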
194
195 /* In case we have to emit a memory fence instruction after the loop that
196 uses nontemporal stores, this defines the builtin to use. */
197
198 #ifndef FENCE_FOLLOWING_MOVNT
199 #define FENCE_FOLLOWING_MOVNT NULL_TREE
200 #endif
201
202 /* The group of references between which reuse may occur. */
203
204 struct mem_ref_group
205 {
206 tree base; /* Base of the reference. */
207 HOST_WIDE_INT step; /* Step of the reference. */
208 struct mem_ref *refs; /* References in the group. */
209 struct mem_ref_group *next; /* Next group of references. */
210 };
211
212 /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */
213
214 #define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0)
215
216 /* The memory reference. */
217
218 struct mem_ref
219 {
220 gimple stmt; /* Statement in which the reference appears. */
221 tree mem; /* The reference. */
222 HOST_WIDE_INT delta; /* Constant offset of the reference. */
223 struct mem_ref_group *group; /* The group of references it belongs to. */
224 unsigned HOST_WIDE_INT prefetch_mod;
225 /* Prefetch only each PREFETCH_MOD-th
226 iteration. */
227 unsigned HOST_WIDE_INT prefetch_before;
228 /* Prefetch only first PREFETCH_BEFORE
229 iterations. */
230 unsigned reuse_distance; /* The amount of data accessed before the first
231 reuse of this value. */
232 struct mem_ref *next; /* The next reference in the group. */
233 unsigned write_p : 1; /* Is it a write? */
234 unsigned independent_p : 1; /* True if the reference is independent of
235 all other references inside the loop. */
236 unsigned issue_prefetch_p : 1; /* Should we really issue the prefetch? */
237 unsigned storent_p : 1; /* True if we changed the store to a
238 nontemporal one. */
239 };
240
241 /* Dumps information about reference REF to FILE. */
242
243 static void
244 dump_mem_ref (FILE *file, struct mem_ref *ref)
245 {
246 fprintf (file, "Reference %p:\n", (void *) ref);
247
248 fprintf (file, " group %p (base ", (void *) ref->group);
249 print_generic_expr (file, ref->group->base, TDF_SLIM);
250 fprintf (file, ", step ");
251 fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->group->step);
252 fprintf (file, ")\n");
253
254 fprintf (file, " delta ");
255 fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
256 fprintf (file, "\n");
257
258 fprintf (file, " %s\n", ref->write_p ? "write" : "read");
259
260 fprintf (file, "\n");
261 }
262
263 /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
264 exist. */
265
266 static struct mem_ref_group *
267 find_or_create_group (struct mem_ref_group **groups, tree base,
268 HOST_WIDE_INT step)
269 {
270 struct mem_ref_group *group;
271
272 for (; *groups; groups = &(*groups)->next)
273 {
274 if ((*groups)->step == step
275 && operand_equal_p ((*groups)->base, base, 0))
276 return *groups;
277
278 /* Keep the list of groups sorted by decreasing step. */
279 if ((*groups)->step < step)
280 break;
281 }
282
283 group = XNEW (struct mem_ref_group);
284 group->base = base;
285 group->step = step;
286 group->refs = NULL;
287 group->next = *groups;
288 *groups = group;
289
290 return group;
291 }
292
293 /* Records a memory reference MEM in GROUP with offset DELTA and write status
294 WRITE_P. The reference occurs in statement STMT. */
295
296 static void
297 record_ref (struct mem_ref_group *group, gimple stmt, tree mem,
298 HOST_WIDE_INT delta, bool write_p)
299 {
300 struct mem_ref **aref;
301
302 /* Do not record the same address twice. */
303 for (aref = &group->refs; *aref; aref = &(*aref)->next)
304 {
305 /* It may not be possible for a write reference to reuse a read
306 prefetch, or vice versa. */
307 if (!WRITE_CAN_USE_READ_PREFETCH
308 && write_p
309 && !(*aref)->write_p)
310 continue;
311 if (!READ_CAN_USE_WRITE_PREFETCH
312 && !write_p
313 && (*aref)->write_p)
314 continue;
315
316 if ((*aref)->delta == delta)
317 return;
318 }
319
320 (*aref) = XNEW (struct mem_ref);
321 (*aref)->stmt = stmt;
322 (*aref)->mem = mem;
323 (*aref)->delta = delta;
324 (*aref)->write_p = write_p;
325 (*aref)->prefetch_before = PREFETCH_ALL;
326 (*aref)->prefetch_mod = 1;
327 (*aref)->reuse_distance = 0;
328 (*aref)->issue_prefetch_p = false;
329 (*aref)->group = group;
330 (*aref)->next = NULL;
331 (*aref)->independent_p = false;
332 (*aref)->storent_p = false;
333
334 if (dump_file && (dump_flags & TDF_DETAILS))
335 dump_mem_ref (dump_file, *aref);
336 }
337
338 /* Release memory references in GROUPS. */
339
340 static void
341 release_mem_refs (struct mem_ref_group *groups)
342 {
343 struct mem_ref_group *next_g;
344 struct mem_ref *ref, *next_r;
345
346 for (; groups; groups = next_g)
347 {
348 next_g = groups->next;
349 for (ref = groups->refs; ref; ref = next_r)
350 {
351 next_r = ref->next;
352 free (ref);
353 }
354 free (groups);
355 }
356 }
357
358 /* A structure used to pass arguments to idx_analyze_ref. */
359
360 struct ar_data
361 {
362 struct loop *loop; /* Loop of the reference. */
363 gimple stmt; /* Statement of the reference. */
364 HOST_WIDE_INT *step; /* Step of the memory reference. */
365 HOST_WIDE_INT *delta; /* Offset of the memory reference. */
366 };
367
368 /* Analyzes a single INDEX of a memory reference to obtain information
369 described in analyze_ref. Callback for for_each_index. */
370
371 static bool
372 idx_analyze_ref (tree base, tree *index, void *data)
373 {
374 struct ar_data *ar_data = (struct ar_data *) data;
375 tree ibase, step, stepsize;
376 HOST_WIDE_INT istep, idelta = 0, imult = 1;
377 affine_iv iv;
378
379 if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
380 || TREE_CODE (base) == ALIGN_INDIRECT_REF)
381 return false;
382
383 if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
384 *index, &iv, false))
385 return false;
386 ibase = iv.base;
387 step = iv.step;
388
389 if (!cst_and_fits_in_hwi (step))
390 return false;
391 istep = int_cst_value (step);
392
393 if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
394 && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
395 {
396 idelta = int_cst_value (TREE_OPERAND (ibase, 1));
397 ibase = TREE_OPERAND (ibase, 0);
398 }
399 if (cst_and_fits_in_hwi (ibase))
400 {
401 idelta += int_cst_value (ibase);
402 ibase = build_int_cst (TREE_TYPE (ibase), 0);
403 }
404
405 if (TREE_CODE (base) == ARRAY_REF)
406 {
407 stepsize = array_ref_element_size (base);
408 if (!cst_and_fits_in_hwi (stepsize))
409 return false;
410 imult = int_cst_value (stepsize);
411
412 istep *= imult;
413 idelta *= imult;
414 }
415
416 *ar_data->step += istep;
417 *ar_data->delta += idelta;
418 *index = ibase;
419
420 return true;
421 }
422
423 /* Tries to express REF_P in the form &BASE + STEP * iter + DELTA, where DELTA and
424 STEP are integer constants and iter is the number of iterations of LOOP. The
425 reference occurs in statement STMT. Strips nonaddressable component
426 references from REF_P. */
427
428 static bool
429 analyze_ref (struct loop *loop, tree *ref_p, tree *base,
430 HOST_WIDE_INT *step, HOST_WIDE_INT *delta,
431 gimple stmt)
432 {
433 struct ar_data ar_data;
434 tree off;
435 HOST_WIDE_INT bit_offset;
436 tree ref = *ref_p;
437
438 *step = 0;
439 *delta = 0;
440
441 /* First strip off the component references. Ignore bitfields. */
442 if (TREE_CODE (ref) == COMPONENT_REF
443 && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))
444 ref = TREE_OPERAND (ref, 0);
445
446 *ref_p = ref;
447
448 for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
449 {
450 off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
451 bit_offset = TREE_INT_CST_LOW (off);
452 gcc_assert (bit_offset % BITS_PER_UNIT == 0);
453
454 *delta += bit_offset / BITS_PER_UNIT;
455 }
456
457 *base = unshare_expr (ref);
458 ar_data.loop = loop;
459 ar_data.stmt = stmt;
460 ar_data.step = step;
461 ar_data.delta = delta;
462 return for_each_index (base, idx_analyze_ref, &ar_data);
463 }
464
465 /* Record a memory reference REF to the list REFS. The reference occurs in
466 LOOP in statement STMT and it is a write if WRITE_P is true. Returns true if the
467 reference was recorded, false otherwise. */
468
469 static bool
470 gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
471 tree ref, bool write_p, gimple stmt)
472 {
473 tree base;
474 HOST_WIDE_INT step, delta;
475 struct mem_ref_group *agrp;
476
477 if (get_base_address (ref) == NULL)
478 return false;
479
480 if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
481 return false;
482
483 /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
484 are integer constants. */
485 agrp = find_or_create_group (refs, base, step);
486 record_ref (agrp, stmt, ref, delta, write_p);
487
488 return true;
489 }
490
491 /* Record the suitable memory references in LOOP. NO_OTHER_REFS is set to
492 true if there are no other memory references inside the loop. REF_COUNT is set to the number of memory references found in the loop. */
493
494 static struct mem_ref_group *
495 gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
496 {
497 basic_block *body = get_loop_body_in_dom_order (loop);
498 basic_block bb;
499 unsigned i;
500 gimple_stmt_iterator bsi;
501 gimple stmt;
502 tree lhs, rhs;
503 struct mem_ref_group *refs = NULL;
504
505 *no_other_refs = true;
506 *ref_count = 0;
507
508 /* Scan the loop body in order, so that references that occur earlier precede the
509 later ones. */
510 for (i = 0; i < loop->num_nodes; i++)
511 {
512 bb = body[i];
513 if (bb->loop_father != loop)
514 continue;
515
516 for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
517 {
518 stmt = gsi_stmt (bsi);
519
520 if (gimple_code (stmt) != GIMPLE_ASSIGN)
521 {
522 if (gimple_vuse (stmt)
523 || (is_gimple_call (stmt)
524 && !(gimple_call_flags (stmt) & ECF_CONST)))
525 *no_other_refs = false;
526 continue;
527 }
528
529 lhs = gimple_assign_lhs (stmt);
530 rhs = gimple_assign_rhs1 (stmt);
531
532 if (REFERENCE_CLASS_P (rhs))
533 {
534 *no_other_refs &= gather_memory_references_ref (loop, &refs,
535 rhs, false, stmt);
536 *ref_count += 1;
537 }
538 if (REFERENCE_CLASS_P (lhs))
539 {
540 *no_other_refs &= gather_memory_references_ref (loop, &refs,
541 lhs, true, stmt);
542 *ref_count += 1;
543 }
544 }
545 }
546 free (body);
547
548 return refs;
549 }
550
551 /* Prune the prefetch candidate REF using self-reuse. */
552
553 static void
554 prune_ref_by_self_reuse (struct mem_ref *ref)
555 {
556 HOST_WIDE_INT step = ref->group->step;
557 bool backward = step < 0;
558
559 if (step == 0)
560 {
561 /* Prefetch references to an invariant address just once. */
562 ref->prefetch_before = 1;
563 return;
564 }
565
566 if (backward)
567 step = -step;
568
569 if (step > PREFETCH_BLOCK)
570 return;
571
572 if ((backward && HAVE_BACKWARD_PREFETCH)
573 || (!backward && HAVE_FORWARD_PREFETCH))
574 {
575 ref->prefetch_before = 1;
576 return;
577 }
578
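  /* E.g. with step = 4 and PREFETCH_BLOCK = 64 (illustrative values), a new
     cache line is entered only every 64 / 4 = 16th iteration, so issuing a
     prefetch every 16th iteration suffices.  */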
579 ref->prefetch_mod = PREFETCH_BLOCK / step;
580 }
581
582 /* Divides X by BY, rounding down. */
583
584 static HOST_WIDE_INT
585 ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
586 {
587 gcc_assert (by > 0);
588
589 if (x >= 0)
590 return x / by;
591 else
592 return (x + by - 1) / by;
593 }
594
595 /* Given a CACHE_LINE_SIZE and two inductive memory references
596 with a common STEP greater than CACHE_LINE_SIZE and an address
597 difference DELTA, compute the probability that they will fall
598 in different cache lines. DISTINCT_ITERS is the number of
599 distinct iterations after which the pattern repeats itself.
600 ALIGN_UNIT is the unit of alignment in bytes. */
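
/* For example (illustrative values only): with CACHE_LINE_SIZE = 64,
   STEP = 96, DELTA = 32, DISTINCT_ITERS = 2 and ALIGN_UNIT = 4, half of the
   32 simulated (alignment, iteration) pairs put the two addresses into
   different cache lines, so the computed miss rate is 500 per thousand.  */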
601
602 static int
603 compute_miss_rate (unsigned HOST_WIDE_INT cache_line_size,
604 HOST_WIDE_INT step, HOST_WIDE_INT delta,
605 unsigned HOST_WIDE_INT distinct_iters,
606 int align_unit)
607 {
608 unsigned align, iter;
609 int total_positions, miss_positions, miss_rate;
610 int address1, address2, cache_line1, cache_line2;
611
612 total_positions = 0;
613 miss_positions = 0;
614
615 /* Iterate through all possible alignments of the first
616 memory reference within its cache line. */
617 for (align = 0; align < cache_line_size; align += align_unit)
618
619 /* Iterate through all distinct iterations. */
620 for (iter = 0; iter < distinct_iters; iter++)
621 {
622 address1 = align + step * iter;
623 address2 = address1 + delta;
624 cache_line1 = address1 / cache_line_size;
625 cache_line2 = address2 / cache_line_size;
626 total_positions += 1;
627 if (cache_line1 != cache_line2)
628 miss_positions += 1;
629 }
630 miss_rate = 1000 * miss_positions / total_positions;
631 return miss_rate;
632 }
633
634 /* Prune the prefetch candidate REF using the reuse with BY.
635 If BY_IS_BEFORE is true, BY is before REF in the loop. */
636
637 static void
638 prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
639 bool by_is_before)
640 {
641 HOST_WIDE_INT step = ref->group->step;
642 bool backward = step < 0;
643 HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
644 HOST_WIDE_INT delta = delta_b - delta_r;
645 HOST_WIDE_INT hit_from;
646 unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
647 int miss_rate;
648 HOST_WIDE_INT reduced_step;
649 unsigned HOST_WIDE_INT reduced_prefetch_block;
650 tree ref_type;
651 int align_unit;
652
653 if (delta == 0)
654 {
655 /* If the references has the same address, only prefetch the
656 former. */
657 if (by_is_before)
658 ref->prefetch_before = 0;
659
660 return;
661 }
662
663 if (!step)
664 {
665 /* If the reference addresses are invariant and fall into the
666 same cache line, prefetch just the first one. */
667 if (!by_is_before)
668 return;
669
670 if (ddown (ref->delta, PREFETCH_BLOCK)
671 != ddown (by->delta, PREFETCH_BLOCK))
672 return;
673
674 ref->prefetch_before = 0;
675 return;
676 }
677
678 /* Only prune the reference that is behind in the array. */
679 if (backward)
680 {
681 if (delta > 0)
682 return;
683
684 /* Transform the data so that we may assume that the accesses
685 are forward. */
686 delta = - delta;
687 step = -step;
688 delta_r = PREFETCH_BLOCK - 1 - delta_r;
689 delta_b = PREFETCH_BLOCK - 1 - delta_b;
690 }
691 else
692 {
693 if (delta < 0)
694 return;
695 }
696
697 /* Check whether the two references are likely to hit the same cache
698 line, and how distant the iterations in which it occurs are from
699 each other. */
700
701 if (step <= PREFETCH_BLOCK)
702 {
703 /* The accesses are sure to meet. Let us check when. */
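      /* For instance (illustrative values): with PREFETCH_BLOCK = 64,
	 step = 4, delta_r = 0 and delta_b = 100, HIT_FROM is 64 and
	 PREFETCH_BEFORE becomes (64 - 0 + 4 - 1) / 4 = 16, i.e. prefetching
	 REF only pays off in the first 16 iterations.  */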
704 hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
705 prefetch_before = (hit_from - delta_r + step - 1) / step;
706
707 if (prefetch_before < ref->prefetch_before)
708 ref->prefetch_before = prefetch_before;
709
710 return;
711 }
712
713 /* A more complicated case with step > prefetch_block. First reduce
714 the ratio between the step and the cache line size to its simplest
715 terms. The resulting denominator will then represent the number of
716 distinct iterations after which each address will go back to its
717 initial location within the cache line. This computation assumes
718 that PREFETCH_BLOCK is a power of two. */
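  /* E.g. step = 96 and PREFETCH_BLOCK = 64 reduce to 3 and 2, so the
     alignment pattern repeats after 2 distinct iterations (illustrative
     values only).  */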
719 prefetch_block = PREFETCH_BLOCK;
720 reduced_prefetch_block = prefetch_block;
721 reduced_step = step;
722 while ((reduced_step & 1) == 0
723 && reduced_prefetch_block > 1)
724 {
725 reduced_step >>= 1;
726 reduced_prefetch_block >>= 1;
727 }
728
729 prefetch_before = delta / step;
730 delta %= step;
731 ref_type = TREE_TYPE (ref->mem);
732 align_unit = TYPE_ALIGN (ref_type) / 8;
733 miss_rate = compute_miss_rate(prefetch_block, step, delta,
734 reduced_prefetch_block, align_unit);
735 if (miss_rate <= ACCEPTABLE_MISS_RATE)
736 {
737 if (prefetch_before < ref->prefetch_before)
738 ref->prefetch_before = prefetch_before;
739
740 return;
741 }
742
743 /* Try also the following iteration. */
744 prefetch_before++;
745 delta = step - delta;
746 miss_rate = compute_miss_rate(prefetch_block, step, delta,
747 reduced_prefetch_block, align_unit);
748 if (miss_rate <= ACCEPTABLE_MISS_RATE)
749 {
750 if (prefetch_before < ref->prefetch_before)
751 ref->prefetch_before = prefetch_before;
752
753 return;
754 }
755
756 /* REF probably does not reuse BY. */
757 return;
758 }
759
760 /* Prune the prefetch candidate REF using the reuses with other references
761 in REFS. */
762
763 static void
764 prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
765 {
766 struct mem_ref *prune_by;
767 bool before = true;
768
769 prune_ref_by_self_reuse (ref);
770
771 for (prune_by = refs; prune_by; prune_by = prune_by->next)
772 {
773 if (prune_by == ref)
774 {
775 before = false;
776 continue;
777 }
778
779 if (!WRITE_CAN_USE_READ_PREFETCH
780 && ref->write_p
781 && !prune_by->write_p)
782 continue;
783 if (!READ_CAN_USE_WRITE_PREFETCH
784 && !ref->write_p
785 && prune_by->write_p)
786 continue;
787
788 prune_ref_by_group_reuse (ref, prune_by, before);
789 }
790 }
791
792 /* Prune the prefetch candidates in GROUP using the reuse analysis. */
793
794 static void
795 prune_group_by_reuse (struct mem_ref_group *group)
796 {
797 struct mem_ref *ref_pruned;
798
799 for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
800 {
801 prune_ref_by_reuse (ref_pruned, group->refs);
802
803 if (dump_file && (dump_flags & TDF_DETAILS))
804 {
805 fprintf (dump_file, "Reference %p:", (void *) ref_pruned);
806
807 if (ref_pruned->prefetch_before == PREFETCH_ALL
808 && ref_pruned->prefetch_mod == 1)
809 fprintf (dump_file, " no restrictions");
810 else if (ref_pruned->prefetch_before == 0)
811 fprintf (dump_file, " do not prefetch");
812 else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
813 fprintf (dump_file, " prefetch once");
814 else
815 {
816 if (ref_pruned->prefetch_before != PREFETCH_ALL)
817 {
818 fprintf (dump_file, " prefetch before ");
819 fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
820 ref_pruned->prefetch_before);
821 }
822 if (ref_pruned->prefetch_mod != 1)
823 {
824 fprintf (dump_file, " prefetch mod ");
825 fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
826 ref_pruned->prefetch_mod);
827 }
828 }
829 fprintf (dump_file, "\n");
830 }
831 }
832 }
833
834 /* Prune the list of prefetch candidates GROUPS using the reuse analysis. */
835
836 static void
837 prune_by_reuse (struct mem_ref_group *groups)
838 {
839 for (; groups; groups = groups->next)
840 prune_group_by_reuse (groups);
841 }
842
843 /* Returns true if we should issue prefetch for REF. */
844
845 static bool
846 should_issue_prefetch_p (struct mem_ref *ref)
847 {
848 /* For now do not issue prefetches for references that should only be
849 prefetched in the first few iterations. */
850 if (ref->prefetch_before != PREFETCH_ALL)
851 return false;
852
853 /* Do not prefetch nontemporal stores. */
854 if (ref->storent_p)
855 return false;
856
857 return true;
858 }
859
860 /* Decide which of the prefetch candidates in GROUPS to prefetch.
861 AHEAD is the number of iterations to prefetch ahead (which corresponds
862 to the number of simultaneous instances of one prefetch running at a
863 time). UNROLL_FACTOR is the factor by which the loop is going to be
864 unrolled. Returns true if there is anything to prefetch. */
865
866 static bool
867 schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
868 unsigned ahead)
869 {
870 unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
871 unsigned slots_per_prefetch;
872 struct mem_ref *ref;
873 bool any = false;
874
875 /* At most SIMULTANEOUS_PREFETCHES should be running at the same time. */
876 remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
877
878 /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
879 AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration,
880 it will need a prefetch slot. */
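  /* For example (illustrative values), AHEAD = 8 and UNROLL_FACTOR = 4 give
     (8 + 4 / 2) / 4 = 2 slots per prefetch instruction.  */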
881 slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
882 if (dump_file && (dump_flags & TDF_DETAILS))
883 fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
884 slots_per_prefetch);
885
886 /* For now we just take memory references one by one and issue
887 prefetches for as many as possible. The groups are sorted
888 starting with the largest step, since the references with
889 large step are more likely to cause many cache misses. */
890
891 for (; groups; groups = groups->next)
892 for (ref = groups->refs; ref; ref = ref->next)
893 {
894 if (!should_issue_prefetch_p (ref))
895 continue;
896
897 /* If we need to prefetch the reference each PREFETCH_MOD iterations,
898 and we unroll the loop UNROLL_FACTOR times, we need to insert
899 ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
900 iteration. */
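      /* E.g. (illustrative values) UNROLL_FACTOR = 8 and PREFETCH_MOD = 4
	 give (8 + 4 - 1) / 4 = 2 prefetch instructions per unrolled body.  */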
901 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
902 / ref->prefetch_mod);
903 prefetch_slots = n_prefetches * slots_per_prefetch;
904
905 /* If more than half of the prefetches would be lost anyway, do not
906 issue the prefetch. */
907 if (2 * remaining_prefetch_slots < prefetch_slots)
908 continue;
909
910 ref->issue_prefetch_p = true;
911
912 if (remaining_prefetch_slots <= prefetch_slots)
913 return true;
914 remaining_prefetch_slots -= prefetch_slots;
915 any = true;
916 }
917
918 return any;
919 }
920
921 /* Estimate the number of prefetches in the given GROUPS. */
922
923 static int
924 estimate_prefetch_count (struct mem_ref_group *groups)
925 {
926 struct mem_ref *ref;
927 int prefetch_count = 0;
928
929 for (; groups; groups = groups->next)
930 for (ref = groups->refs; ref; ref = ref->next)
931 if (should_issue_prefetch_p (ref))
932 prefetch_count++;
933
934 return prefetch_count;
935 }
936
937 /* Issue prefetches for the reference REF into the loop as decided before.
938 AHEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR
939 is the factor by which LOOP was unrolled. */
940
941 static void
942 issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
943 {
944 HOST_WIDE_INT delta;
945 tree addr, addr_base, write_p, local;
946 gimple prefetch;
947 gimple_stmt_iterator bsi;
948 unsigned n_prefetches, ap;
949 bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
950
951 if (dump_file && (dump_flags & TDF_DETAILS))
952 fprintf (dump_file, "Issued%s prefetch for %p.\n",
953 nontemporal ? " nontemporal" : "",
954 (void *) ref);
955
956 bsi = gsi_for_stmt (ref->stmt);
957
958 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
959 / ref->prefetch_mod);
960 addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
961 addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
962 true, NULL, true, GSI_SAME_STMT);
963 write_p = ref->write_p ? integer_one_node : integer_zero_node;
964 local = build_int_cst (integer_type_node, nontemporal ? 0 : 3);
965
966 for (ap = 0; ap < n_prefetches; ap++)
967 {
968 /* Determine the address to prefetch. */
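      /* E.g. with AHEAD = 4, PREFETCH_MOD = 4 and a group step of 16
	 (illustrative values), the prefetches issued for AP = 0 and AP = 1
	 target ADDR_BASE + 64 and ADDR_BASE + 128.  */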
969 delta = (ahead + ap * ref->prefetch_mod) * ref->group->step;
970 addr = fold_build2 (POINTER_PLUS_EXPR, ptr_type_node,
971 addr_base, size_int (delta));
972 addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL,
973 true, GSI_SAME_STMT);
974
975 /* Create the prefetch instruction. */
976 prefetch = gimple_build_call (built_in_decls[BUILT_IN_PREFETCH],
977 3, addr, write_p, local);
978 gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
979 }
980 }
981
982 /* Issue prefetches for the references in GROUPS into the loop as decided before.
983 AHEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR is the
984 factor by which LOOP was unrolled. */
985
986 static void
987 issue_prefetches (struct mem_ref_group *groups,
988 unsigned unroll_factor, unsigned ahead)
989 {
990 struct mem_ref *ref;
991
992 for (; groups; groups = groups->next)
993 for (ref = groups->refs; ref; ref = ref->next)
994 if (ref->issue_prefetch_p)
995 issue_prefetch_ref (ref, unroll_factor, ahead);
996 }
997
998 /* Returns true if REF is a memory write for which a nontemporal store insn
999 can be used. */
1000
1001 static bool
1002 nontemporal_store_p (struct mem_ref *ref)
1003 {
1004 enum machine_mode mode;
1005 enum insn_code code;
1006
1007 /* REF must be a write that is not reused. We require it to be independent
1008 of all other memory references in the loop, as the nontemporal stores may
1009 be reordered with respect to other memory references. */
1010 if (!ref->write_p
1011 || !ref->independent_p
1012 || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
1013 return false;
1014
1015 /* Check that we have the storent instruction for the mode. */
1016 mode = TYPE_MODE (TREE_TYPE (ref->mem));
1017 if (mode == BLKmode)
1018 return false;
1019
1020 code = optab_handler (storent_optab, mode)->insn_code;
1021 return code != CODE_FOR_nothing;
1022 }
1023
1024 /* If REF is a nontemporal store, we mark the corresponding modify statement
1025 and return true. Otherwise, we return false. */
1026
1027 static bool
1028 mark_nontemporal_store (struct mem_ref *ref)
1029 {
1030 if (!nontemporal_store_p (ref))
1031 return false;
1032
1033 if (dump_file && (dump_flags & TDF_DETAILS))
1034 fprintf (dump_file, "Marked reference %p as a nontemporal store.\n",
1035 (void *) ref);
1036
1037 gimple_assign_set_nontemporal_move (ref->stmt, true);
1038 ref->storent_p = true;
1039
1040 return true;
1041 }
1042
1043 /* Issue a memory fence instruction after LOOP. */
1044
1045 static void
1046 emit_mfence_after_loop (struct loop *loop)
1047 {
1048 VEC (edge, heap) *exits = get_loop_exit_edges (loop);
1049 edge exit;
1050 gimple call;
1051 gimple_stmt_iterator bsi;
1052 unsigned i;
1053
1054 for (i = 0; VEC_iterate (edge, exits, i, exit); i++)
1055 {
1056 call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
1057
1058 if (!single_pred_p (exit->dest)
1059 /* If possible, we prefer not to insert the fence on other paths
1060 in the CFG. */
1061 && !(exit->flags & EDGE_ABNORMAL))
1062 split_loop_exit_edge (exit);
1063 bsi = gsi_after_labels (exit->dest);
1064
1065 gsi_insert_before (&bsi, call, GSI_NEW_STMT);
1066 mark_virtual_ops_for_renaming (call);
1067 }
1068
1069 VEC_free (edge, heap, exits);
1070 update_ssa (TODO_update_ssa_only_virtuals);
1071 }
1072
1073 /* Returns true if we can use storent in LOOP, false otherwise. */
1074
1075 static bool
1076 may_use_storent_in_loop_p (struct loop *loop)
1077 {
1078 bool ret = true;
1079
1080 if (loop->inner != NULL)
1081 return false;
1082
1083 /* If we must issue an mfence insn after using storent, check that there
1084 is a suitable place for it at each of the loop exits. */
1085 if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
1086 {
1087 VEC (edge, heap) *exits = get_loop_exit_edges (loop);
1088 unsigned i;
1089 edge exit;
1090
1091 for (i = 0; VEC_iterate (edge, exits, i, exit); i++)
1092 if ((exit->flags & EDGE_ABNORMAL)
1093 && exit->dest == EXIT_BLOCK_PTR)
1094 ret = false;
1095
1096 VEC_free (edge, heap, exits);
1097 }
1098
1099 return ret;
1100 }
1101
1102 /* Marks nontemporal stores in LOOP. GROUPS contains the description of memory
1103 references in the loop. */
1104
1105 static void
1106 mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
1107 {
1108 struct mem_ref *ref;
1109 bool any = false;
1110
1111 if (!may_use_storent_in_loop_p (loop))
1112 return;
1113
1114 for (; groups; groups = groups->next)
1115 for (ref = groups->refs; ref; ref = ref->next)
1116 any |= mark_nontemporal_store (ref);
1117
1118 if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
1119 emit_mfence_after_loop (loop);
1120 }
1121
1122 /* Determines whether we can profitably unroll LOOP FACTOR times, and if
1123 this is the case, fills in DESC with the description of the number of
1124 iterations. */
1125
1126 static bool
1127 should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
1128 unsigned factor)
1129 {
1130 if (!can_unroll_loop_p (loop, factor, desc))
1131 return false;
1132
1133 /* We only consider loops without control flow for unrolling. This is not
1134 a hard restriction -- tree_unroll_loop works with arbitrary loops
1135 as well; but the unrolling/prefetching is usually more profitable for
1136 loops consisting of a single basic block, and we want to limit the
1137 code growth. */
1138 if (loop->num_nodes > 2)
1139 return false;
1140
1141 return true;
1142 }
1143
1144 /* Determine the factor by which to unroll LOOP, from the information
1145 contained in the list of memory references REFS. The description of the
1146 number of iterations of LOOP is stored to DESC. NINSNS is the number of
1147 insns of the LOOP. EST_NITER is the estimated number of iterations of
1148 the loop, or -1 if no estimate is available. */
1149
1150 static unsigned
1151 determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
1152 unsigned ninsns, struct tree_niter_desc *desc,
1153 HOST_WIDE_INT est_niter)
1154 {
1155 unsigned upper_bound;
1156 unsigned nfactor, factor, mod_constraint;
1157 struct mem_ref_group *agp;
1158 struct mem_ref *ref;
1159
1160 /* First check whether the loop is not too large to unroll. We ignore
1161 PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1162 from unrolling them enough to make exactly one cache line covered by each
1163 iteration. Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1164 us from unrolling the loops too many times in cases where we only expect
1165 gains from better scheduling and decreasing loop overhead, which is not
1166 the case here. */
1167 upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
1168
1169 /* If we unrolled the loop more times than it iterates, the unrolled version
1170 of the loop would never be entered. */
1171 if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
1172 upper_bound = est_niter;
1173
1174 if (upper_bound <= 1)
1175 return 1;
1176
1177 /* Choose the factor so that we may prefetch each cache line just once,
1178 but bound the unrolling by UPPER_BOUND. */
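  /* E.g. references with PREFETCH_MOD 4 and 16 lead to a factor of
     least_common_multiple (4, 16) = 16, provided this does not exceed
     UPPER_BOUND (illustrative values).  */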
1179 factor = 1;
1180 for (agp = refs; agp; agp = agp->next)
1181 for (ref = agp->refs; ref; ref = ref->next)
1182 if (should_issue_prefetch_p (ref))
1183 {
1184 mod_constraint = ref->prefetch_mod;
1185 nfactor = least_common_multiple (mod_constraint, factor);
1186 if (nfactor <= upper_bound)
1187 factor = nfactor;
1188 }
1189
1190 if (!should_unroll_loop_p (loop, desc, factor))
1191 return 1;
1192
1193 return factor;
1194 }
1195
1196 /* Returns the total volume of the memory references REFS, taking into account
1197 reuses in the innermost loop and cache line size. TODO -- we should also
1198 take into account reuses across the iterations of the loops in the loop
1199 nest. */
1200
1201 static unsigned
1202 volume_of_references (struct mem_ref_group *refs)
1203 {
1204 unsigned volume = 0;
1205 struct mem_ref_group *gr;
1206 struct mem_ref *ref;
1207
1208 for (gr = refs; gr; gr = gr->next)
1209 for (ref = gr->refs; ref; ref = ref->next)
1210 {
1211 /* Almost always reuses another value? */
1212 if (ref->prefetch_before != PREFETCH_ALL)
1213 continue;
1214
1215 /* If several iterations access the same cache line, use the size of
1216 the line divided by this number. Otherwise, a cache line is
1217 accessed in each iteration. TODO -- in the latter case, we should
1218 take the size of the reference into account, rounding it up to a
1219 multiple of the cache line size. */
1220 volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
1221 }
1222 return volume;
1223 }
1224
1225 /* Returns the volume of memory references accessed across VEC iterations of
1226 loops, whose sizes are described in the LOOP_SIZES array. N is the number
1227 of the loops in the nest (length of VEC and LOOP_SIZES vectors). */
1228
1229 static unsigned
1230 volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
1231 {
1232 unsigned i;
1233
1234 for (i = 0; i < n; i++)
1235 if (vec[i] != 0)
1236 break;
1237
1238 if (i == n)
1239 return 0;
1240
1241 gcc_assert (vec[i] > 0);
1242
1243 /* We ignore the parts of the distance vector in subloops, since usually
1244 the numbers of iterations are much smaller. */
1245 return loop_sizes[i] * vec[i];
1246 }
1247
1248 /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDES
1249 at the position corresponding to the loop of the step. N is the depth
1250 of the considered loop nest, and LOOP is its innermost loop. */
1251
1252 static void
1253 add_subscript_strides (tree access_fn, unsigned stride,
1254 HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
1255 {
1256 struct loop *aloop;
1257 tree step;
1258 HOST_WIDE_INT astep;
1259 unsigned min_depth = loop_depth (loop) - n;
1260
1261 while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
1262 {
1263 aloop = get_chrec_loop (access_fn);
1264 step = CHREC_RIGHT (access_fn);
1265 access_fn = CHREC_LEFT (access_fn);
1266
1267 if ((unsigned) loop_depth (aloop) <= min_depth)
1268 continue;
1269
1270 if (host_integerp (step, 0))
1271 astep = tree_low_cst (step, 0);
1272 else
1273 astep = L1_CACHE_LINE_SIZE;
1274
1275 strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
1276
1277 }
1278 }
1279
1280 /* Returns the volume of memory references accessed between two consecutive
1281 self-reuses of the reference DR. We consider the subscripts of DR in N
1282 loops, and LOOP_SIZES contains the volumes of accesses in each of the
1283 loops. LOOP is the innermost loop of the current loop nest. */
1284
1285 static unsigned
1286 self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1287 struct loop *loop)
1288 {
1289 tree stride, access_fn;
1290 HOST_WIDE_INT *strides, astride;
1291 VEC (tree, heap) *access_fns;
1292 tree ref = DR_REF (dr);
1293 unsigned i, ret = ~0u;
1294
1295 /* In the following example:
1296
1297 for (i = 0; i < N; i++)
1298 for (j = 0; j < N; j++)
1299 use (a[j][i]);
1300 the same cache line is accessed each N steps (except if the change from
1301 i to i + 1 crosses the boundary of the cache line). Thus, for self-reuse,
1302 we cannot rely purely on the results of the data dependence analysis.
1303
1304 Instead, we compute the stride of the reference in each loop, and consider
1305 the innermost loop in which the stride is less than the cache size. */
1306
1307 strides = XCNEWVEC (HOST_WIDE_INT, n);
1308 access_fns = DR_ACCESS_FNS (dr);
1309
1310 for (i = 0; VEC_iterate (tree, access_fns, i, access_fn); i++)
1311 {
1312 /* Keep track of the reference corresponding to the subscript, so that we
1313 know its stride. */
1314 while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1315 ref = TREE_OPERAND (ref, 0);
1316
1317 if (TREE_CODE (ref) == ARRAY_REF)
1318 {
1319 stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1320 if (host_integerp (stride, 1))
1321 astride = tree_low_cst (stride, 1);
1322 else
1323 astride = L1_CACHE_LINE_SIZE;
1324
1325 ref = TREE_OPERAND (ref, 0);
1326 }
1327 else
1328 astride = 1;
1329
1330 add_subscript_strides (access_fn, astride, strides, n, loop);
1331 }
1332
1333 for (i = n; i-- > 0; )
1334 {
1335 unsigned HOST_WIDE_INT s;
1336
1337 s = strides[i] < 0 ? -strides[i] : strides[i];
1338
1339 if (s < (unsigned) L1_CACHE_LINE_SIZE
1340 && (loop_sizes[i]
1341 > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
1342 {
1343 ret = loop_sizes[i];
1344 break;
1345 }
1346 }
1347
1348 free (strides);
1349 return ret;
1350 }
1351
1352 /* Determines the distance until the first reuse of each reference in REFS
1353 in the loop nest of LOOP. NO_OTHER_REFS is true if there are no other
1354 memory references in the loop. */
1355
1356 static void
1357 determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
1358 bool no_other_refs)
1359 {
1360 struct loop *nest, *aloop;
1361 VEC (data_reference_p, heap) *datarefs = NULL;
1362 VEC (ddr_p, heap) *dependences = NULL;
1363 struct mem_ref_group *gr;
1364 struct mem_ref *ref, *refb;
1365 VEC (loop_p, heap) *vloops = NULL;
1366 unsigned *loop_data_size;
1367 unsigned i, j, n;
1368 unsigned volume, dist, adist;
1369 HOST_WIDE_INT vol;
1370 data_reference_p dr;
1371 ddr_p dep;
1372
1373 if (loop->inner)
1374 return;
1375
1376 /* Find the outermost loop of the loop nest of loop (we require that
1377 there are no sibling loops inside the nest). */
1378 nest = loop;
1379 while (1)
1380 {
1381 aloop = loop_outer (nest);
1382
1383 if (aloop == current_loops->tree_root
1384 || aloop->inner->next)
1385 break;
1386
1387 nest = aloop;
1388 }
1389
1390 /* For each loop, determine the amount of data accessed in each iteration.
1391 We use this to estimate whether the reference is evicted from the
1392 cache before its reuse. */
1393 find_loop_nest (nest, &vloops);
1394 n = VEC_length (loop_p, vloops);
1395 loop_data_size = XNEWVEC (unsigned, n);
1396 volume = volume_of_references (refs);
1397 i = n;
1398 while (i-- != 0)
1399 {
1400 loop_data_size[i] = volume;
1401 /* Bound the volume by the L2 cache size, since above this bound,
1402 all dependence distances are equivalent. */
1403 if (volume > L2_CACHE_SIZE_BYTES)
1404 continue;
1405
1406 aloop = VEC_index (loop_p, vloops, i);
1407 vol = estimated_loop_iterations_int (aloop, false);
1408 if (vol < 0)
1409 vol = expected_loop_iterations (aloop);
1410 volume *= vol;
1411 }
1412
1413 /* Prepare the references in the form suitable for data dependence
1414 analysis. We ignore unanalyzable data references (the results
1415 are used just as a heuristic to estimate the temporality of the
1416 references, hence we do not need to worry about correctness). */
1417 for (gr = refs; gr; gr = gr->next)
1418 for (ref = gr->refs; ref; ref = ref->next)
1419 {
1420 dr = create_data_ref (nest, ref->mem, ref->stmt, !ref->write_p);
1421
1422 if (dr)
1423 {
1424 ref->reuse_distance = volume;
1425 dr->aux = ref;
1426 VEC_safe_push (data_reference_p, heap, datarefs, dr);
1427 }
1428 else
1429 no_other_refs = false;
1430 }
1431
1432 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
1433 {
1434 dist = self_reuse_distance (dr, loop_data_size, n, loop);
1435 ref = (struct mem_ref *) dr->aux;
1436 if (ref->reuse_distance > dist)
1437 ref->reuse_distance = dist;
1438
1439 if (no_other_refs)
1440 ref->independent_p = true;
1441 }
1442
1443 compute_all_dependences (datarefs, &dependences, vloops, true);
1444
1445 for (i = 0; VEC_iterate (ddr_p, dependences, i, dep); i++)
1446 {
1447 if (DDR_ARE_DEPENDENT (dep) == chrec_known)
1448 continue;
1449
1450 ref = (struct mem_ref *) DDR_A (dep)->aux;
1451 refb = (struct mem_ref *) DDR_B (dep)->aux;
1452
1453 if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1454 || DDR_NUM_DIST_VECTS (dep) == 0)
1455 {
1456 /* If the dependence cannot be analyzed, assume that there might be
1457 a reuse. */
1458 dist = 0;
1459
1460 ref->independent_p = false;
1461 refb->independent_p = false;
1462 }
1463 else
1464 {
1465 /* The distance vectors are normalized to be always lexicographically
1466 positive, hence we cannot tell just from them whether DDR_A comes
1467 before DDR_B or vice versa. However, it is not important,
1468 anyway -- if DDR_A is close to DDR_B, then it is either reused in
1469 DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
1470 in cache (and marking it as nontemporal would not affect
1471 anything). */
1472
1473 dist = volume;
1474 for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
1475 {
1476 adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
1477 loop_data_size, n);
1478
1479 /* If this is a dependence in the innermost loop (i.e., the
1480 distances in all superloops are zero) and it is not
1481 the trivial self-dependence with distance zero, record that
1482 the references are not completely independent. */
1483 if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1484 && (ref != refb
1485 || DDR_DIST_VECT (dep, j)[n-1] != 0))
1486 {
1487 ref->independent_p = false;
1488 refb->independent_p = false;
1489 }
1490
1491 /* Ignore accesses closer than
1492 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1493 so that we use nontemporal prefetches e.g. if single memory
1494 location is accessed several times in a single iteration of
1495 the loop. */
1496 if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
1497 continue;
1498
1499 if (adist < dist)
1500 dist = adist;
1501 }
1502 }
1503
1504 if (ref->reuse_distance > dist)
1505 ref->reuse_distance = dist;
1506 if (refb->reuse_distance > dist)
1507 refb->reuse_distance = dist;
1508 }
1509
1510 free_dependence_relations (dependences);
1511 free_data_refs (datarefs);
1512 free (loop_data_size);
1513
1514 if (dump_file && (dump_flags & TDF_DETAILS))
1515 {
1516 fprintf (dump_file, "Reuse distances:\n");
1517 for (gr = refs; gr; gr = gr->next)
1518 for (ref = gr->refs; ref; ref = ref->next)
1519 fprintf (dump_file, " ref %p distance %u\n",
1520 (void *) ref, ref->reuse_distance);
1521 }
1522 }
1523
1524 /* Do a cost-benefit analysis to determine if prefetching is profitable
1525 for the current loop given the following parameters:
1526 AHEAD: the iteration ahead distance,
1527 EST_NITER: the estimated trip count,
1528 NINSNS: estimated number of instructions in the loop,
1529 PREFETCH_COUNT: an estimate of the number of prefetches,
1530 MEM_REF_COUNT: total number of memory references in the loop, UNROLL_FACTOR: the factor by which the loop is planned to be unrolled. */
1531
1532 static bool
1533 is_loop_prefetching_profitable (unsigned ahead, HOST_WIDE_INT est_niter,
1534 unsigned ninsns, unsigned prefetch_count,
1535 unsigned mem_ref_count, unsigned unroll_factor)
1536 {
1537 int insn_to_mem_ratio, insn_to_prefetch_ratio;
1538
1539 if (mem_ref_count == 0)
1540 return false;
1541
1542 /* Prefetching improves performance by overlapping cache-missing
1543 memory accesses with CPU operations. If the loop does not have
1544 enough CPU operations to overlap with memory operations, prefetching
1545 won't give a significant benefit. One approximate way of checking
1546 this is to require the ratio of instructions to memory references to
1547 be above a certain limit. This approximation works well in practice.
1548 TODO: Implement a more precise computation by estimating the time
1549 for each CPU or memory op in the loop. Time estimates for memory ops
1550 should account for cache misses. */
1551 insn_to_mem_ratio = ninsns / mem_ref_count;
1552
1553 if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
1554 {
1555 if (dump_file && (dump_flags & TDF_DETAILS))
1556 fprintf (dump_file,
1557 "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1558 insn_to_mem_ratio);
1559 return false;
1560 }
1561
1562 /* Profitability of prefetching is highly dependent on the trip count.
1563 For a given AHEAD distance, the first AHEAD iterations do not benefit
1564 from prefetching, and the last AHEAD iterations execute useless
1565 prefetches. So, if the trip count is not large enough relative to AHEAD,
1566 prefetching may cause serious performance degradation. To avoid this
1567 problem when the trip count is not known at compile time, we
1568 conservatively skip loops with high prefetching costs. For now, only
1569 the I-cache cost is considered. The relative I-cache cost is estimated
1570 by taking the ratio between the number of prefetches and the total
1571 number of instructions. Since we are using integer arithmetic, we
1572 compute the reciprocal of this ratio.
1573 (unroll_factor * ninsns) is used to estimate the number of instructions in
1574 the unrolled loop. This implementation is a bit simplistic -- the number
1575 of issued prefetch instructions is also affected by unrolling. So,
1576 prefetch_mod and the unroll factor should be taken into account when
1577 determining prefetch_count. Also, the number of insns of the unrolled
1578 loop will usually be significantly smaller than the number of insns of the
1579 original loop * unroll_factor (at least the induction variable increases
1580 and the exit branches will get eliminated), so it might be better to use
1581 tree_estimate_loop_size + estimated_unrolled_size. */
1582 if (est_niter < 0)
1583 {
1584 insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
1585 return insn_to_prefetch_ratio >= MIN_INSN_TO_PREFETCH_RATIO;
1586 }
1587
1588 if (est_niter <= (HOST_WIDE_INT) ahead)
1589 {
1590 if (dump_file && (dump_flags & TDF_DETAILS))
1591 fprintf (dump_file,
1592 "Not prefetching -- loop estimated to roll only %d times\n",
1593 (int) est_niter);
1594 return false;
1595 }
1596 return true;
1597 }
1598
1599
1600 /* Issue prefetch instructions for array references in LOOP. Returns
1601 true if the LOOP was unrolled. */
1602
1603 static bool
1604 loop_prefetch_arrays (struct loop *loop)
1605 {
1606 struct mem_ref_group *refs;
1607 unsigned ahead, ninsns, time, unroll_factor;
1608 HOST_WIDE_INT est_niter;
1609 struct tree_niter_desc desc;
1610 bool unrolled = false, no_other_refs;
1611 unsigned prefetch_count;
1612 unsigned mem_ref_count;
1613
1614 if (optimize_loop_nest_for_size_p (loop))
1615 {
1616 if (dump_file && (dump_flags & TDF_DETAILS))
1617 fprintf (dump_file, " ignored (cold area)\n");
1618 return false;
1619 }
1620
1621 /* Step 1: gather the memory references. */
1622 refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
1623
1624 /* Step 2: estimate the reuse effects. */
1625 prune_by_reuse (refs);
1626
1627 prefetch_count = estimate_prefetch_count (refs);
1628 if (prefetch_count == 0)
1629 goto fail;
1630
1631 determine_loop_nest_reuse (loop, refs, no_other_refs);
1632
1633 /* Step 3: determine the ahead and unroll factor. */
1634
1635 /* FIXME: the time should be weighted by the probabilities of the blocks in
1636 the loop body. */
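  /* AHEAD is roughly ceil (PREFETCH_LATENCY / time): e.g. a prefetch latency
     of 200 and a loop body costing 50 time units give AHEAD = 4
     (illustrative values; both quantities are target and loop dependent).  */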
1637 time = tree_num_loop_insns (loop, &eni_time_weights);
1638 ahead = (PREFETCH_LATENCY + time - 1) / time;
1639 est_niter = estimated_loop_iterations_int (loop, false);
1640
1641 ninsns = tree_num_loop_insns (loop, &eni_size_weights);
1642 unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
1643 est_niter);
1644 if (dump_file && (dump_flags & TDF_DETAILS))
1645 fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
1646 HOST_WIDE_INT_PRINT_DEC "\n"
1647 "insn count %d, mem ref count %d, prefetch count %d\n",
1648 ahead, unroll_factor, est_niter,
1649 ninsns, mem_ref_count, prefetch_count);
1650
1651 if (!is_loop_prefetching_profitable (ahead, est_niter, ninsns, prefetch_count,
1652 mem_ref_count, unroll_factor))
1653 goto fail;
1654
1655 mark_nontemporal_stores (loop, refs);
1656
1657 /* Step 4: what to prefetch? */
1658 if (!schedule_prefetches (refs, unroll_factor, ahead))
1659 goto fail;
1660
1661 /* Step 5: unroll the loop. TODO -- peeling of first and last few
1662 iterations so that we do not issue superfluous prefetches. */
1663 if (unroll_factor != 1)
1664 {
1665 tree_unroll_loop (loop, unroll_factor,
1666 single_dom_exit (loop), &desc);
1667 unrolled = true;
1668 }
1669
1670 /* Step 6: issue the prefetches. */
1671 issue_prefetches (refs, unroll_factor, ahead);
1672
1673 fail:
1674 release_mem_refs (refs);
1675 return unrolled;
1676 }
1677
1678 /* Issue prefetch instructions for array references in loops. */
1679
1680 unsigned int
1681 tree_ssa_prefetch_arrays (void)
1682 {
1683 loop_iterator li;
1684 struct loop *loop;
1685 bool unrolled = false;
1686 int todo_flags = 0;
1687
1688 if (!HAVE_prefetch
1689 /* It is possible to ask compiler for say -mtune=i486 -march=pentium4.
1690 -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
1691 of processor costs and i486 does not have prefetch, but
1692 -march=pentium4 causes HAVE_prefetch to be true. Ugh. */
1693 || PREFETCH_BLOCK == 0)
1694 return 0;
1695
1696 if (dump_file && (dump_flags & TDF_DETAILS))
1697 {
1698 fprintf (dump_file, "Prefetching parameters:\n");
1699 fprintf (dump_file, " simultaneous prefetches: %d\n",
1700 SIMULTANEOUS_PREFETCHES);
1701 fprintf (dump_file, " prefetch latency: %d\n", PREFETCH_LATENCY);
1702 fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK);
1703 fprintf (dump_file, " L1 cache size: %d lines, %d kB\n",
1704 L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
1705 fprintf (dump_file, " L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
1706 fprintf (dump_file, " L2 cache size: %d kB\n", L2_CACHE_SIZE);
1707 fprintf (dump_file, " min insn-to-prefetch ratio: %d \n",
1708 MIN_INSN_TO_PREFETCH_RATIO);
1709 fprintf (dump_file, " min insn-to-mem ratio: %d \n",
1710 PREFETCH_MIN_INSN_TO_MEM_RATIO);
1711 fprintf (dump_file, "\n");
1712 }
1713
1714 initialize_original_copy_tables ();
1715
1716 if (!built_in_decls[BUILT_IN_PREFETCH])
1717 {
1718 tree type = build_function_type (void_type_node,
1719 tree_cons (NULL_TREE,
1720 const_ptr_type_node,
1721 NULL_TREE));
1722 tree decl = add_builtin_function ("__builtin_prefetch", type,
1723 BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
1724 NULL, NULL_TREE);
1725 DECL_IS_NOVOPS (decl) = true;
1726 built_in_decls[BUILT_IN_PREFETCH] = decl;
1727 }
1728
1729 /* We assume that the size of a cache line is a power of two, so verify this
1730 here. */
1731 gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);
1732
1733 FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
1734 {
1735 if (dump_file && (dump_flags & TDF_DETAILS))
1736 fprintf (dump_file, "Processing loop %d:\n", loop->num);
1737
1738 unrolled |= loop_prefetch_arrays (loop);
1739
1740 if (dump_file && (dump_flags & TDF_DETAILS))
1741 fprintf (dump_file, "\n\n");
1742 }
1743
1744 if (unrolled)
1745 {
1746 scev_reset ();
1747 todo_flags |= TODO_cleanup_cfg;
1748 }
1749
1750 free_original_copy_tables ();
1751 return todo_flags;
1752 }