gcc/tree-ssa-loop-prefetch.c
1 /* Array prefetching.
2 Copyright (C) 2005, 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tm.h"
24 #include "tree.h"
25 #include "tm_p.h"
26 #include "basic-block.h"
27 #include "output.h"
28 #include "diagnostic.h"
29 #include "tree-pretty-print.h"
30 #include "tree-flow.h"
31 #include "tree-dump.h"
32 #include "timevar.h"
33 #include "cfgloop.h"
34 #include "expr.h"
35 #include "tree-pass.h"
36 #include "insn-config.h"
37 #include "recog.h"
38 #include "hashtab.h"
39 #include "tree-chrec.h"
40 #include "tree-scalar-evolution.h"
41 #include "toplev.h"
42 #include "params.h"
43 #include "langhooks.h"
44 #include "tree-inline.h"
45 #include "tree-data-ref.h"
46 #include "optabs.h"
47
48 /* This pass inserts prefetch instructions to optimize cache usage during
49 accesses to arrays in loops. It processes loops sequentially and:
50
51 1) Gathers all memory references in the single loop.
52 2) For each of the references it decides when it is profitable to prefetch
53 it. To do it, we evaluate the reuse among the accesses, and determine
54 two values: PREFETCH_BEFORE (meaning that it only makes sense to do
55 prefetching in the first PREFETCH_BEFORE iterations of the loop) and
56 PREFETCH_MOD (meaning that it only makes sense to prefetch in the
57 iterations of the loop that are zero modulo PREFETCH_MOD). For example
58 (assuming cache line size is 64 bytes, char has size 1 byte and there
59 is no hardware sequential prefetch):
60
61 char *a;
62 for (i = 0; i < max; i++)
63 {
64 a[255] = ...; (0)
65 a[i] = ...; (1)
66 a[i + 64] = ...; (2)
67 a[16*i] = ...; (3)
68 a[187*i] = ...; (4)
69 a[187*i + 50] = ...; (5)
70 }
71
72 (0) obviously has PREFETCH_BEFORE 1
73 (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory
74 location 64 iterations before it, and PREFETCH_MOD 64 (since
75 it hits the same cache line otherwise).
76 (2) has PREFETCH_MOD 64
77 (3) has PREFETCH_MOD 4
78 (4) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since
79 the cache line accessed by (4) is the same with probability only
80 7/32.
81 (5) has PREFETCH_MOD 1 as well.
82
83 Additionally, we use data dependence analysis to determine for each
84 reference the distance till the first reuse; this information is used
85 to determine the temporality of the issued prefetch instruction.
86
87 3) We determine how much ahead we need to prefetch. The number of
88 iterations needed is time to fetch / time spent in one iteration of
89 the loop. The problem is that we do not know either of these values,
90 so we just make a heuristic guess based on a magic (possibly)
91 target-specific constant and size of the loop.
92
93 4) Determine which of the references we prefetch. We take into account
94 that there is a maximum number of simultaneous prefetches (provided
95 by the machine description). We issue as many prefetches as possible
96 while still within this bound (starting with those with lowest
97 prefetch_mod, since they are responsible for most of the cache
98 misses).
99
100 5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD
101 and PREFETCH_BEFORE requirements (within some bounds), and to avoid
102 prefetching nonaccessed memory.
103 TODO -- actually implement peeling.
104
105 6) We actually emit the prefetch instructions. ??? Perhaps emit the
106 prefetch instructions with guards in cases where 5) was not sufficient
107 to satisfy the constraints?
108
109 The function is_loop_prefetching_profitable() implements a cost model
110 to determine if prefetching is profitable for a given loop. The cost
111 model has two heuristics:
112 1. A heuristic that determines whether the given loop has enough CPU
113 ops that can be overlapped with cache missing memory ops.
114 If not, the loop won't benefit from prefetching. This is implemented
115 by requiring the ratio between the instruction count and the mem ref
116 count to be above a certain minimum.
117 2. A heuristic that disables prefetching in a loop with an unknown trip
118 count if the prefetching cost is above a certain limit. The relative
119 prefetching cost is estimated by taking the ratio between the
120 prefetch count and the total instruction count (this models the I-cache
121 cost).
122 The limits used in these heuristics are defined as parameters with
123 reasonable default values. Machine-specific default values will be
124 added later.
125
126 Some other TODO:
127 -- write and use more general reuse analysis (that could be also used
128 in other cache aimed loop optimizations)
129 -- make it behave sanely together with the prefetches given by the user
130 (now we just ignore them; at the very least we should avoid
131 optimizing loops in which the user put his own prefetches)
132 -- we assume cache line size alignment of arrays; this could be
133 improved. */
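
/* For illustration only (this block is not part of the pass): the kind of
   transformation the pass aims for, sketched in plain C.  The ahead distance
   of 32 elements below is a made-up value; in the pass it is derived from
   PREFETCH_LATENCY and the size of the loop body, and the prefetches are
   emitted as __builtin_prefetch (address, write_flag, locality) calls by
   issue_prefetch_ref.  A loop such as

     void
     example_plain (double *a, const double *b, int n)
     {
       int i;
       for (i = 0; i < n; i++)
         a[i] += b[i];
     }

   conceptually becomes

     void
     example_prefetched (double *a, const double *b, int n)
     {
       int i;
       for (i = 0; i < n; i++)
         {
           __builtin_prefetch (&a[i + 32], 1, 3);
           __builtin_prefetch (&b[i + 32], 0, 3);
           a[i] += b[i];
         }
     }

   except that the loop is also unrolled so that each cache line is
   prefetched only once.  */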
134
135 /* Magic constants follow. These should be replaced by machine specific
136 numbers. */
137
138 /* True if write can be prefetched by a read prefetch. */
139
140 #ifndef WRITE_CAN_USE_READ_PREFETCH
141 #define WRITE_CAN_USE_READ_PREFETCH 1
142 #endif
143
144 /* True if read can be prefetched by a write prefetch. */
145
146 #ifndef READ_CAN_USE_WRITE_PREFETCH
147 #define READ_CAN_USE_WRITE_PREFETCH 0
148 #endif
149
150 /* The size of the block loaded by a single prefetch. Usually, this is
151 the same as cache line size (at the moment, we only consider one level
152 of cache hierarchy). */
153
154 #ifndef PREFETCH_BLOCK
155 #define PREFETCH_BLOCK L1_CACHE_LINE_SIZE
156 #endif
157
158 /* Do we have forward hardware sequential prefetching? */
159
160 #ifndef HAVE_FORWARD_PREFETCH
161 #define HAVE_FORWARD_PREFETCH 0
162 #endif
163
164 /* Do we have backward hardware sequential prefetching? */
165
166 #ifndef HAVE_BACKWARD_PREFETCH
167 #define HAVE_BACKWARD_PREFETCH 0
168 #endif
169
170 /* In some cases we are only able to determine that there is a certain
171 probability that the two accesses hit the same cache line. In this
172 case, we issue the prefetches for both of them if this probability
173 is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand. */
174
175 #ifndef ACCEPTABLE_MISS_RATE
176 #define ACCEPTABLE_MISS_RATE 50
177 #endif
178
179 #ifndef HAVE_prefetch
180 #define HAVE_prefetch 0
181 #endif
182
183 #define L1_CACHE_SIZE_BYTES ((unsigned) (L1_CACHE_SIZE * 1024))
184 #define L2_CACHE_SIZE_BYTES ((unsigned) (L2_CACHE_SIZE * 1024))
185
186 /* We consider a memory access nontemporal if it is not reused sooner than
187 after L2_CACHE_SIZE_BYTES of memory are accessed. However, we ignore
188 accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
189 so that we use nontemporal prefetches e.g. if a single memory location
190 is accessed several times in a single iteration of the loop. */
191 #define NONTEMPORAL_FRACTION 16
192
193 /* In case we have to emit a memory fence instruction after the loop that
194 uses nontemporal stores, this defines the builtin to use. */
195
196 #ifndef FENCE_FOLLOWING_MOVNT
197 #define FENCE_FOLLOWING_MOVNT NULL_TREE
198 #endif
199
200 /* It is not profitable to prefetch when the trip count is not at
201 least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance.
202 For example, in a loop with a prefetch ahead distance of 10,
203 supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is
204 profitable to prefetch when the trip count is greater than or equal to
205 40. In that case, 30 out of the 40 iterations will benefit from
206 prefetching. */
207
208 #ifndef TRIP_COUNT_TO_AHEAD_RATIO
209 #define TRIP_COUNT_TO_AHEAD_RATIO 4
210 #endif
211
212 /* A group of references between which reuse may occur. */
213
214 struct mem_ref_group
215 {
216 tree base; /* Base of the reference. */
217 tree step; /* Step of the reference. */
218 struct mem_ref *refs; /* References in the group. */
219 struct mem_ref_group *next; /* Next group of references. */
220 };
221
222 /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */
223
224 #define PREFETCH_ALL (~(unsigned HOST_WIDE_INT) 0)
225
226 /* Do not generate a prefetch if the unroll factor is significantly less
227 than what is required by the prefetch. This is to avoid redundant
228 prefetches. For example, if prefetch_mod is 16 and unroll_factor is
229 1, this means prefetching requires unrolling the loop 16 times, but
230 the loop is not going to be unrolled. In this case (ratio = 16),
231 prefetching is not likely to be beneficial. */
232
233 #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO
234 #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 8
235 #endif
236
237 /* The memory reference. */
238
239 struct mem_ref
240 {
241 gimple stmt; /* Statement in which the reference appears. */
242 tree mem; /* The reference. */
243 HOST_WIDE_INT delta; /* Constant offset of the reference. */
244 struct mem_ref_group *group; /* The group of references it belongs to. */
245 unsigned HOST_WIDE_INT prefetch_mod;
246 /* Prefetch only each PREFETCH_MOD-th
247 iteration. */
248 unsigned HOST_WIDE_INT prefetch_before;
249 /* Prefetch only first PREFETCH_BEFORE
250 iterations. */
251 unsigned reuse_distance; /* The amount of data accessed before the first
252 reuse of this value. */
253 struct mem_ref *next; /* The next reference in the group. */
254 unsigned write_p : 1; /* Is it a write? */
255 unsigned independent_p : 1; /* True if the reference is independent of
256 all other references inside the loop. */
257 unsigned issue_prefetch_p : 1; /* Should we really issue the prefetch? */
258 unsigned storent_p : 1; /* True if we changed the store to a
259 nontemporal one. */
260 };
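
/* For example, for the accesses a[i] and a[i + 64] from the overview at the
   top of the file (char elements), gather_memory_references builds a single
   mem_ref_group with base `a' and step 1, containing two mem_ref entries
   with delta 0 and delta 64; prune_by_reuse later sets prefetch_mod to 64
   for both and prefetch_before to 64 for the first one, as stated in the
   overview.  */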
261
262 /* Dumps information about reference REF to FILE. */
263
264 static void
265 dump_mem_ref (FILE *file, struct mem_ref *ref)
266 {
267 fprintf (file, "Reference %p:\n", (void *) ref);
268
269 fprintf (file, " group %p (base ", (void *) ref->group);
270 print_generic_expr (file, ref->group->base, TDF_SLIM);
271 fprintf (file, ", step ");
272 if (cst_and_fits_in_hwi (ref->group->step))
273 fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (ref->group->step));
274 else
275 print_generic_expr (file, ref->group->step, TDF_TREE);
276 fprintf (file, ")\n");
277
278 fprintf (file, " delta ");
279 fprintf (file, HOST_WIDE_INT_PRINT_DEC, ref->delta);
280 fprintf (file, "\n");
281
282 fprintf (file, " %s\n", ref->write_p ? "write" : "read");
283
284 fprintf (file, "\n");
285 }
286
287 /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not
288 exist. */
289
290 static struct mem_ref_group *
291 find_or_create_group (struct mem_ref_group **groups, tree base, tree step)
292 {
293 struct mem_ref_group *group;
294
295 for (; *groups; groups = &(*groups)->next)
296 {
297 if (operand_equal_p ((*groups)->step, step, 0)
298 && operand_equal_p ((*groups)->base, base, 0))
299 return *groups;
300
301 /* If step is an integer constant, keep the list of groups sorted
302 by decreasing step. */
303 if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step)
304 && int_cst_value ((*groups)->step) < int_cst_value (step))
305 break;
306 }
307
308 group = XNEW (struct mem_ref_group);
309 group->base = base;
310 group->step = step;
311 group->refs = NULL;
312 group->next = *groups;
313 *groups = group;
314
315 return group;
316 }
317
318 /* Records a memory reference MEM in GROUP with offset DELTA and write status
319 WRITE_P. The reference occurs in statement STMT. */
320
321 static void
322 record_ref (struct mem_ref_group *group, gimple stmt, tree mem,
323 HOST_WIDE_INT delta, bool write_p)
324 {
325 struct mem_ref **aref;
326
327 /* Do not record the same address twice. */
328 for (aref = &group->refs; *aref; aref = &(*aref)->next)
329 {
330 /* On some targets a write reference cannot reuse a read prefetch, or
331 vice versa. */
332 if (!WRITE_CAN_USE_READ_PREFETCH
333 && write_p
334 && !(*aref)->write_p)
335 continue;
336 if (!READ_CAN_USE_WRITE_PREFETCH
337 && !write_p
338 && (*aref)->write_p)
339 continue;
340
341 if ((*aref)->delta == delta)
342 return;
343 }
344
345 (*aref) = XNEW (struct mem_ref);
346 (*aref)->stmt = stmt;
347 (*aref)->mem = mem;
348 (*aref)->delta = delta;
349 (*aref)->write_p = write_p;
350 (*aref)->prefetch_before = PREFETCH_ALL;
351 (*aref)->prefetch_mod = 1;
352 (*aref)->reuse_distance = 0;
353 (*aref)->issue_prefetch_p = false;
354 (*aref)->group = group;
355 (*aref)->next = NULL;
356 (*aref)->independent_p = false;
357 (*aref)->storent_p = false;
358
359 if (dump_file && (dump_flags & TDF_DETAILS))
360 dump_mem_ref (dump_file, *aref);
361 }
362
363 /* Release memory references in GROUPS. */
364
365 static void
366 release_mem_refs (struct mem_ref_group *groups)
367 {
368 struct mem_ref_group *next_g;
369 struct mem_ref *ref, *next_r;
370
371 for (; groups; groups = next_g)
372 {
373 next_g = groups->next;
374 for (ref = groups->refs; ref; ref = next_r)
375 {
376 next_r = ref->next;
377 free (ref);
378 }
379 free (groups);
380 }
381 }
382
383 /* A structure used to pass arguments to idx_analyze_ref. */
384
385 struct ar_data
386 {
387 struct loop *loop; /* Loop of the reference. */
388 gimple stmt; /* Statement of the reference. */
389 tree *step; /* Step of the memory reference. */
390 HOST_WIDE_INT *delta; /* Offset of the memory reference. */
391 };
392
393 /* Analyzes a single INDEX of a memory reference to obtain information
394 described at analyze_ref. Callback for for_each_index. */
395
396 static bool
397 idx_analyze_ref (tree base, tree *index, void *data)
398 {
399 struct ar_data *ar_data = (struct ar_data *) data;
400 tree ibase, step, stepsize;
401 HOST_WIDE_INT idelta = 0, imult = 1;
402 affine_iv iv;
403
404 if (TREE_CODE (base) == MISALIGNED_INDIRECT_REF
405 || TREE_CODE (base) == ALIGN_INDIRECT_REF)
406 return false;
407
408 if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt),
409 *index, &iv, true))
410 return false;
411 ibase = iv.base;
412 step = iv.step;
413
414 if (TREE_CODE (ibase) == POINTER_PLUS_EXPR
415 && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1)))
416 {
417 idelta = int_cst_value (TREE_OPERAND (ibase, 1));
418 ibase = TREE_OPERAND (ibase, 0);
419 }
420 if (cst_and_fits_in_hwi (ibase))
421 {
422 idelta += int_cst_value (ibase);
423 ibase = build_int_cst (TREE_TYPE (ibase), 0);
424 }
425
426 if (*ar_data->step == NULL_TREE)
427 *ar_data->step = step;
428 else
429 *ar_data->step = fold_build2 (PLUS_EXPR, sizetype,
430 fold_convert (sizetype, *ar_data->step),
431 fold_convert (sizetype, step));
432 if (TREE_CODE (base) == ARRAY_REF)
433 {
434 stepsize = array_ref_element_size (base);
435 if (!cst_and_fits_in_hwi (stepsize))
436 return false;
437 imult = int_cst_value (stepsize);
438
439 *ar_data->step = fold_build2 (MULT_EXPR, sizetype,
440 fold_convert (sizetype, *ar_data->step),
441 fold_convert (sizetype, step));
442 idelta *= imult;
443 }
444
445 *ar_data->delta += idelta;
446 *index = ibase;
447
448 return true;
449 }
450
451 /* Tries to express REF_P in the form &BASE + STEP * iter + DELTA, where DELTA
452 and STEP are integer constants and iter is the number of iterations of LOOP. The
453 reference occurs in statement STMT. Strips nonaddressable component
454 references from REF_P. */
455
456 static bool
457 analyze_ref (struct loop *loop, tree *ref_p, tree *base,
458 tree *step, HOST_WIDE_INT *delta,
459 gimple stmt)
460 {
461 struct ar_data ar_data;
462 tree off;
463 HOST_WIDE_INT bit_offset;
464 tree ref = *ref_p;
465
466 *step = NULL_TREE;
467 *delta = 0;
468
469 /* First strip off the component references. Ignore bitfields. */
470 if (TREE_CODE (ref) == COMPONENT_REF
471 && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))
472 ref = TREE_OPERAND (ref, 0);
473
474 *ref_p = ref;
475
476 for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0))
477 {
478 off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1));
479 bit_offset = TREE_INT_CST_LOW (off);
480 gcc_assert (bit_offset % BITS_PER_UNIT == 0);
481
482 *delta += bit_offset / BITS_PER_UNIT;
483 }
484
485 *base = unshare_expr (ref);
486 ar_data.loop = loop;
487 ar_data.stmt = stmt;
488 ar_data.step = step;
489 ar_data.delta = delta;
490 return for_each_index (base, idx_analyze_ref, &ar_data);
491 }
492
493 /* Record a memory reference REF to the list REFS. The reference occurs in
494 LOOP in statement STMT and it is write if WRITE_P. Returns true if the
495 reference was recorded, false otherwise. */
496
497 static bool
498 gather_memory_references_ref (struct loop *loop, struct mem_ref_group **refs,
499 tree ref, bool write_p, gimple stmt)
500 {
501 tree base, step;
502 HOST_WIDE_INT delta;
503 struct mem_ref_group *agrp;
504
505 if (get_base_address (ref) == NULL)
506 return false;
507
508 if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt))
509 return false;
510 /* If analyze_ref fails, the default is NULL_TREE. We can stop here. */
511 if (step == NULL_TREE)
512 return false;
513
514 /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP
515 are integer constants. */
516 agrp = find_or_create_group (refs, base, step);
517 record_ref (agrp, stmt, ref, delta, write_p);
518
519 return true;
520 }
521
522 /* Record the suitable memory references in LOOP. NO_OTHER_REFS is set to
523 true if there are no other memory references inside the loop. */
524
525 static struct mem_ref_group *
526 gather_memory_references (struct loop *loop, bool *no_other_refs, unsigned *ref_count)
527 {
528 basic_block *body = get_loop_body_in_dom_order (loop);
529 basic_block bb;
530 unsigned i;
531 gimple_stmt_iterator bsi;
532 gimple stmt;
533 tree lhs, rhs;
534 struct mem_ref_group *refs = NULL;
535
536 *no_other_refs = true;
537 *ref_count = 0;
538
539 /* Scan the loop body in order, so that earlier references precede the
540 later ones. */
541 for (i = 0; i < loop->num_nodes; i++)
542 {
543 bb = body[i];
544 if (bb->loop_father != loop)
545 continue;
546
547 for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi))
548 {
549 stmt = gsi_stmt (bsi);
550
551 if (gimple_code (stmt) != GIMPLE_ASSIGN)
552 {
553 if (gimple_vuse (stmt)
554 || (is_gimple_call (stmt)
555 && !(gimple_call_flags (stmt) & ECF_CONST)))
556 *no_other_refs = false;
557 continue;
558 }
559
560 lhs = gimple_assign_lhs (stmt);
561 rhs = gimple_assign_rhs1 (stmt);
562
563 if (REFERENCE_CLASS_P (rhs))
564 {
565 *no_other_refs &= gather_memory_references_ref (loop, &refs,
566 rhs, false, stmt);
567 *ref_count += 1;
568 }
569 if (REFERENCE_CLASS_P (lhs))
570 {
571 *no_other_refs &= gather_memory_references_ref (loop, &refs,
572 lhs, true, stmt);
573 *ref_count += 1;
574 }
575 }
576 }
577 free (body);
578
579 return refs;
580 }
581
582 /* Prune the prefetch candidate REF using the self-reuse. */
583
584 static void
585 prune_ref_by_self_reuse (struct mem_ref *ref)
586 {
587 HOST_WIDE_INT step;
588 bool backward;
589
590 /* If the step size is non-constant, we cannot calculate prefetch_mod. */
591 if (!cst_and_fits_in_hwi (ref->group->step))
592 return;
593
594 step = int_cst_value (ref->group->step);
595
596 backward = step < 0;
597
598 if (step == 0)
599 {
600 /* Prefetch references to invariant address just once. */
601 ref->prefetch_before = 1;
602 return;
603 }
604
605 if (backward)
606 step = -step;
607
608 if (step > PREFETCH_BLOCK)
609 return;
610
611 if ((backward && HAVE_BACKWARD_PREFETCH)
612 || (!backward && HAVE_FORWARD_PREFETCH))
613 {
614 ref->prefetch_before = 1;
615 return;
616 }
617
618 ref->prefetch_mod = PREFETCH_BLOCK / step;
619 }
620
621 /* Divides X by BY, rounding down. */
622
623 static HOST_WIDE_INT
624 ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by)
625 {
626 gcc_assert (by > 0);
627
628 if (x >= 0)
629 return x / by;
630 else
631 return (x + by - 1) / by;
632 }
633
634 /* Given a CACHE_LINE_SIZE and two inductive memory references
635 with a common STEP greater than CACHE_LINE_SIZE and an address
636 difference DELTA, compute the probability that they will fall
637 in different cache lines. DISTINCT_ITERS is the number of
638 distinct iterations after which the pattern repeats itself.
639 ALIGN_UNIT is the unit of alignment in bytes. */
640
641 static int
642 compute_miss_rate (unsigned HOST_WIDE_INT cache_line_size,
643 HOST_WIDE_INT step, HOST_WIDE_INT delta,
644 unsigned HOST_WIDE_INT distinct_iters,
645 int align_unit)
646 {
647 unsigned align, iter;
648 int total_positions, miss_positions, miss_rate;
649 int address1, address2, cache_line1, cache_line2;
650
651 total_positions = 0;
652 miss_positions = 0;
653
654 /* Iterate through all possible alignments of the first
655 memory reference within its cache line. */
656 for (align = 0; align < cache_line_size; align += align_unit)
657
658 /* Iterate through all distinct iterations. */
659 for (iter = 0; iter < distinct_iters; iter++)
660 {
661 address1 = align + step * iter;
662 address2 = address1 + delta;
663 cache_line1 = address1 / cache_line_size;
664 cache_line2 = address2 / cache_line_size;
665 total_positions += 1;
666 if (cache_line1 != cache_line2)
667 miss_positions += 1;
668 }
669 miss_rate = 1000 * miss_positions / total_positions;
670 return miss_rate;
671 }
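
/* A worked example (the numbers are purely illustrative): with a 64-byte
   cache line, STEP = 128, DELTA = 32, DISTINCT_ITERS = 1 (128/64 reduces to
   2/1 in prune_ref_by_group_reuse) and ALIGN_UNIT = 4, the outer loop tries
   the 16 alignments 0, 4, ..., 60 and the inner loop runs once.  The two
   addresses fall into different cache lines exactly when align + 32 >= 64,
   i.e. for the 8 alignments 32, 36, ..., 60, so the function returns
   1000 * 8 / 16 = 500, far above ACCEPTABLE_MISS_RATE, and the caller does
   not treat the two references as reusing the same cache line.  */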
672
673 /* Prune the prefetch candidate REF using the reuse with BY.
674 If BY_IS_BEFORE is true, BY is before REF in the loop. */
675
676 static void
677 prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by,
678 bool by_is_before)
679 {
680 HOST_WIDE_INT step;
681 bool backward;
682 HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta;
683 HOST_WIDE_INT delta = delta_b - delta_r;
684 HOST_WIDE_INT hit_from;
685 unsigned HOST_WIDE_INT prefetch_before, prefetch_block;
686 int miss_rate;
687 HOST_WIDE_INT reduced_step;
688 unsigned HOST_WIDE_INT reduced_prefetch_block;
689 tree ref_type;
690 int align_unit;
691
692 /* If the step is non-constant, we cannot calculate prefetch_before. */
693 if (!cst_and_fits_in_hwi (ref->group->step)) {
694 return;
695 }
696
697 step = int_cst_value (ref->group->step);
698
699 backward = step < 0;
700
701
702 if (delta == 0)
703 {
704 /* If the references have the same address, only prefetch the
705 former. */
706 if (by_is_before)
707 ref->prefetch_before = 0;
708
709 return;
710 }
711
712 if (!step)
713 {
714 /* If the reference addresses are invariant and fall into the
715 same cache line, prefetch just the first one. */
716 if (!by_is_before)
717 return;
718
719 if (ddown (ref->delta, PREFETCH_BLOCK)
720 != ddown (by->delta, PREFETCH_BLOCK))
721 return;
722
723 ref->prefetch_before = 0;
724 return;
725 }
726
727 /* Only prune the reference that is behind in the array. */
728 if (backward)
729 {
730 if (delta > 0)
731 return;
732
733 /* Transform the data so that we may assume that the accesses
734 are forward. */
735 delta = - delta;
736 step = -step;
737 delta_r = PREFETCH_BLOCK - 1 - delta_r;
738 delta_b = PREFETCH_BLOCK - 1 - delta_b;
739 }
740 else
741 {
742 if (delta < 0)
743 return;
744 }
745
746 /* Check whether the two references are likely to hit the same cache
747 line, and how far apart the iterations in which this occurs are from
748 each other. */
749
750 if (step <= PREFETCH_BLOCK)
751 {
752 /* The accesses are sure to meet. Let us check when. */
753 hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK;
754 prefetch_before = (hit_from - delta_r + step - 1) / step;
755
756 /* Do not reduce prefetch_before if we meet beyond cache size. */
757 if (prefetch_before > (unsigned) abs (L2_CACHE_SIZE_BYTES / step))
758 prefetch_before = PREFETCH_ALL;
759 if (prefetch_before < ref->prefetch_before)
760 ref->prefetch_before = prefetch_before;
761
762 return;
763 }
764
765 /* A more complicated case with step > prefetch_block. First reduce
766 the ratio between the step and the cache line size to its simplest
767 terms. The resulting denominator will then represent the number of
768 distinct iterations after which each address will go back to its
769 initial location within the cache line. This computation assumes
770 that PREFETCH_BLOCK is a power of two. */
771 prefetch_block = PREFETCH_BLOCK;
772 reduced_prefetch_block = prefetch_block;
773 reduced_step = step;
774 while ((reduced_step & 1) == 0
775 && reduced_prefetch_block > 1)
776 {
777 reduced_step >>= 1;
778 reduced_prefetch_block >>= 1;
779 }
780
781 prefetch_before = delta / step;
782 delta %= step;
783 ref_type = TREE_TYPE (ref->mem);
784 align_unit = TYPE_ALIGN (ref_type) / 8;
785 miss_rate = compute_miss_rate(prefetch_block, step, delta,
786 reduced_prefetch_block, align_unit);
787 if (miss_rate <= ACCEPTABLE_MISS_RATE)
788 {
789 /* Do not reduce prefetch_before if we meet beyond cache size. */
790 if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK)
791 prefetch_before = PREFETCH_ALL;
792 if (prefetch_before < ref->prefetch_before)
793 ref->prefetch_before = prefetch_before;
794
795 return;
796 }
797
798 /* Try also the following iteration. */
799 prefetch_before++;
800 delta = step - delta;
801 miss_rate = compute_miss_rate(prefetch_block, step, delta,
802 reduced_prefetch_block, align_unit);
803 if (miss_rate <= ACCEPTABLE_MISS_RATE)
804 {
805 if (prefetch_before < ref->prefetch_before)
806 ref->prefetch_before = prefetch_before;
807
808 return;
809 }
810
811 /* The ref probably does not reuse BY. */
812 return;
813 }
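
/* A worked example: consider the references (1) a[i] and (2) a[i + 64] from
   the overview at the top of the file (char elements, step 1, 64-byte
   PREFETCH_BLOCK).  When pruning (1) by (2), delta = 64 - 0 = 64 and step <=
   PREFETCH_BLOCK, so hit_from = ddown (64, 64) * 64 = 64 and prefetch_before
   = (64 - 0 + 1 - 1) / 1 = 64, which is exactly the PREFETCH_BEFORE 64
   claimed for (1) in the overview.  */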
814
815 /* Prune the prefetch candidate REF using the reuses with other references
816 in REFS. */
817
818 static void
819 prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs)
820 {
821 struct mem_ref *prune_by;
822 bool before = true;
823
824 prune_ref_by_self_reuse (ref);
825
826 for (prune_by = refs; prune_by; prune_by = prune_by->next)
827 {
828 if (prune_by == ref)
829 {
830 before = false;
831 continue;
832 }
833
834 if (!WRITE_CAN_USE_READ_PREFETCH
835 && ref->write_p
836 && !prune_by->write_p)
837 continue;
838 if (!READ_CAN_USE_WRITE_PREFETCH
839 && !ref->write_p
840 && prune_by->write_p)
841 continue;
842
843 prune_ref_by_group_reuse (ref, prune_by, before);
844 }
845 }
846
847 /* Prune the prefetch candidates in GROUP using the reuse analysis. */
848
849 static void
850 prune_group_by_reuse (struct mem_ref_group *group)
851 {
852 struct mem_ref *ref_pruned;
853
854 for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next)
855 {
856 prune_ref_by_reuse (ref_pruned, group->refs);
857
858 if (dump_file && (dump_flags & TDF_DETAILS))
859 {
860 fprintf (dump_file, "Reference %p:", (void *) ref_pruned);
861
862 if (ref_pruned->prefetch_before == PREFETCH_ALL
863 && ref_pruned->prefetch_mod == 1)
864 fprintf (dump_file, " no restrictions");
865 else if (ref_pruned->prefetch_before == 0)
866 fprintf (dump_file, " do not prefetch");
867 else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod)
868 fprintf (dump_file, " prefetch once");
869 else
870 {
871 if (ref_pruned->prefetch_before != PREFETCH_ALL)
872 {
873 fprintf (dump_file, " prefetch before ");
874 fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
875 ref_pruned->prefetch_before);
876 }
877 if (ref_pruned->prefetch_mod != 1)
878 {
879 fprintf (dump_file, " prefetch mod ");
880 fprintf (dump_file, HOST_WIDE_INT_PRINT_DEC,
881 ref_pruned->prefetch_mod);
882 }
883 }
884 fprintf (dump_file, "\n");
885 }
886 }
887 }
888
889 /* Prune the list of prefetch candidates GROUPS using the reuse analysis. */
890
891 static void
892 prune_by_reuse (struct mem_ref_group *groups)
893 {
894 for (; groups; groups = groups->next)
895 prune_group_by_reuse (groups);
896 }
897
898 /* Returns true if we should issue prefetch for REF. */
899
900 static bool
901 should_issue_prefetch_p (struct mem_ref *ref)
902 {
903 /* For now, do not issue prefetches that would be useful only for the
904 first few iterations. */
905 if (ref->prefetch_before != PREFETCH_ALL)
906 {
907 if (dump_file && (dump_flags & TDF_DETAILS))
908 fprintf (dump_file, "Ignoring %p due to prefetch_before\n",
909 (void *) ref);
910 return false;
911 }
912
913 /* Do not prefetch nontemporal stores. */
914 if (ref->storent_p)
915 {
916 if (dump_file && (dump_flags & TDF_DETAILS))
917 fprintf (dump_file, "Ignoring nontemporal store %p\n", (void *) ref);
918 return false;
919 }
920
921 return true;
922 }
923
924 /* Decide which of the prefetch candidates in GROUPS to prefetch.
925 AHEAD is the number of iterations to prefetch ahead (which corresponds
926 to the number of simultaneous instances of one prefetch running at a
927 time). UNROLL_FACTOR is the factor by which the loop is going to be
928 unrolled. Returns true if there is anything to prefetch. */
929
930 static bool
931 schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor,
932 unsigned ahead)
933 {
934 unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots;
935 unsigned slots_per_prefetch;
936 struct mem_ref *ref;
937 bool any = false;
938
939 /* At most SIMULTANEOUS_PREFETCHES should be running at the same time. */
940 remaining_prefetch_slots = SIMULTANEOUS_PREFETCHES;
941
942 /* The prefetch will run for AHEAD iterations of the original loop, i.e.,
943 AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration,
944 it will need a prefetch slot. */
945 slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor;
946 if (dump_file && (dump_flags & TDF_DETAILS))
947 fprintf (dump_file, "Each prefetch instruction takes %u prefetch slots.\n",
948 slots_per_prefetch);
949
950 /* For now we just take memory references one by one and issue
951 prefetches for as many as possible. The groups are sorted
952 starting with the largest step, since the references with
953 large step are more likely to cause many cache misses. */
954
955 for (; groups; groups = groups->next)
956 for (ref = groups->refs; ref; ref = ref->next)
957 {
958 if (!should_issue_prefetch_p (ref))
959 continue;
960
961 /* The loop is far from being sufficiently unrolled for this
962 prefetch. Do not generate the prefetch to avoid many redundant
963 prefetches. */
964 if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO)
965 continue;
966
967 /* If we need to prefetch the reference each PREFETCH_MOD iterations,
968 and we unroll the loop UNROLL_FACTOR times, we need to insert
969 ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each
970 iteration. */
971 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
972 / ref->prefetch_mod);
973 prefetch_slots = n_prefetches * slots_per_prefetch;
974
975 /* If more than half of the prefetches would be lost anyway, do not
976 issue the prefetch. */
977 if (2 * remaining_prefetch_slots < prefetch_slots)
978 continue;
979
980 ref->issue_prefetch_p = true;
981
982 if (remaining_prefetch_slots <= prefetch_slots)
983 return true;
984 remaining_prefetch_slots -= prefetch_slots;
985 any = true;
986 }
987
988 return any;
989 }
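
/* A worked example (the numbers are illustrative; SIMULTANEOUS_PREFETCHES is
   a target-specific parameter): with 6 available slots, AHEAD = 8 and
   UNROLL_FACTOR = 4, each prefetch instruction occupies (8 + 4/2) / 4 = 2
   slots.  A reference with prefetch_mod 1 needs ceil (4 / 1) = 4 prefetch
   instructions per unrolled body, i.e. 8 slots; since 2 * 6 >= 8 it is
   scheduled, and because the remaining 6 slots are then used up (6 <= 8),
   scheduling stops with that reference.  */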
990
991 /* Estimate the number of prefetches in the given GROUPS. */
992
993 static int
994 estimate_prefetch_count (struct mem_ref_group *groups)
995 {
996 struct mem_ref *ref;
997 int prefetch_count = 0;
998
999 for (; groups; groups = groups->next)
1000 for (ref = groups->refs; ref; ref = ref->next)
1001 if (should_issue_prefetch_p (ref))
1002 prefetch_count++;
1003
1004 return prefetch_count;
1005 }
1006
1007 /* Issue prefetches for the reference REF into the loop as decided before.
1008 AHEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR
1009 is the factor by which LOOP was unrolled. */
1010
1011 static void
1012 issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead)
1013 {
1014 HOST_WIDE_INT delta;
1015 tree addr, addr_base, write_p, local, forward;
1016 gimple prefetch;
1017 gimple_stmt_iterator bsi;
1018 unsigned n_prefetches, ap;
1019 bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES;
1020
1021 if (dump_file && (dump_flags & TDF_DETAILS))
1022 fprintf (dump_file, "Issued%s prefetch for %p.\n",
1023 nontemporal ? " nontemporal" : "",
1024 (void *) ref);
1025
1026 bsi = gsi_for_stmt (ref->stmt);
1027
1028 n_prefetches = ((unroll_factor + ref->prefetch_mod - 1)
1029 / ref->prefetch_mod);
1030 addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node);
1031 addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base),
1032 true, NULL, true, GSI_SAME_STMT);
1033 write_p = ref->write_p ? integer_one_node : integer_zero_node;
1034 local = build_int_cst (integer_type_node, nontemporal ? 0 : 3);
1035
1036 for (ap = 0; ap < n_prefetches; ap++)
1037 {
1038 if (cst_and_fits_in_hwi (ref->group->step))
1039 {
1040 /* Determine the address to prefetch. */
1041 delta = (ahead + ap * ref->prefetch_mod) *
1042 int_cst_value (ref->group->step);
1043 addr = fold_build2 (POINTER_PLUS_EXPR, ptr_type_node,
1044 addr_base, size_int (delta));
1045 addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, NULL,
1046 true, GSI_SAME_STMT);
1047 }
1048 else
1049 {
1050 /* The step size is non-constant but loop-invariant. We use the
1051 heuristic of simply prefetching AHEAD iterations ahead. */
1052 forward = fold_build2 (MULT_EXPR, sizetype,
1053 fold_convert (sizetype, ref->group->step),
1054 fold_convert (sizetype, size_int (ahead)));
1055 addr = fold_build2 (POINTER_PLUS_EXPR, ptr_type_node, addr_base,
1056 forward);
1057 addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true,
1058 NULL, true, GSI_SAME_STMT);
1059 }
1060 /* Create the prefetch instruction. */
1061 prefetch = gimple_build_call (built_in_decls[BUILT_IN_PREFETCH],
1062 3, addr, write_p, local);
1063 gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT);
1064 }
1065 }
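
/* For example (with illustrative numbers): for a reference with a constant
   step of 4 bytes, prefetch_mod 1, UNROLL_FACTOR = 4 and AHEAD = 8, the loop
   above emits four calls equivalent to

     __builtin_prefetch (addr_base + 32, write_p, local);
     __builtin_prefetch (addr_base + 36, write_p, local);
     __builtin_prefetch (addr_base + 40, write_p, local);
     __builtin_prefetch (addr_base + 44, write_p, local);

   where local is 3 for a temporal prefetch and 0 for a nontemporal one.  */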
1066
1067 /* Issue prefetches for the references in GROUPS into the loop as decided
1068 before. AHEAD is the number of iterations to prefetch ahead.
1069 UNROLL_FACTOR is the factor by which LOOP was unrolled. */
1070
1071 static void
1072 issue_prefetches (struct mem_ref_group *groups,
1073 unsigned unroll_factor, unsigned ahead)
1074 {
1075 struct mem_ref *ref;
1076
1077 for (; groups; groups = groups->next)
1078 for (ref = groups->refs; ref; ref = ref->next)
1079 if (ref->issue_prefetch_p)
1080 issue_prefetch_ref (ref, unroll_factor, ahead);
1081 }
1082
1083 /* Returns true if REF is a memory write for which a nontemporal store insn
1084 can be used. */
1085
1086 static bool
1087 nontemporal_store_p (struct mem_ref *ref)
1088 {
1089 enum machine_mode mode;
1090 enum insn_code code;
1091
1092 /* REF must be a write that is not reused. We require it to be independent
1093 of all other memory references in the loop, as the nontemporal stores may
1094 be reordered with respect to other memory references. */
1095 if (!ref->write_p
1096 || !ref->independent_p
1097 || ref->reuse_distance < L2_CACHE_SIZE_BYTES)
1098 return false;
1099
1100 /* Check that we have the storent instruction for the mode. */
1101 mode = TYPE_MODE (TREE_TYPE (ref->mem));
1102 if (mode == BLKmode)
1103 return false;
1104
1105 code = optab_handler (storent_optab, mode)->insn_code;
1106 return code != CODE_FOR_nothing;
1107 }
1108
1109 /* If REF is a nontemporal store, we mark the corresponding modify statement
1110 and return true. Otherwise, we return false. */
1111
1112 static bool
1113 mark_nontemporal_store (struct mem_ref *ref)
1114 {
1115 if (!nontemporal_store_p (ref))
1116 return false;
1117
1118 if (dump_file && (dump_flags & TDF_DETAILS))
1119 fprintf (dump_file, "Marked reference %p as a nontemporal store.\n",
1120 (void *) ref);
1121
1122 gimple_assign_set_nontemporal_move (ref->stmt, true);
1123 ref->storent_p = true;
1124
1125 return true;
1126 }
1127
1128 /* Issue a memory fence instruction after LOOP. */
1129
1130 static void
1131 emit_mfence_after_loop (struct loop *loop)
1132 {
1133 VEC (edge, heap) *exits = get_loop_exit_edges (loop);
1134 edge exit;
1135 gimple call;
1136 gimple_stmt_iterator bsi;
1137 unsigned i;
1138
1139 for (i = 0; VEC_iterate (edge, exits, i, exit); i++)
1140 {
1141 call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0);
1142
1143 if (!single_pred_p (exit->dest)
1144 /* If possible, we prefer not to insert the fence on other paths
1145 in cfg. */
1146 && !(exit->flags & EDGE_ABNORMAL))
1147 split_loop_exit_edge (exit);
1148 bsi = gsi_after_labels (exit->dest);
1149
1150 gsi_insert_before (&bsi, call, GSI_NEW_STMT);
1151 mark_virtual_ops_for_renaming (call);
1152 }
1153
1154 VEC_free (edge, heap, exits);
1155 update_ssa (TODO_update_ssa_only_virtuals);
1156 }
1157
1158 /* Returns true if we can use storent in loop, false otherwise. */
1159
1160 static bool
1161 may_use_storent_in_loop_p (struct loop *loop)
1162 {
1163 bool ret = true;
1164
1165 if (loop->inner != NULL)
1166 return false;
1167
1168 /* If we must issue a mfence insn after using storent, check that there
1169 is a suitable place for it at each of the loop exits. */
1170 if (FENCE_FOLLOWING_MOVNT != NULL_TREE)
1171 {
1172 VEC (edge, heap) *exits = get_loop_exit_edges (loop);
1173 unsigned i;
1174 edge exit;
1175
1176 for (i = 0; VEC_iterate (edge, exits, i, exit); i++)
1177 if ((exit->flags & EDGE_ABNORMAL)
1178 && exit->dest == EXIT_BLOCK_PTR)
1179 ret = false;
1180
1181 VEC_free (edge, heap, exits);
1182 }
1183
1184 return ret;
1185 }
1186
1187 /* Marks nontemporal stores in LOOP. GROUPS contains the description of memory
1188 references in the loop. */
1189
1190 static void
1191 mark_nontemporal_stores (struct loop *loop, struct mem_ref_group *groups)
1192 {
1193 struct mem_ref *ref;
1194 bool any = false;
1195
1196 if (!may_use_storent_in_loop_p (loop))
1197 return;
1198
1199 for (; groups; groups = groups->next)
1200 for (ref = groups->refs; ref; ref = ref->next)
1201 any |= mark_nontemporal_store (ref);
1202
1203 if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE)
1204 emit_mfence_after_loop (loop);
1205 }
1206
1207 /* Determines whether we can profitably unroll LOOP FACTOR times, and if
1208 this is the case, fill in DESC with the description of the number of
1209 iterations. */
1210
1211 static bool
1212 should_unroll_loop_p (struct loop *loop, struct tree_niter_desc *desc,
1213 unsigned factor)
1214 {
1215 if (!can_unroll_loop_p (loop, factor, desc))
1216 return false;
1217
1218 /* We only consider loops without control flow for unrolling. This is not
1219 a hard restriction -- tree_unroll_loop works with arbitrary loops
1220 as well; but the unrolling/prefetching is usually more profitable for
1221 loops consisting of a single basic block, and we want to limit the
1222 code growth. */
1223 if (loop->num_nodes > 2)
1224 return false;
1225
1226 return true;
1227 }
1228
1229 /* Determine the factor by which to unroll LOOP, from the information
1230 contained in the list of memory references REFS. The description of the
1231 number of iterations of LOOP is stored to DESC. NINSNS is the number of
1232 insns of the LOOP. EST_NITER is the estimated number of iterations of
1233 the loop, or -1 if no estimate is available. */
1234
1235 static unsigned
1236 determine_unroll_factor (struct loop *loop, struct mem_ref_group *refs,
1237 unsigned ninsns, struct tree_niter_desc *desc,
1238 HOST_WIDE_INT est_niter)
1239 {
1240 unsigned upper_bound;
1241 unsigned nfactor, factor, mod_constraint;
1242 struct mem_ref_group *agp;
1243 struct mem_ref *ref;
1244
1245 /* First check whether the loop is not too large to unroll. We ignore
1246 PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us
1247 from unrolling them enough to make exactly one cache line covered by each
1248 iteration. Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent
1249 us from unrolling the loops too many times in cases where we only expect
1250 gains from better scheduling and decreasing loop overhead, which is not
1251 the case here. */
1252 upper_bound = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / ninsns;
1253
1254 /* If we unrolled the loop more times than it iterates, the unrolled version
1255 of the loop would never be entered. */
1256 if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound)
1257 upper_bound = est_niter;
1258
1259 if (upper_bound <= 1)
1260 return 1;
1261
1262 /* Choose the factor so that we may prefetch each cache line just once,
1263 but bound the unrolling by UPPER_BOUND. */
1264 factor = 1;
1265 for (agp = refs; agp; agp = agp->next)
1266 for (ref = agp->refs; ref; ref = ref->next)
1267 if (should_issue_prefetch_p (ref))
1268 {
1269 mod_constraint = ref->prefetch_mod;
1270 nfactor = least_common_multiple (mod_constraint, factor);
1271 if (nfactor <= upper_bound)
1272 factor = nfactor;
1273 }
1274
1275 if (!should_unroll_loop_p (loop, desc, factor))
1276 return 1;
1277
1278 return factor;
1279 }
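
/* For example, if UPPER_BOUND is 8 and the references to be prefetched have
   prefetch_mod 1, 4 and 16, the factor evolves as lcm (1, 1) = 1,
   lcm (4, 1) = 4, and lcm (16, 4) = 16, which exceeds the bound, so the
   loop is unrolled 4 times (provided should_unroll_loop_p agrees).  */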
1280
1281 /* Returns the total volume of the memory references REFS, taking into account
1282 reuses in the innermost loop and cache line size. TODO -- we should also
1283 take into account reuses across the iterations of the loops in the loop
1284 nest. */
1285
1286 static unsigned
1287 volume_of_references (struct mem_ref_group *refs)
1288 {
1289 unsigned volume = 0;
1290 struct mem_ref_group *gr;
1291 struct mem_ref *ref;
1292
1293 for (gr = refs; gr; gr = gr->next)
1294 for (ref = gr->refs; ref; ref = ref->next)
1295 {
1296 /* Almost always reuses another value? */
1297 if (ref->prefetch_before != PREFETCH_ALL)
1298 continue;
1299
1300 /* If several iterations access the same cache line, use the size of
1301 the line divided by this number. Otherwise, a cache line is
1302 accessed in each iteration. TODO -- in the latter case, we should
1303 take the size of the reference into account, rounding it up to a
1304 multiple of the cache line size. */
1305 volume += L1_CACHE_LINE_SIZE / ref->prefetch_mod;
1306 }
1307 return volume;
1308 }
1309
1310 /* Returns the volume of memory references accessed across VEC iterations of
1311 loops, whose sizes are described in the LOOP_SIZES array. N is the number
1312 of the loops in the nest (length of VEC and LOOP_SIZES vectors). */
1313
1314 static unsigned
1315 volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n)
1316 {
1317 unsigned i;
1318
1319 for (i = 0; i < n; i++)
1320 if (vec[i] != 0)
1321 break;
1322
1323 if (i == n)
1324 return 0;
1325
1326 gcc_assert (vec[i] > 0);
1327
1328 /* We ignore the parts of the distance vector in subloops, since usually
1329 the numbers of iterations are much smaller. */
1330 return loop_sizes[i] * vec[i];
1331 }
1332
1333 /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDES
1334 at the position corresponding to the loop of the step. N is the depth
1335 of the considered loop nest, and LOOP is its innermost loop. */
1336
1337 static void
1338 add_subscript_strides (tree access_fn, unsigned stride,
1339 HOST_WIDE_INT *strides, unsigned n, struct loop *loop)
1340 {
1341 struct loop *aloop;
1342 tree step;
1343 HOST_WIDE_INT astep;
1344 unsigned min_depth = loop_depth (loop) - n;
1345
1346 while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC)
1347 {
1348 aloop = get_chrec_loop (access_fn);
1349 step = CHREC_RIGHT (access_fn);
1350 access_fn = CHREC_LEFT (access_fn);
1351
1352 if ((unsigned) loop_depth (aloop) <= min_depth)
1353 continue;
1354
1355 if (host_integerp (step, 0))
1356 astep = tree_low_cst (step, 0);
1357 else
1358 astep = L1_CACHE_LINE_SIZE;
1359
1360 strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride;
1361
1362 }
1363 }
1364
1365 /* Returns the volume of memory references accessed between two consecutive
1366 self-reuses of the reference DR. We consider the subscripts of DR in N
1367 loops, and LOOP_SIZES contains the volumes of accesses in each of the
1368 loops. LOOP is the innermost loop of the current loop nest. */
1369
1370 static unsigned
1371 self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n,
1372 struct loop *loop)
1373 {
1374 tree stride, access_fn;
1375 HOST_WIDE_INT *strides, astride;
1376 VEC (tree, heap) *access_fns;
1377 tree ref = DR_REF (dr);
1378 unsigned i, ret = ~0u;
1379
1380 /* In the following example:
1381
1382 for (i = 0; i < N; i++)
1383 for (j = 0; j < N; j++)
1384 use (a[j][i]);
1385 the same cache line is accessed each N steps (except if the change from
1386 i to i + 1 crosses the boundary of the cache line). Thus, for self-reuse,
1387 we cannot rely purely on the results of the data dependence analysis.
1388
1389 Instead, we compute the stride of the reference in each loop, and consider
1390 the innermost loop in which the stride is less than the cache size. */
1391
1392 strides = XCNEWVEC (HOST_WIDE_INT, n);
1393 access_fns = DR_ACCESS_FNS (dr);
1394
1395 for (i = 0; VEC_iterate (tree, access_fns, i, access_fn); i++)
1396 {
1397 /* Keep track of the reference corresponding to the subscript, so that we
1398 know its stride. */
1399 while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1400 ref = TREE_OPERAND (ref, 0);
1401
1402 if (TREE_CODE (ref) == ARRAY_REF)
1403 {
1404 stride = TYPE_SIZE_UNIT (TREE_TYPE (ref));
1405 if (host_integerp (stride, 1))
1406 astride = tree_low_cst (stride, 1);
1407 else
1408 astride = L1_CACHE_LINE_SIZE;
1409
1410 ref = TREE_OPERAND (ref, 0);
1411 }
1412 else
1413 astride = 1;
1414
1415 add_subscript_strides (access_fn, astride, strides, n, loop);
1416 }
1417
1418 for (i = n; i-- > 0; )
1419 {
1420 unsigned HOST_WIDE_INT s;
1421
1422 s = strides[i] < 0 ? -strides[i] : strides[i];
1423
1424 if (s < (unsigned) L1_CACHE_LINE_SIZE
1425 && (loop_sizes[i]
1426 > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)))
1427 {
1428 ret = loop_sizes[i];
1429 break;
1430 }
1431 }
1432
1433 free (strides);
1434 return ret;
1435 }
1436
1437 /* Determines the distance till the first reuse of each reference in REFS
1438 in the loop nest of LOOP. NO_OTHER_REFS is true if there are no other
1439 memory references in the loop. */
1440
1441 static void
1442 determine_loop_nest_reuse (struct loop *loop, struct mem_ref_group *refs,
1443 bool no_other_refs)
1444 {
1445 struct loop *nest, *aloop;
1446 VEC (data_reference_p, heap) *datarefs = NULL;
1447 VEC (ddr_p, heap) *dependences = NULL;
1448 struct mem_ref_group *gr;
1449 struct mem_ref *ref, *refb;
1450 VEC (loop_p, heap) *vloops = NULL;
1451 unsigned *loop_data_size;
1452 unsigned i, j, n;
1453 unsigned volume, dist, adist;
1454 HOST_WIDE_INT vol;
1455 data_reference_p dr;
1456 ddr_p dep;
1457
1458 if (loop->inner)
1459 return;
1460
1461 /* Find the outermost loop of the loop nest of LOOP (we require that
1462 there are no sibling loops inside the nest). */
1463 nest = loop;
1464 while (1)
1465 {
1466 aloop = loop_outer (nest);
1467
1468 if (aloop == current_loops->tree_root
1469 || aloop->inner->next)
1470 break;
1471
1472 nest = aloop;
1473 }
1474
1475 /* For each loop, determine the amount of data accessed in each iteration.
1476 We use this to estimate whether the reference is evicted from the
1477 cache before its reuse. */
1478 find_loop_nest (nest, &vloops);
1479 n = VEC_length (loop_p, vloops);
1480 loop_data_size = XNEWVEC (unsigned, n);
1481 volume = volume_of_references (refs);
1482 i = n;
1483 while (i-- != 0)
1484 {
1485 loop_data_size[i] = volume;
1486 /* Bound the volume by the L2 cache size, since above this bound,
1487 all dependence distances are equivalent. */
1488 if (volume > L2_CACHE_SIZE_BYTES)
1489 continue;
1490
1491 aloop = VEC_index (loop_p, vloops, i);
1492 vol = estimated_loop_iterations_int (aloop, false);
1493 if (vol < 0)
1494 vol = expected_loop_iterations (aloop);
1495 volume *= vol;
1496 }
1497
1498 /* Prepare the references in the form suitable for data dependence
1499 analysis. We ignore unanalyzable data references (the results
1500 are used just as a heuristic to estimate the temporality of the
1501 references, hence we do not need to worry about correctness). */
1502 for (gr = refs; gr; gr = gr->next)
1503 for (ref = gr->refs; ref; ref = ref->next)
1504 {
1505 dr = create_data_ref (nest, ref->mem, ref->stmt, !ref->write_p);
1506
1507 if (dr)
1508 {
1509 ref->reuse_distance = volume;
1510 dr->aux = ref;
1511 VEC_safe_push (data_reference_p, heap, datarefs, dr);
1512 }
1513 else
1514 no_other_refs = false;
1515 }
1516
1517 for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++)
1518 {
1519 dist = self_reuse_distance (dr, loop_data_size, n, loop);
1520 ref = (struct mem_ref *) dr->aux;
1521 if (ref->reuse_distance > dist)
1522 ref->reuse_distance = dist;
1523
1524 if (no_other_refs)
1525 ref->independent_p = true;
1526 }
1527
1528 compute_all_dependences (datarefs, &dependences, vloops, true);
1529
1530 for (i = 0; VEC_iterate (ddr_p, dependences, i, dep); i++)
1531 {
1532 if (DDR_ARE_DEPENDENT (dep) == chrec_known)
1533 continue;
1534
1535 ref = (struct mem_ref *) DDR_A (dep)->aux;
1536 refb = (struct mem_ref *) DDR_B (dep)->aux;
1537
1538 if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know
1539 || DDR_NUM_DIST_VECTS (dep) == 0)
1540 {
1541 /* If the dependence cannot be analyzed, assume that there might be
1542 a reuse. */
1543 dist = 0;
1544
1545 ref->independent_p = false;
1546 refb->independent_p = false;
1547 }
1548 else
1549 {
1550 /* The distance vectors are normalized to be always lexicographically
1551 positive, hence we cannot tell just from them whether DDR_A comes
1552 before DDR_B or vice versa. However, it is not important,
1553 anyway -- if DDR_A is close to DDR_B, then it is either reused in
1554 DDR_B (and it is not nontemporal), or it reuses the value of DDR_B
1555 in cache (and marking it as nontemporal would not affect
1556 anything). */
1557
1558 dist = volume;
1559 for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++)
1560 {
1561 adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
1562 loop_data_size, n);
1563
1564 /* If this is a dependence in the innermost loop (i.e., the
1565 distances in all superloops are zero) and it is not
1566 the trivial self-dependence with distance zero, record that
1567 the references are not completely independent. */
1568 if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1569 && (ref != refb
1570 || DDR_DIST_VECT (dep, j)[n-1] != 0))
1571 {
1572 ref->independent_p = false;
1573 refb->independent_p = false;
1574 }
1575
1576 /* Ignore accesses closer than
1577 L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION,
1578 so that we use nontemporal prefetches e.g. if single memory
1579 location is accessed several times in a single iteration of
1580 the loop. */
1581 if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION)
1582 continue;
1583
1584 if (adist < dist)
1585 dist = adist;
1586 }
1587 }
1588
1589 if (ref->reuse_distance > dist)
1590 ref->reuse_distance = dist;
1591 if (refb->reuse_distance > dist)
1592 refb->reuse_distance = dist;
1593 }
1594
1595 free_dependence_relations (dependences);
1596 free_data_refs (datarefs);
1597 free (loop_data_size);
1598
1599 if (dump_file && (dump_flags & TDF_DETAILS))
1600 {
1601 fprintf (dump_file, "Reuse distances:\n");
1602 for (gr = refs; gr; gr = gr->next)
1603 for (ref = gr->refs; ref; ref = ref->next)
1604 fprintf (dump_file, " ref %p distance %u\n",
1605 (void *) ref, ref->reuse_distance);
1606 }
1607 }
1608
1609 /* Do a cost-benefit analysis to determine if prefetching is profitable
1610 for the current loop given the following parameters:
1611 AHEAD: the iteration ahead distance,
1612 EST_NITER: the estimated trip count,
1613 NINSNS: estimated number of instructions in the loop,
1614 PREFETCH_COUNT: an estimate of the number of prefetches,
1615 MEM_REF_COUNT: total number of memory references in the loop, and UNROLL_FACTOR: the unroll factor. */
1616
1617 static bool
1618 is_loop_prefetching_profitable (unsigned ahead, HOST_WIDE_INT est_niter,
1619 unsigned ninsns, unsigned prefetch_count,
1620 unsigned mem_ref_count, unsigned unroll_factor)
1621 {
1622 int insn_to_mem_ratio, insn_to_prefetch_ratio;
1623
1624 if (mem_ref_count == 0)
1625 return false;
1626
1627 /* Prefetching improves performance by overlapping cache missing
1628 memory accesses with CPU operations. If the loop does not have
1629 enough CPU operations to overlap with memory operations, prefetching
1630 won't give a significant benefit. One approximate way of checking
1631 this is to require the ratio of instructions to memory references to
1632 be above a certain limit. This approximation works well in practice.
1633 TODO: Implement a more precise computation by estimating the time
1634 for each CPU or memory op in the loop. Time estimates for memory ops
1635 should account for cache misses. */
1636 insn_to_mem_ratio = ninsns / mem_ref_count;
1637
1638 if (insn_to_mem_ratio < PREFETCH_MIN_INSN_TO_MEM_RATIO)
1639 {
1640 if (dump_file && (dump_flags & TDF_DETAILS))
1641 fprintf (dump_file,
1642 "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1643 insn_to_mem_ratio);
1644 return false;
1645 }
1646
1647 /* Prefetching most likely causes performance degradation when the instruction
1648 to prefetch ratio is too small. Too many prefetch instructions in a loop
1649 may reduce the I-cache performance.
1650 (unroll_factor * ninsns) is used to estimate the number of instructions in
1651 the unrolled loop. This implementation is a bit simplistic -- the number
1652 of issued prefetch instructions is also affected by unrolling. So,
1653 prefetch_mod and the unroll factor should be taken into account when
1654 determining prefetch_count. Also, the number of insns of the unrolled
1655 loop will usually be significantly smaller than the number of insns of the
1656 original loop * unroll_factor (at least the induction variable increases
1657 and the exit branches will get eliminated), so it might be better to use
1658 tree_estimate_loop_size + estimated_unrolled_size. */
1659 insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count;
1660 if (insn_to_prefetch_ratio < MIN_INSN_TO_PREFETCH_RATIO)
1661 {
1662 if (dump_file && (dump_flags & TDF_DETAILS))
1663 fprintf (dump_file,
1664 "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
1665 insn_to_prefetch_ratio);
1666 return false;
1667 }
1668
1669 /* We cannot do further estimation if the trip count is unknown. Just assume
1670 prefetching is profitable. Too aggressive??? */
1671 if (est_niter < 0)
1672 return true;
1673
1674 if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead))
1675 {
1676 if (dump_file && (dump_flags & TDF_DETAILS))
1677 fprintf (dump_file,
1678 "Not prefetching -- loop estimated to roll only %d times\n",
1679 (int) est_niter);
1680 return false;
1681 }
1682 return true;
1683 }
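
/* A worked example (illustrative numbers; the limits themselves are --param
   values): a loop with NINSNS = 20, MEM_REF_COUNT = 4, PREFETCH_COUNT = 4,
   UNROLL_FACTOR = 2, AHEAD = 5 and EST_NITER = 100 gives insn_to_mem_ratio
   = 20 / 4 = 5 and insn_to_prefetch_ratio = (2 * 20) / 4 = 10.  Prefetching
   is then considered profitable provided both ratios reach their respective
   minimum parameters and 100 >= TRIP_COUNT_TO_AHEAD_RATIO * 5 = 20, which
   holds.  */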
1684
1685
1686 /* Issue prefetch instructions for array references in LOOP. Returns
1687 true if the LOOP was unrolled. */
1688
1689 static bool
1690 loop_prefetch_arrays (struct loop *loop)
1691 {
1692 struct mem_ref_group *refs;
1693 unsigned ahead, ninsns, time, unroll_factor;
1694 HOST_WIDE_INT est_niter;
1695 struct tree_niter_desc desc;
1696 bool unrolled = false, no_other_refs;
1697 unsigned prefetch_count;
1698 unsigned mem_ref_count;
1699
1700 if (optimize_loop_nest_for_size_p (loop))
1701 {
1702 if (dump_file && (dump_flags & TDF_DETAILS))
1703 fprintf (dump_file, " ignored (cold area)\n");
1704 return false;
1705 }
1706
1707 /* Step 1: gather the memory references. */
1708 refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
1709
1710 /* Step 2: estimate the reuse effects. */
1711 prune_by_reuse (refs);
1712
1713 prefetch_count = estimate_prefetch_count (refs);
1714 if (prefetch_count == 0)
1715 goto fail;
1716
1717 determine_loop_nest_reuse (loop, refs, no_other_refs);
1718
1719 /* Step 3: determine the ahead distance and the unroll factor. */
1720
1721 /* FIXME: the time should be weighted by the probabilities of the blocks in
1722 the loop body. */
1723 time = tree_num_loop_insns (loop, &eni_time_weights);
1724 ahead = (PREFETCH_LATENCY + time - 1) / time;
1725 est_niter = estimated_loop_iterations_int (loop, false);
1726
1727 ninsns = tree_num_loop_insns (loop, &eni_size_weights);
1728 unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
1729 est_niter);
1730 if (dump_file && (dump_flags & TDF_DETAILS))
1731 fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
1732 HOST_WIDE_INT_PRINT_DEC "\n"
1733 "insn count %d, mem ref count %d, prefetch count %d\n",
1734 ahead, unroll_factor, est_niter,
1735 ninsns, mem_ref_count, prefetch_count);
1736
1737 if (!is_loop_prefetching_profitable (ahead, est_niter, ninsns, prefetch_count,
1738 mem_ref_count, unroll_factor))
1739 goto fail;
1740
1741 mark_nontemporal_stores (loop, refs);
1742
1743 /* Step 4: what to prefetch? */
1744 if (!schedule_prefetches (refs, unroll_factor, ahead))
1745 goto fail;
1746
1747 /* Step 5: unroll the loop. TODO -- peeling of first and last few
1748 iterations so that we do not issue superfluous prefetches. */
1749 if (unroll_factor != 1)
1750 {
1751 tree_unroll_loop (loop, unroll_factor,
1752 single_dom_exit (loop), &desc);
1753 unrolled = true;
1754 }
1755
1756 /* Step 6: issue the prefetches. */
1757 issue_prefetches (refs, unroll_factor, ahead);
1758
1759 fail:
1760 release_mem_refs (refs);
1761 return unrolled;
1762 }
1763
1764 /* Issue prefetch instructions for array references in loops. */
1765
1766 unsigned int
1767 tree_ssa_prefetch_arrays (void)
1768 {
1769 loop_iterator li;
1770 struct loop *loop;
1771 bool unrolled = false;
1772 int todo_flags = 0;
1773
1774 if (!HAVE_prefetch
1775 /* It is possible to ask the compiler for, say, -mtune=i486 -march=pentium4.
1776 -mtune=i486 causes us to have PREFETCH_BLOCK 0, since this is part
1777 of processor costs and i486 does not have prefetch, but
1778 -march=pentium4 causes HAVE_prefetch to be true. Ugh. */
1779 || PREFETCH_BLOCK == 0)
1780 return 0;
1781
1782 if (dump_file && (dump_flags & TDF_DETAILS))
1783 {
1784 fprintf (dump_file, "Prefetching parameters:\n");
1785 fprintf (dump_file, " simultaneous prefetches: %d\n",
1786 SIMULTANEOUS_PREFETCHES);
1787 fprintf (dump_file, " prefetch latency: %d\n", PREFETCH_LATENCY);
1788 fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK);
1789 fprintf (dump_file, " L1 cache size: %d lines, %d kB\n",
1790 L1_CACHE_SIZE_BYTES / L1_CACHE_LINE_SIZE, L1_CACHE_SIZE);
1791 fprintf (dump_file, " L1 cache line size: %d\n", L1_CACHE_LINE_SIZE);
1792 fprintf (dump_file, " L2 cache size: %d kB\n", L2_CACHE_SIZE);
1793 fprintf (dump_file, " min insn-to-prefetch ratio: %d \n",
1794 MIN_INSN_TO_PREFETCH_RATIO);
1795 fprintf (dump_file, " min insn-to-mem ratio: %d \n",
1796 PREFETCH_MIN_INSN_TO_MEM_RATIO);
1797 fprintf (dump_file, "\n");
1798 }
1799
1800 initialize_original_copy_tables ();
1801
1802 if (!built_in_decls[BUILT_IN_PREFETCH])
1803 {
1804 tree type = build_function_type (void_type_node,
1805 tree_cons (NULL_TREE,
1806 const_ptr_type_node,
1807 NULL_TREE));
1808 tree decl = add_builtin_function ("__builtin_prefetch", type,
1809 BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
1810 NULL, NULL_TREE);
1811 DECL_IS_NOVOPS (decl) = true;
1812 built_in_decls[BUILT_IN_PREFETCH] = decl;
1813 }
1814
1815 /* We assume that the cache line size is a power of two, so verify this
1816 here. */
1817 gcc_assert ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) == 0);
1818
1819 FOR_EACH_LOOP (li, loop, LI_FROM_INNERMOST)
1820 {
1821 if (dump_file && (dump_flags & TDF_DETAILS))
1822 fprintf (dump_file, "Processing loop %d:\n", loop->num);
1823
1824 unrolled |= loop_prefetch_arrays (loop);
1825
1826 if (dump_file && (dump_flags & TDF_DETAILS))
1827 fprintf (dump_file, "\n\n");
1828 }
1829
1830 if (unrolled)
1831 {
1832 scev_reset ();
1833 todo_flags |= TODO_cleanup_cfg;
1834 }
1835
1836 free_original_copy_tables ();
1837 return todo_flags;
1838 }