1 | /* Array prefetching. |
2 | Copyright (C) 2005-2023 Free Software Foundation, Inc. |
3 | |
4 | This file is part of GCC. |
5 | |
6 | GCC is free software; you can redistribute it and/or modify it |
7 | under the terms of the GNU General Public License as published by the |
8 | Free Software Foundation; either version 3, or (at your option) any |
9 | later version. |
10 | |
11 | GCC is distributed in the hope that it will be useful, but WITHOUT |
12 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
13 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
14 | for more details. |
15 | |
16 | You should have received a copy of the GNU General Public License |
17 | along with GCC; see the file COPYING3. If not see |
18 | <http://www.gnu.org/licenses/>. */ |
19 | |
20 | #include "config.h" |
21 | #include "system.h" |
22 | #include "coretypes.h" |
23 | #include "backend.h" |
24 | #include "target.h" |
25 | #include "rtl.h" |
26 | #include "tree.h" |
27 | #include "gimple.h" |
28 | #include "predict.h" |
29 | #include "tree-pass.h" |
30 | #include "gimple-ssa.h" |
31 | #include "optabs-query.h" |
32 | #include "tree-pretty-print.h" |
33 | #include "fold-const.h" |
34 | #include "stor-layout.h" |
35 | #include "gimplify.h" |
36 | #include "gimple-iterator.h" |
37 | #include "gimplify-me.h" |
38 | #include "tree-ssa-loop-ivopts.h" |
39 | #include "tree-ssa-loop-manip.h" |
40 | #include "tree-ssa-loop-niter.h" |
41 | #include "tree-ssa-loop.h" |
42 | #include "ssa.h" |
43 | #include "tree-into-ssa.h" |
44 | #include "cfgloop.h" |
45 | #include "tree-scalar-evolution.h" |
46 | #include "langhooks.h" |
47 | #include "tree-inline.h" |
48 | #include "tree-data-ref.h" |
49 | #include "diagnostic-core.h" |
50 | #include "dbgcnt.h" |
51 | |
52 | /* This pass inserts prefetch instructions to optimize cache usage during |
53 | accesses to arrays in loops. It processes loops sequentially and: |
54 | |
55 | 1) Gathers all memory references in the single loop. |
56 | 2) For each of the references it decides when it is profitable to prefetch |
57 | it. To do this, we evaluate the reuse among the accesses and determine |
58 | two values: PREFETCH_BEFORE (meaning that it only makes sense to do |
59 | prefetching in the first PREFETCH_BEFORE iterations of the loop) and |
60 | PREFETCH_MOD (meaning that it only makes sense to prefetch in the |
61 | iterations of the loop that are zero modulo PREFETCH_MOD). For example |
62 | (assuming cache line size is 64 bytes, char has size 1 byte and there |
63 | is no hardware sequential prefetch): |
64 | |
65 | char *a; |
66 | for (i = 0; i < max; i++) |
67 | { |
68 | a[255] = ...; (0) |
69 | a[i] = ...; (1) |
70 | a[i + 64] = ...; (2) |
71 | a[16*i] = ...; (3) |
72 | a[187*i] = ...; (4) |
73 | a[187*i + 50] = ...; (5) |
74 | } |
75 | |
76 | (0) obviously has PREFETCH_BEFORE 1 |
77 | (1) has PREFETCH_BEFORE 64, since (2) accesses the same memory |
78 | location 64 iterations before it, and PREFETCH_MOD 64 (since |
79 | it hits the same cache line otherwise). |
80 | (2) has PREFETCH_MOD 64 |
81 | (3) has PREFETCH_MOD 4 |
82 | (4) has PREFETCH_MOD 1. We do not set PREFETCH_BEFORE here, since |
83 | the cache line accessed by (5) is the same with probability only |
84 | 7/32. |
85 | (5) has PREFETCH_MOD 1 as well. |
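| |
| The numbers for (3)-(5) follow from the stated assumptions: (3) advances |
| by 16 bytes per iteration, so one 64-byte line serves 64 / 16 = 4 |
| consecutive iterations; (4) advances by 187 bytes, more than a line, so |
| every iteration may touch a new line; and the addresses of (4) and (5) |
| differ by 50 bytes, so they share a line only when the earlier one falls |
| within the first 64 - 50 = 14 bytes of it, i.e. with probability |
| 14/64 = 7/32. |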
86 | |
87 | Additionally, we use data dependence analysis to determine for each |
88 | reference the distance till the first reuse; this information is used |
89 | to determine the temporality of the issued prefetch instruction. |
90 | |
91 | 3) We determine how much ahead we need to prefetch. The number of |
92 | iterations needed is time to fetch / time spent in one iteration of |
93 | the loop. The problem is that we do not know either of these values, |
94 | so we just make a heuristic guess based on a magic (possibly) |
95 | target-specific constant and size of the loop. |
96 | |
97 | 4) Determine which of the references we prefetch. We take into account |
98 | that there is a maximum number of simultaneous prefetches (provided |
99 | by the machine description). We issue as many prefetches as possible |
100 | while still within this bound (starting with those with lowest |
101 | prefetch_mod, since they are responsible for most of the cache |
102 | misses). |
103 | |
104 | 5) We unroll and peel loops so that we are able to satisfy PREFETCH_MOD |
105 | and PREFETCH_BEFORE requirements (within some bounds), and to avoid |
106 | prefetching nonaccessed memory. |
107 | TODO -- actually implement peeling. |
108 | |
109 | 6) We actually emit the prefetch instructions. ??? Perhaps emit the |
110 | prefetch instructions with guards in cases where 5) was not sufficient |
111 | to satisfy the constraints? |
112 | |
113 | A cost model is implemented to determine whether or not prefetching is |
114 | profitable for a given loop. The cost model has three heuristics: |
115 | |
116 | 1. Function trip_count_to_ahead_ratio_too_small_p implements a |
117 | heuristic that determines whether or not the loop has too few |
118 | iterations (compared to ahead). Prefetching is not likely to be |
119 | beneficial if the trip count to ahead ratio is below a certain |
120 | minimum. |
121 | |
122 | 2. Function mem_ref_count_reasonable_p implements a heuristic that |
123 | determines whether the given loop has enough CPU ops that can be |
124 | overlapped with cache missing memory ops. If not, the loop |
125 | won't benefit from prefetching. In the implementation, |
126 | prefetching is not considered beneficial if the ratio between |
127 | the instruction count and the mem ref count is below a certain |
128 | minimum. |
129 | |
130 | 3. Function insn_to_prefetch_ratio_too_small_p implements a |
131 | heuristic that disables prefetching in a loop if the prefetching |
132 | cost is above a certain limit. The relative prefetching cost is |
133 | estimated by taking the ratio between the prefetch count and the |
134 | total instruction count (this models the I-cache cost). |
135 | |
136 | The limits used in these heuristics are defined as parameters with |
137 | reasonable default values. Machine-specific default values will be |
138 | added later. |
139 | |
140 | Some other TODO: |
141 | -- write and use more general reuse analysis (that could be also used |
142 | in other cache aimed loop optimizations) |
143 | -- make it behave sanely together with the prefetches given by the user |
144 | (now we just ignore them; at the very least we should avoid |
145 | optimizing loops into which the user has inserted prefetches) |
146 | -- we assume cache line size alignment of arrays; this could be |
147 | improved. */ |
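| |
| /* As a rough usage sketch (not normative; the option and --param spellings |
| may differ between GCC releases), this pass is typically exercised with |
| something like |
| |
| gcc -O2 -fprefetch-loop-arrays \ |
| --param simultaneous-prefetches=6 \ |
| --param l1-cache-line-size=64 \ |
| --param l2-cache-size=512 file.c |
| |
| where the --param values override the target defaults consulted below. */ |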
148 | |
149 | /* Magic constants follow. These should be replaced by machine specific |
150 | numbers. */ |
151 | |
152 | /* True if write can be prefetched by a read prefetch. */ |
153 | |
154 | #ifndef WRITE_CAN_USE_READ_PREFETCH |
155 | #define WRITE_CAN_USE_READ_PREFETCH 1 |
156 | #endif |
157 | |
158 | /* True if read can be prefetched by a write prefetch. */ |
159 | |
160 | #ifndef READ_CAN_USE_WRITE_PREFETCH |
161 | #define READ_CAN_USE_WRITE_PREFETCH 0 |
162 | #endif |
163 | |
164 | /* The size of the block loaded by a single prefetch. Usually, this is |
165 | the same as cache line size (at the moment, we only consider one level |
166 | of cache hierarchy). */ |
167 | |
168 | #ifndef PREFETCH_BLOCK |
169 | #define PREFETCH_BLOCK param_l1_cache_line_size |
170 | #endif |
171 | |
172 | /* Do we have forward hardware sequential prefetching? */ |
173 | |
174 | #ifndef HAVE_FORWARD_PREFETCH |
175 | #define HAVE_FORWARD_PREFETCH 0 |
176 | #endif |
177 | |
178 | /* Do we have backward hardware sequential prefetching? */ |
179 | |
180 | #ifndef HAVE_BACKWARD_PREFETCH |
181 | #define HAVE_BACKWARD_PREFETCH 0 |
182 | #endif |
183 | |
184 | /* In some cases we are only able to determine that there is a certain |
185 | probability that the two accesses hit the same cache line. In this |
186 | case, we issue the prefetches for both of them if this probability |
187 | is less than (1000 - ACCEPTABLE_MISS_RATE) per thousand. */ |
188 | |
189 | #ifndef ACCEPTABLE_MISS_RATE |
190 | #define ACCEPTABLE_MISS_RATE 50 |
191 | #endif |
192 | |
193 | #define L1_CACHE_SIZE_BYTES ((unsigned) (param_l1_cache_size * 1024)) |
194 | #define L2_CACHE_SIZE_BYTES ((unsigned) (param_l2_cache_size * 1024)) |
195 | |
196 | /* We consider a memory access nontemporal if it is not reused sooner than |
197 | after L2_CACHE_SIZE_BYTES of memory are accessed. However, we ignore |
198 | accesses closer than L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION, |
199 | so that we use nontemporal prefetches e.g. if a single memory location |
200 | is accessed several times in a single iteration of the loop. */ |
201 | #define NONTEMPORAL_FRACTION 16 |
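| |
| /* For instance, assuming a 32 kB L1 cache, reuses separated by less than |
| 32768 / 16 = 2048 bytes of intervening accesses are ignored here, so they |
| do not prevent an access from being classified as nontemporal. */ |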
202 | |
203 | /* In case we have to emit a memory fence instruction after the loop that |
204 | uses nontemporal stores, this defines the builtin to use. */ |
205 | |
206 | #ifndef FENCE_FOLLOWING_MOVNT |
207 | #define FENCE_FOLLOWING_MOVNT NULL_TREE |
208 | #endif |
209 | |
210 | /* It is not profitable to prefetch when the trip count is not at |
211 | least TRIP_COUNT_TO_AHEAD_RATIO times the prefetch ahead distance. |
212 | For example, in a loop with a prefetch ahead distance of 10, |
213 | supposing that TRIP_COUNT_TO_AHEAD_RATIO is equal to 4, it is |
214 | profitable to prefetch when the trip count is greater or equal to |
215 | 40. In that case, 30 out of the 40 iterations will benefit from |
216 | prefetching. */ |
217 | |
218 | #ifndef TRIP_COUNT_TO_AHEAD_RATIO |
219 | #define TRIP_COUNT_TO_AHEAD_RATIO 4 |
220 | #endif |
221 | |
222 | /* A group of references among which reuse may occur. */ |
223 | |
224 | struct mem_ref_group |
225 | { |
226 | tree base; /* Base of the reference. */ |
227 | tree step; /* Step of the reference. */ |
228 | struct mem_ref *refs; /* References in the group. */ |
229 | struct mem_ref_group *next; /* Next group of references. */ |
230 | unsigned int uid; /* Group UID, used only for debugging. */ |
231 | }; |
232 | |
233 | /* Assigned to PREFETCH_BEFORE when all iterations are to be prefetched. */ |
234 | |
235 | #define PREFETCH_ALL HOST_WIDE_INT_M1U |
236 | |
237 | /* Do not generate a prefetch if the unroll factor is significantly less |
238 | than what is required by the prefetch. This is to avoid redundant |
239 | prefetches. For example, when prefetch_mod is 16 and unroll_factor is |
240 | 2, prefetching requires unrolling the loop 16 times, but |
241 | the loop is actually unrolled twice. In this case (ratio = 8), |
242 | prefetching is not likely to be beneficial. */ |
243 | |
244 | #ifndef PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO |
245 | #define PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO 4 |
246 | #endif |
247 | |
248 | /* Some of the prefetch computations have quadratic complexity. We want to |
249 | avoid huge compile times and, therefore, want to limit the amount of |
250 | memory references per loop where we consider prefetching. */ |
251 | |
252 | #ifndef PREFETCH_MAX_MEM_REFS_PER_LOOP |
253 | #define PREFETCH_MAX_MEM_REFS_PER_LOOP 200 |
254 | #endif |
255 | |
256 | /* The memory reference. */ |
257 | |
258 | struct mem_ref |
259 | { |
260 | gimple *stmt; /* Statement in which the reference appears. */ |
261 | tree mem; /* The reference. */ |
262 | HOST_WIDE_INT delta; /* Constant offset of the reference. */ |
263 | struct mem_ref_group *group; /* The group of references it belongs to. */ |
264 | unsigned HOST_WIDE_INT prefetch_mod; |
265 | /* Prefetch only each PREFETCH_MOD-th |
266 | iteration. */ |
267 | unsigned HOST_WIDE_INT prefetch_before; |
268 | /* Prefetch only first PREFETCH_BEFORE |
269 | iterations. */ |
270 | unsigned reuse_distance; /* The amount of data accessed before the first |
271 | reuse of this value. */ |
272 | struct mem_ref *next; /* The next reference in the group. */ |
273 | unsigned int uid; /* Ref UID, used only for debugging. */ |
274 | unsigned write_p : 1; /* Is it a write? */ |
275 | unsigned independent_p : 1; /* True if the reference is independent of |
276 | all other references inside the loop. */ |
277 | unsigned issue_prefetch_p : 1; /* Should we really issue the prefetch? */ |
278 | unsigned storent_p : 1; /* True if we changed the store to a |
279 | nontemporal one. */ |
280 | }; |
281 | |
282 | /* Dumps information about a memory reference to FILE. */ |
283 | static void |
284 | dump_mem_details (FILE *file, tree base, tree step, |
285 | HOST_WIDE_INT delta, bool write_p) |
286 | { |
287 | fprintf (file, "(base "); |
288 | print_generic_expr (file, base, TDF_SLIM); |
289 | fprintf (file, ", step "); |
290 | if (cst_and_fits_in_hwi (step)) |
291 | fprintf (file, HOST_WIDE_INT_PRINT_DEC, int_cst_value (step)); |
292 | else |
293 | print_generic_expr (file, step, TDF_SLIM); |
294 | fprintf (file, ")\n"); |
295 | fprintf (file, " delta " HOST_WIDE_INT_PRINT_DEC "\n", delta); |
296 | fprintf (file, " %s\n\n", write_p ? "write" : "read"); |
297 | } |
298 | |
299 | /* Dumps information about reference REF to FILE. */ |
300 | |
301 | static void |
302 | dump_mem_ref (FILE *file, struct mem_ref *ref) |
303 | { |
304 | fprintf (file, "reference %u:%u (", ref->group->uid, ref->uid); |
305 | print_generic_expr (file, ref->mem, TDF_SLIM); |
306 | fprintf (file, ")\n"); |
307 | } |
308 | |
309 | /* Finds a group with BASE and STEP in GROUPS, or creates one if it does not |
310 | exist. */ |
311 | |
312 | static struct mem_ref_group * |
313 | find_or_create_group (struct mem_ref_group **groups, tree base, tree step) |
314 | { |
315 | /* Global count for setting struct mem_ref_group->uid. */ |
316 | static unsigned int last_mem_ref_group_uid = 0; |
317 | |
318 | struct mem_ref_group *group; |
319 | |
320 | for (; *groups; groups = &(*groups)->next) |
321 | { |
322 | if (operand_equal_p ((*groups)->step, step, 0) |
323 | && operand_equal_p ((*groups)->base, base, 0)) |
324 | return *groups; |
325 | |
326 | /* If step is an integer constant, keep the list of groups sorted |
327 | by decreasing step. */ |
328 | if (cst_and_fits_in_hwi ((*groups)->step) && cst_and_fits_in_hwi (step) |
329 | && int_cst_value ((*groups)->step) < int_cst_value (step)) |
330 | break; |
331 | } |
332 | |
333 | group = XNEW (struct mem_ref_group); |
334 | group->base = base; |
335 | group->step = step; |
336 | group->refs = NULL; |
337 | group->uid = ++last_mem_ref_group_uid; |
338 | group->next = *groups; |
339 | *groups = group; |
340 | |
341 | return group; |
342 | } |
343 | |
344 | /* Records a memory reference MEM in GROUP with offset DELTA and write status |
345 | WRITE_P. The reference occurs in statement STMT. */ |
346 | |
347 | static void |
348 | record_ref (struct mem_ref_group *group, gimple *stmt, tree mem, |
349 | HOST_WIDE_INT delta, bool write_p) |
350 | { |
351 | unsigned int last_mem_ref_uid = 0; |
352 | struct mem_ref **aref; |
353 | |
354 | /* Do not record the same address twice. */ |
355 | for (aref = &group->refs; *aref; aref = &(*aref)->next) |
356 | { |
357 | last_mem_ref_uid = (*aref)->uid; |
358 | |
359 | /* A write reference may not be able to reuse a read prefetch, or |
360 | vice versa. */ |
361 | if (!WRITE_CAN_USE_READ_PREFETCH |
362 | && write_p |
363 | && !(*aref)->write_p) |
364 | continue; |
365 | if (!READ_CAN_USE_WRITE_PREFETCH |
366 | && !write_p |
367 | && (*aref)->write_p) |
368 | continue; |
369 | |
370 | if ((*aref)->delta == delta) |
371 | return; |
372 | } |
373 | |
374 | (*aref) = XNEW (struct mem_ref); |
375 | (*aref)->stmt = stmt; |
376 | (*aref)->mem = mem; |
377 | (*aref)->delta = delta; |
378 | (*aref)->write_p = write_p; |
379 | (*aref)->prefetch_before = PREFETCH_ALL; |
380 | (*aref)->prefetch_mod = 1; |
381 | (*aref)->reuse_distance = 0; |
382 | (*aref)->issue_prefetch_p = false; |
383 | (*aref)->group = group; |
384 | (*aref)->next = NULL; |
385 | (*aref)->independent_p = false; |
386 | (*aref)->storent_p = false; |
387 | (*aref)->uid = last_mem_ref_uid + 1; |
388 | |
389 | if (dump_file && (dump_flags & TDF_DETAILS)) |
390 | { |
391 | dump_mem_ref (dump_file, *aref); |
392 | |
393 | fprintf (dump_file, " group %u ", group->uid); |
394 | dump_mem_details (dump_file, group->base, group->step, delta, |
395 | write_p); |
396 | } |
397 | } |
398 | |
399 | /* Release memory references in GROUPS. */ |
400 | |
401 | static void |
402 | release_mem_refs (struct mem_ref_group *groups) |
403 | { |
404 | struct mem_ref_group *next_g; |
405 | struct mem_ref *ref, *next_r; |
406 | |
407 | for (; groups; groups = next_g) |
408 | { |
409 | next_g = groups->next; |
410 | for (ref = groups->refs; ref; ref = next_r) |
411 | { |
412 | next_r = ref->next; |
413 | free (ref); |
414 | } |
415 | free (groups); |
416 | } |
417 | } |
418 | |
419 | /* A structure used to pass arguments to idx_analyze_ref. */ |
420 | |
421 | struct ar_data |
422 | { |
423 | class loop *loop; /* Loop of the reference. */ |
424 | gimple *stmt; /* Statement of the reference. */ |
425 | tree *step; /* Step of the memory reference. */ |
426 | HOST_WIDE_INT *delta; /* Offset of the memory reference. */ |
427 | }; |
428 | |
429 | /* Analyzes a single INDEX of a memory reference to obtain information |
430 | described at analyze_ref. Callback for for_each_index. */ |
431 | |
432 | static bool |
433 | idx_analyze_ref (tree base, tree *index, void *data) |
434 | { |
435 | struct ar_data *ar_data = (struct ar_data *) data; |
436 | tree ibase, step, stepsize; |
437 | HOST_WIDE_INT idelta = 0, imult = 1; |
438 | affine_iv iv; |
439 | |
440 | if (!simple_iv (ar_data->loop, loop_containing_stmt (ar_data->stmt), |
441 | *index, &iv, true)) |
442 | return false; |
443 | ibase = iv.base; |
444 | step = iv.step; |
445 | |
446 | if (TREE_CODE (ibase) == POINTER_PLUS_EXPR |
447 | && cst_and_fits_in_hwi (TREE_OPERAND (ibase, 1))) |
448 | { |
449 | idelta = int_cst_value (TREE_OPERAND (ibase, 1)); |
450 | ibase = TREE_OPERAND (ibase, 0); |
451 | } |
452 | if (cst_and_fits_in_hwi (ibase)) |
453 | { |
454 | idelta += int_cst_value (ibase); |
455 | ibase = build_int_cst (TREE_TYPE (ibase), 0); |
456 | } |
457 | |
458 | if (TREE_CODE (base) == ARRAY_REF) |
459 | { |
460 | stepsize = array_ref_element_size (base); |
461 | if (!cst_and_fits_in_hwi (stepsize)) |
462 | return false; |
463 | imult = int_cst_value (stepsize); |
464 | step = fold_build2 (MULT_EXPR, sizetype, |
465 | fold_convert (sizetype, step), |
466 | fold_convert (sizetype, stepsize)); |
467 | idelta *= imult; |
468 | } |
469 | |
470 | if (*ar_data->step == NULL_TREE) |
471 | *ar_data->step = step; |
472 | else |
473 | *ar_data->step = fold_build2 (PLUS_EXPR, sizetype, |
474 | fold_convert (sizetype, *ar_data->step), |
475 | fold_convert (sizetype, step)); |
476 | *ar_data->delta += idelta; |
477 | *index = ibase; |
478 | |
479 | return true; |
480 | } |
481 | |
482 | /* Tries to express REF_P in shape &BASE + STEP * iter + DELTA, where DELTA and |
483 | STEP are integer constants and iter is the number of iterations of LOOP. The |
484 | reference occurs in statement STMT. Strips nonaddressable component |
485 | references from REF_P. */ |
486 | |
487 | static bool |
488 | analyze_ref (class loop *loop, tree *ref_p, tree *base, |
489 | tree *step, HOST_WIDE_INT *delta, |
490 | gimple *stmt) |
491 | { |
492 | struct ar_data ar_data; |
493 | tree off; |
494 | HOST_WIDE_INT bit_offset; |
495 | tree ref = *ref_p; |
496 | |
497 | *step = NULL_TREE; |
498 | *delta = 0; |
499 | |
500 | /* First strip off the component references. Ignore bitfields. |
501 | Also strip off the real and imaginary parts of a complex, so that |
502 | they can have the same base. */ |
503 | if (TREE_CODE (ref) == REALPART_EXPR |
504 | || TREE_CODE (ref) == IMAGPART_EXPR |
505 | || (TREE_CODE (ref) == COMPONENT_REF |
506 | && DECL_NONADDRESSABLE_P (TREE_OPERAND (ref, 1)))) |
507 | { |
508 | if (TREE_CODE (ref) == IMAGPART_EXPR) |
509 | *delta += int_size_in_bytes (TREE_TYPE (ref)); |
510 | ref = TREE_OPERAND (ref, 0); |
511 | } |
512 | |
513 | *ref_p = ref; |
514 | |
515 | for (; TREE_CODE (ref) == COMPONENT_REF; ref = TREE_OPERAND (ref, 0)) |
516 | { |
517 | off = DECL_FIELD_BIT_OFFSET (TREE_OPERAND (ref, 1)); |
518 | bit_offset = TREE_INT_CST_LOW (off); |
519 | gcc_assert (bit_offset % BITS_PER_UNIT == 0); |
520 | |
521 | *delta += bit_offset / BITS_PER_UNIT; |
522 | } |
523 | |
524 | *base = unshare_expr (ref); |
525 | ar_data.loop = loop; |
526 | ar_data.stmt = stmt; |
527 | ar_data.step = step; |
528 | ar_data.delta = delta; |
529 | return for_each_index (base, idx_analyze_ref, &ar_data); |
530 | } |
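| |
| /* For illustration only (a hypothetical fragment, not taken from the |
| sources): assuming |
| |
| int a[1000]; |
| for (i = 0; i < n; i++) |
| ... = a[i + 3]; |
| |
| analyze_ref expresses the read of a[i + 3] roughly as |
| &BASE + STEP * iter + DELTA with STEP = 4 and DELTA = 12, i.e. the |
| constant part of the index scaled by the 4-byte element size, while the |
| variable part becomes the per-iteration step. */ |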
531 | |
532 | /* Record a memory reference REF to the list REFS. The reference occurs in |
533 | LOOP in statement STMT and it is write if WRITE_P. Returns true if the |
534 | reference was recorded, false otherwise. */ |
535 | |
536 | static bool |
537 | gather_memory_references_ref (class loop *loop, struct mem_ref_group **refs, |
538 | tree ref, bool write_p, gimple *stmt) |
539 | { |
540 | tree base, step; |
541 | HOST_WIDE_INT delta; |
542 | struct mem_ref_group *agrp; |
543 | |
544 | if (get_base_address (ref) == NULL) |
545 | return false; |
546 | |
547 | if (!analyze_ref (loop, &ref, &base, &step, &delta, stmt)) |
548 | return false; |
549 | /* If analyze_ref fails the default is a NULL_TREE. We can stop here. */ |
550 | if (step == NULL_TREE) |
551 | return false; |
552 | |
553 | /* Stop if the address of BASE could not be taken. */ |
554 | if (may_be_nonaddressable_p (base)) |
555 | return false; |
556 | |
557 | /* Limit non-constant step prefetching only to the innermost loops and |
558 | only when the step is loop invariant in the entire loop nest. */ |
559 | if (!cst_and_fits_in_hwi (step)) |
560 | { |
561 | if (loop->inner != NULL) |
562 | { |
563 | if (dump_file && (dump_flags & TDF_DETAILS)) |
564 | { |
565 | fprintf (stream: dump_file, format: "Memory expression %p\n" ,(void *) ref ); |
566 | print_generic_expr (dump_file, ref, TDF_SLIM); |
567 | fprintf (stream: dump_file,format: ":" ); |
568 | dump_mem_details (file: dump_file, base, step, delta, write_p); |
569 | fprintf (stream: dump_file, |
570 | format: "Ignoring %p, non-constant step prefetching is " |
571 | "limited to inner most loops \n" , |
572 | (void *) ref); |
573 | } |
574 | return false; |
575 | } |
576 | else |
577 | { |
578 | if (!expr_invariant_in_loop_p (loop_outermost (loop), step)) |
579 | { |
580 | if (dump_file && (dump_flags & TDF_DETAILS)) |
581 | { |
582 | fprintf (stream: dump_file, format: "Memory expression %p\n" ,(void *) ref ); |
583 | print_generic_expr (dump_file, ref, TDF_SLIM); |
584 | fprintf (stream: dump_file,format: ":" ); |
585 | dump_mem_details (file: dump_file, base, step, delta, write_p); |
586 | fprintf (stream: dump_file, |
587 | format: "Not prefetching, ignoring %p due to " |
588 | "loop variant step\n" , |
589 | (void *) ref); |
590 | } |
591 | return false; |
592 | } |
593 | } |
594 | } |
595 | |
596 | /* Now we know that REF = &BASE + STEP * iter + DELTA, where DELTA and STEP |
597 | are integer constants. */ |
598 | agrp = find_or_create_group (refs, base, step); |
599 | record_ref (agrp, stmt, ref, delta, write_p); |
600 | |
601 | return true; |
602 | } |
603 | |
604 | /* Record the suitable memory references in LOOP. NO_OTHER_REFS is set to |
605 | true if there are no other memory references inside the loop. */ |
606 | |
607 | static struct mem_ref_group * |
608 | gather_memory_references (class loop *loop, bool *no_other_refs, unsigned *ref_count) |
609 | { |
610 | basic_block *body = get_loop_body_in_dom_order (loop); |
611 | basic_block bb; |
612 | unsigned i; |
613 | gimple_stmt_iterator bsi; |
614 | gimple *stmt; |
615 | tree lhs, rhs; |
616 | struct mem_ref_group *refs = NULL; |
617 | |
618 | *no_other_refs = true; |
619 | *ref_count = 0; |
620 | |
621 | /* Scan the loop body in order, so that the former references precede the |
622 | later ones. */ |
623 | for (i = 0; i < loop->num_nodes; i++) |
624 | { |
625 | bb = body[i]; |
626 | if (bb->loop_father != loop) |
627 | continue; |
628 | |
629 | for (bsi = gsi_start_bb (bb); !gsi_end_p (bsi); gsi_next (&bsi)) |
630 | { |
631 | stmt = gsi_stmt (bsi); |
632 | |
633 | if (gimple_code (stmt) != GIMPLE_ASSIGN) |
634 | { |
635 | if (gimple_vuse (stmt) |
636 | || (is_gimple_call (stmt) |
637 | && !(gimple_call_flags (stmt) & ECF_CONST))) |
638 | *no_other_refs = false; |
639 | continue; |
640 | } |
641 | |
642 | if (! gimple_vuse (stmt)) |
643 | continue; |
644 | |
645 | lhs = gimple_assign_lhs (stmt); |
646 | rhs = gimple_assign_rhs1 (stmt); |
647 | |
648 | if (REFERENCE_CLASS_P (rhs)) |
649 | { |
650 | *no_other_refs &= gather_memory_references_ref (loop, &refs, |
651 | rhs, false, stmt); |
652 | *ref_count += 1; |
653 | } |
654 | if (REFERENCE_CLASS_P (lhs)) |
655 | { |
656 | *no_other_refs &= gather_memory_references_ref (loop, &refs, |
657 | lhs, true, stmt); |
658 | *ref_count += 1; |
659 | } |
660 | } |
661 | } |
662 | free (body); |
663 | |
664 | return refs; |
665 | } |
666 | |
667 | /* Prune the prefetch candidate REF using the self-reuse. */ |
668 | |
669 | static void |
670 | prune_ref_by_self_reuse (struct mem_ref *ref) |
671 | { |
672 | HOST_WIDE_INT step; |
673 | bool backward; |
674 | |
675 | /* If the step size is non-constant, we cannot calculate prefetch_mod. */ |
676 | if (!cst_and_fits_in_hwi (ref->group->step)) |
677 | return; |
678 | |
679 | step = int_cst_value (ref->group->step); |
680 | |
681 | backward = step < 0; |
682 | |
683 | if (step == 0) |
684 | { |
685 | /* Prefetch references to invariant address just once. */ |
686 | ref->prefetch_before = 1; |
687 | return; |
688 | } |
689 | |
690 | if (backward) |
691 | step = -step; |
692 | |
693 | if (step > PREFETCH_BLOCK) |
694 | return; |
695 | |
696 | if ((backward && HAVE_BACKWARD_PREFETCH) |
697 | || (!backward && HAVE_FORWARD_PREFETCH)) |
698 | { |
699 | ref->prefetch_before = 1; |
700 | return; |
701 | } |
702 | |
703 | ref->prefetch_mod = PREFETCH_BLOCK / step; |
704 | } |
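| |
| /* For instance, reference (3) from the example at the top of the file |
| advances by 16 bytes per iteration; with PREFETCH_BLOCK == 64 and no |
| hardware sequential prefetch this yields prefetch_mod = 64 / 16 = 4, |
| matching the PREFETCH_MOD 4 claimed there. */ |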
705 | |
706 | /* Divides X by BY, rounding down. */ |
707 | |
708 | static HOST_WIDE_INT |
709 | ddown (HOST_WIDE_INT x, unsigned HOST_WIDE_INT by) |
710 | { |
711 | gcc_assert (by > 0); |
712 | |
713 | if (x >= 0) |
714 | return x / (HOST_WIDE_INT) by; |
715 | else |
716 | return (x + (HOST_WIDE_INT) by - 1) / (HOST_WIDE_INT) by; |
717 | } |
718 | |
719 | /* Given a CACHE_LINE_SIZE and two inductive memory references |
720 | with a common STEP greater than CACHE_LINE_SIZE and an address |
721 | difference DELTA, compute the probability that they will fall |
722 | in different cache lines. Return true if the computed miss rate |
723 | is not greater than the ACCEPTABLE_MISS_RATE. DISTINCT_ITERS is the |
724 | number of distinct iterations after which the pattern repeats itself. |
725 | ALIGN_UNIT is the unit of alignment in bytes. */ |
726 | |
727 | static bool |
728 | is_miss_rate_acceptable (unsigned HOST_WIDE_INT cache_line_size, |
729 | HOST_WIDE_INT step, HOST_WIDE_INT delta, |
730 | unsigned HOST_WIDE_INT distinct_iters, |
731 | int align_unit) |
732 | { |
733 | unsigned align, iter; |
734 | int total_positions, miss_positions, max_allowed_miss_positions; |
735 | int address1, address2, cache_line1, cache_line2; |
736 | |
737 | /* It always misses if delta is greater than or equal to the cache |
738 | line size. */ |
739 | if (delta >= (HOST_WIDE_INT) cache_line_size) |
740 | return false; |
741 | |
742 | miss_positions = 0; |
743 | total_positions = (cache_line_size / align_unit) * distinct_iters; |
744 | max_allowed_miss_positions = (ACCEPTABLE_MISS_RATE * total_positions) / 1000; |
745 | |
746 | /* Iterate through all possible alignments of the first |
747 | memory reference within its cache line. */ |
748 | for (align = 0; align < cache_line_size; align += align_unit) |
749 | |
750 | /* Iterate through all distinct iterations. */ |
751 | for (iter = 0; iter < distinct_iters; iter++) |
752 | { |
753 | address1 = align + step * iter; |
754 | address2 = address1 + delta; |
755 | cache_line1 = address1 / cache_line_size; |
756 | cache_line2 = address2 / cache_line_size; |
757 | if (cache_line1 != cache_line2) |
758 | { |
759 | miss_positions += 1; |
760 | if (miss_positions > max_allowed_miss_positions) |
761 | return false; |
762 | } |
763 | } |
764 | return true; |
765 | } |
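| |
| /* As a worked instance, take references (4) and (5) from the example at the |
| top of the file: their addresses differ by 50 bytes, so with a 64-byte |
| line they share a cache line with probability (64 - 50) / 64 = 7/32. The |
| resulting miss rate of 25/32 is far above ACCEPTABLE_MISS_RATE (50 per |
| thousand), so the reuse is not trusted and both references remain |
| prefetch candidates. */ |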
766 | |
767 | /* Prune the prefetch candidate REF using the reuse with BY. |
768 | If BY_IS_BEFORE is true, BY is before REF in the loop. */ |
769 | |
770 | static void |
771 | prune_ref_by_group_reuse (struct mem_ref *ref, struct mem_ref *by, |
772 | bool by_is_before) |
773 | { |
774 | HOST_WIDE_INT step; |
775 | bool backward; |
776 | HOST_WIDE_INT delta_r = ref->delta, delta_b = by->delta; |
777 | HOST_WIDE_INT delta = delta_b - delta_r; |
778 | HOST_WIDE_INT hit_from; |
779 | unsigned HOST_WIDE_INT prefetch_before, prefetch_block; |
780 | HOST_WIDE_INT reduced_step; |
781 | unsigned HOST_WIDE_INT reduced_prefetch_block; |
782 | tree ref_type; |
783 | int align_unit; |
784 | |
785 | /* If the step is non-constant we cannot calculate prefetch_before. */ |
786 | if (!cst_and_fits_in_hwi (ref->group->step)) |
787 | return; |
789 | |
790 | step = int_cst_value (ref->group->step); |
791 | |
792 | backward = step < 0; |
793 | |
794 | |
795 | if (delta == 0) |
796 | { |
797 | /* If the references have the same address, only prefetch the |
798 | former. */ |
799 | if (by_is_before) |
800 | ref->prefetch_before = 0; |
801 | |
802 | return; |
803 | } |
804 | |
805 | if (!step) |
806 | { |
807 | /* If the reference addresses are invariant and fall into the |
808 | same cache line, prefetch just the first one. */ |
809 | if (!by_is_before) |
810 | return; |
811 | |
812 | if (ddown (ref->delta, PREFETCH_BLOCK) |
813 | != ddown (by->delta, PREFETCH_BLOCK)) |
814 | return; |
815 | |
816 | ref->prefetch_before = 0; |
817 | return; |
818 | } |
819 | |
820 | /* Only prune the reference that is behind in the array. */ |
821 | if (backward) |
822 | { |
823 | if (delta > 0) |
824 | return; |
825 | |
826 | /* Transform the data so that we may assume that the accesses |
827 | are forward. */ |
828 | delta = - delta; |
829 | step = -step; |
830 | delta_r = PREFETCH_BLOCK - 1 - delta_r; |
831 | delta_b = PREFETCH_BLOCK - 1 - delta_b; |
832 | } |
833 | else |
834 | { |
835 | if (delta < 0) |
836 | return; |
837 | } |
838 | |
839 | /* Check whether the two references are likely to hit the same cache |
840 | line, and how far apart the iterations in which that occurs are from |
841 | each other. */ |
842 | |
843 | if (step <= PREFETCH_BLOCK) |
844 | { |
845 | /* The accesses are sure to meet. Let us check when. */ |
846 | hit_from = ddown (delta_b, PREFETCH_BLOCK) * PREFETCH_BLOCK; |
847 | prefetch_before = (hit_from - delta_r + step - 1) / step; |
848 | |
849 | /* Do not reduce prefetch_before if we meet beyond cache size. */ |
850 | if (prefetch_before > absu_hwi (L2_CACHE_SIZE_BYTES / step)) |
851 | prefetch_before = PREFETCH_ALL; |
852 | if (prefetch_before < ref->prefetch_before) |
853 | ref->prefetch_before = prefetch_before; |
854 | |
855 | return; |
856 | } |
857 | |
858 | /* A more complicated case with step > prefetch_block. First reduce |
859 | the ratio between the step and the cache line size to its simplest |
860 | terms. The resulting denominator will then represent the number of |
861 | distinct iterations after which each address will go back to its |
862 | initial location within the cache line. This computation assumes |
863 | that PREFETCH_BLOCK is a power of two. */ |
864 | prefetch_block = PREFETCH_BLOCK; |
865 | reduced_prefetch_block = prefetch_block; |
866 | reduced_step = step; |
867 | while ((reduced_step & 1) == 0 |
868 | && reduced_prefetch_block > 1) |
869 | { |
870 | reduced_step >>= 1; |
871 | reduced_prefetch_block >>= 1; |
872 | } |
873 | |
874 | prefetch_before = delta / step; |
875 | delta %= step; |
876 | ref_type = TREE_TYPE (ref->mem); |
877 | align_unit = TYPE_ALIGN (ref_type) / 8; |
878 | if (is_miss_rate_acceptable (prefetch_block, step, delta, |
879 | reduced_prefetch_block, align_unit)) |
880 | { |
881 | /* Do not reduce prefetch_before if we meet beyond cache size. */ |
882 | if (prefetch_before > L2_CACHE_SIZE_BYTES / PREFETCH_BLOCK) |
883 | prefetch_before = PREFETCH_ALL; |
884 | if (prefetch_before < ref->prefetch_before) |
885 | ref->prefetch_before = prefetch_before; |
886 | |
887 | return; |
888 | } |
889 | |
890 | /* Try also the following iteration. */ |
891 | prefetch_before++; |
892 | delta = step - delta; |
893 | if (is_miss_rate_acceptable (prefetch_block, step, delta, |
894 | reduced_prefetch_block, align_unit)) |
895 | { |
896 | if (prefetch_before < ref->prefetch_before) |
897 | ref->prefetch_before = prefetch_before; |
898 | |
899 | return; |
900 | } |
901 | |
902 | /* The ref probably does not reuse by. */ |
903 | return; |
904 | } |
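| |
| /* Using the example at the top of the file again: for (1) a[i] and (2) |
| a[i + 64] the group step is 1 and the deltas differ by 64, so the code |
| above computes hit_from = 64 and prefetch_before = 64 for (1), matching |
| the PREFETCH_BEFORE 64 stated there. */ |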
905 | |
906 | /* Prune the prefetch candidate REF using the reuses with other references |
907 | in REFS. */ |
908 | |
909 | static void |
910 | prune_ref_by_reuse (struct mem_ref *ref, struct mem_ref *refs) |
911 | { |
912 | struct mem_ref *prune_by; |
913 | bool before = true; |
914 | |
915 | prune_ref_by_self_reuse (ref); |
916 | |
917 | for (prune_by = refs; prune_by; prune_by = prune_by->next) |
918 | { |
919 | if (prune_by == ref) |
920 | { |
921 | before = false; |
922 | continue; |
923 | } |
924 | |
925 | if (!WRITE_CAN_USE_READ_PREFETCH |
926 | && ref->write_p |
927 | && !prune_by->write_p) |
928 | continue; |
929 | if (!READ_CAN_USE_WRITE_PREFETCH |
930 | && !ref->write_p |
931 | && prune_by->write_p) |
932 | continue; |
933 | |
934 | prune_ref_by_group_reuse (ref, prune_by, before); |
935 | } |
936 | } |
937 | |
938 | /* Prune the prefetch candidates in GROUP using the reuse analysis. */ |
939 | |
940 | static void |
941 | prune_group_by_reuse (struct mem_ref_group *group) |
942 | { |
943 | struct mem_ref *ref_pruned; |
944 | |
945 | for (ref_pruned = group->refs; ref_pruned; ref_pruned = ref_pruned->next) |
946 | { |
947 | prune_ref_by_reuse (ref_pruned, group->refs); |
948 | |
949 | if (dump_file && (dump_flags & TDF_DETAILS)) |
950 | { |
951 | dump_mem_ref (dump_file, ref_pruned); |
952 | |
953 | if (ref_pruned->prefetch_before == PREFETCH_ALL |
954 | && ref_pruned->prefetch_mod == 1) |
955 | fprintf (stream: dump_file, format: " no restrictions" ); |
956 | else if (ref_pruned->prefetch_before == 0) |
957 | fprintf (stream: dump_file, format: " do not prefetch" ); |
958 | else if (ref_pruned->prefetch_before <= ref_pruned->prefetch_mod) |
959 | fprintf (stream: dump_file, format: " prefetch once" ); |
960 | else |
961 | { |
962 | if (ref_pruned->prefetch_before != PREFETCH_ALL) |
963 | { |
964 | fprintf (stream: dump_file, format: " prefetch before " ); |
965 | fprintf (stream: dump_file, HOST_WIDE_INT_PRINT_DEC, |
966 | ref_pruned->prefetch_before); |
967 | } |
968 | if (ref_pruned->prefetch_mod != 1) |
969 | { |
970 | fprintf (stream: dump_file, format: " prefetch mod " ); |
971 | fprintf (stream: dump_file, HOST_WIDE_INT_PRINT_DEC, |
972 | ref_pruned->prefetch_mod); |
973 | } |
974 | } |
975 | fprintf (dump_file, "\n"); |
976 | } |
977 | } |
978 | } |
979 | |
980 | /* Prune the list of prefetch candidates GROUPS using the reuse analysis. */ |
981 | |
982 | static void |
983 | prune_by_reuse (struct mem_ref_group *groups) |
984 | { |
985 | for (; groups; groups = groups->next) |
986 | prune_group_by_reuse (groups); |
987 | } |
988 | |
989 | /* Returns true if we should issue prefetch for REF. */ |
990 | |
991 | static bool |
992 | should_issue_prefetch_p (struct mem_ref *ref) |
993 | { |
994 | /* Do we want to issue prefetches for non-constant strides? */ |
995 | if (!cst_and_fits_in_hwi (ref->group->step) |
996 | && param_prefetch_dynamic_strides == 0) |
997 | { |
998 | if (dump_file && (dump_flags & TDF_DETAILS)) |
999 | fprintf (dump_file, |
1000 | "Skipping non-constant step for reference %u:%u\n", |
1001 | ref->group->uid, ref->uid); |
1002 | return false; |
1003 | } |
1004 | |
1005 | /* Some processors may have a hardware prefetcher that may conflict with |
1006 | prefetch hints for a range of strides. Make sure we don't issue |
1007 | prefetches for such cases if the stride is within this particular |
1008 | range. */ |
1009 | if (cst_and_fits_in_hwi (ref->group->step) |
1010 | && abs_hwi (int_cst_value (ref->group->step)) |
1011 | < (HOST_WIDE_INT) param_prefetch_minimum_stride) |
1012 | { |
1013 | if (dump_file && (dump_flags & TDF_DETAILS)) |
1014 | fprintf (dump_file, |
1015 | "Step for reference %u:%u (" HOST_WIDE_INT_PRINT_DEC |
1016 | ") is less than the minimum required stride of %d\n", |
1017 | ref->group->uid, ref->uid, int_cst_value (ref->group->step), |
1018 | param_prefetch_minimum_stride); |
1019 | return false; |
1020 | } |
1021 | |
1022 | /* For now do not issue prefetches for only first few of the |
1023 | iterations. */ |
1024 | if (ref->prefetch_before != PREFETCH_ALL) |
1025 | { |
1026 | if (dump_file && (dump_flags & TDF_DETAILS)) |
1027 | fprintf (stream: dump_file, format: "Ignoring reference %u:%u due to prefetch_before\n" , |
1028 | ref->group->uid, ref->uid); |
1029 | return false; |
1030 | } |
1031 | |
1032 | /* Do not prefetch nontemporal stores. */ |
1033 | if (ref->storent_p) |
1034 | { |
1035 | if (dump_file && (dump_flags & TDF_DETAILS)) |
1036 | fprintf (stream: dump_file, format: "Ignoring nontemporal store reference %u:%u\n" , ref->group->uid, ref->uid); |
1037 | return false; |
1038 | } |
1039 | |
1040 | return true; |
1041 | } |
1042 | |
1043 | /* Decide which of the prefetch candidates in GROUPS to prefetch. |
1044 | AHEAD is the number of iterations to prefetch ahead (which corresponds |
1045 | to the number of simultaneous instances of one prefetch running at a |
1046 | time). UNROLL_FACTOR is the factor by which the loop is going to be |
1047 | unrolled. Returns true if there is anything to prefetch. */ |
1048 | |
1049 | static bool |
1050 | schedule_prefetches (struct mem_ref_group *groups, unsigned unroll_factor, |
1051 | unsigned ahead) |
1052 | { |
1053 | unsigned remaining_prefetch_slots, n_prefetches, prefetch_slots; |
1054 | unsigned slots_per_prefetch; |
1055 | struct mem_ref *ref; |
1056 | bool any = false; |
1057 | |
1058 | /* At most param_simultaneous_prefetches should be running |
1059 | at the same time. */ |
1060 | remaining_prefetch_slots = param_simultaneous_prefetches; |
1061 | |
1062 | /* The prefetch will run for AHEAD iterations of the original loop, i.e., |
1063 | AHEAD / UNROLL_FACTOR iterations of the unrolled loop. In each iteration, |
1064 | it will need a prefetch slot. */ |
1065 | slots_per_prefetch = (ahead + unroll_factor / 2) / unroll_factor; |
1066 | if (dump_file && (dump_flags & TDF_DETAILS)) |
1067 | fprintf (stream: dump_file, format: "Each prefetch instruction takes %u prefetch slots.\n" , |
1068 | slots_per_prefetch); |
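| |
| /* For instance, with AHEAD == 8 and UNROLL_FACTOR == 4, each emitted |
| prefetch instruction occupies (8 + 2) / 4 = 2 of the available |
| simultaneous prefetch slots. */ |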
1069 | |
1070 | /* For now we just take memory references one by one and issue |
1071 | prefetches for as many as possible. The groups are sorted |
1072 | starting with the largest step, since the references with |
1073 | large step are more likely to cause many cache misses. */ |
1074 | |
1075 | for (; groups; groups = groups->next) |
1076 | for (ref = groups->refs; ref; ref = ref->next) |
1077 | { |
1078 | if (!should_issue_prefetch_p (ref)) |
1079 | continue; |
1080 | |
1081 | /* The loop is far from being sufficiently unrolled for this |
1082 | prefetch. Do not generate the prefetch, to avoid many redundant |
1083 | prefetches. */ |
1084 | if (ref->prefetch_mod / unroll_factor > PREFETCH_MOD_TO_UNROLL_FACTOR_RATIO) |
1085 | continue; |
1086 | |
1087 | /* If we need to prefetch the reference each PREFETCH_MOD iterations, |
1088 | and we unroll the loop UNROLL_FACTOR times, we need to insert |
1089 | ceil (UNROLL_FACTOR / PREFETCH_MOD) instructions in each |
1090 | iteration. */ |
1091 | n_prefetches = ((unroll_factor + ref->prefetch_mod - 1) |
1092 | / ref->prefetch_mod); |
1093 | prefetch_slots = n_prefetches * slots_per_prefetch; |
1094 | |
1095 | /* If more than half of the prefetches would be lost anyway, do not |
1096 | issue the prefetch. */ |
1097 | if (2 * remaining_prefetch_slots < prefetch_slots) |
1098 | continue; |
1099 | |
1100 | /* Stop prefetching if debug counter is activated. */ |
1101 | if (!dbg_cnt (prefetch)) |
1102 | continue; |
1103 | |
1104 | ref->issue_prefetch_p = true; |
1105 | if (dump_file && (dump_flags & TDF_DETAILS)) |
1106 | fprintf (stream: dump_file, format: "Decided to issue prefetch for reference %u:%u\n" , |
1107 | ref->group->uid, ref->uid); |
1108 | |
1109 | if (remaining_prefetch_slots <= prefetch_slots) |
1110 | return true; |
1111 | remaining_prefetch_slots -= prefetch_slots; |
1112 | any = true; |
1113 | } |
1114 | |
1115 | return any; |
1116 | } |
1117 | |
1118 | /* Return TRUE if no prefetch is going to be generated in the given |
1119 | GROUPS. */ |
1120 | |
1121 | static bool |
1122 | nothing_to_prefetch_p (struct mem_ref_group *groups) |
1123 | { |
1124 | struct mem_ref *ref; |
1125 | |
1126 | for (; groups; groups = groups->next) |
1127 | for (ref = groups->refs; ref; ref = ref->next) |
1128 | if (should_issue_prefetch_p (ref)) |
1129 | return false; |
1130 | |
1131 | return true; |
1132 | } |
1133 | |
1134 | /* Estimate the number of prefetches in the given GROUPS. |
1135 | UNROLL_FACTOR is the factor by which LOOP was unrolled. */ |
1136 | |
1137 | static int |
1138 | estimate_prefetch_count (struct mem_ref_group *groups, unsigned unroll_factor) |
1139 | { |
1140 | struct mem_ref *ref; |
1141 | unsigned n_prefetches; |
1142 | int prefetch_count = 0; |
1143 | |
1144 | for (; groups; groups = groups->next) |
1145 | for (ref = groups->refs; ref; ref = ref->next) |
1146 | if (should_issue_prefetch_p (ref)) |
1147 | { |
1148 | n_prefetches = ((unroll_factor + ref->prefetch_mod - 1) |
1149 | / ref->prefetch_mod); |
1150 | prefetch_count += n_prefetches; |
1151 | } |
1152 | |
1153 | return prefetch_count; |
1154 | } |
1155 | |
1156 | /* Issue prefetches for the reference REF into loop as decided before. |
1157 | AHEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR |
1158 | is the factor by which LOOP was unrolled. */ |
1159 | |
1160 | static void |
1161 | issue_prefetch_ref (struct mem_ref *ref, unsigned unroll_factor, unsigned ahead) |
1162 | { |
1163 | HOST_WIDE_INT delta; |
1164 | tree addr, addr_base, write_p, local, forward; |
1165 | gcall *prefetch; |
1166 | gimple_stmt_iterator bsi; |
1167 | unsigned n_prefetches, ap; |
1168 | bool nontemporal = ref->reuse_distance >= L2_CACHE_SIZE_BYTES; |
1169 | |
1170 | if (dump_file && (dump_flags & TDF_DETAILS)) |
1171 | fprintf (stream: dump_file, format: "Issued%s prefetch for reference %u:%u.\n" , |
1172 | nontemporal ? " nontemporal" : "" , |
1173 | ref->group->uid, ref->uid); |
1174 | |
1175 | bsi = gsi_for_stmt (ref->stmt); |
1176 | |
1177 | n_prefetches = ((unroll_factor + ref->prefetch_mod - 1) |
1178 | / ref->prefetch_mod); |
1179 | addr_base = build_fold_addr_expr_with_type (ref->mem, ptr_type_node); |
1180 | addr_base = force_gimple_operand_gsi (&bsi, unshare_expr (addr_base), |
1181 | true, NULL, true, GSI_SAME_STMT); |
1182 | write_p = ref->write_p ? integer_one_node : integer_zero_node; |
1183 | local = nontemporal ? integer_zero_node : integer_three_node; |
1184 | |
1185 | for (ap = 0; ap < n_prefetches; ap++) |
1186 | { |
1187 | if (cst_and_fits_in_hwi (ref->group->step)) |
1188 | { |
1189 | /* Determine the address to prefetch. */ |
1190 | delta = (ahead + ap * ref->prefetch_mod) * |
1191 | int_cst_value (ref->group->step); |
1192 | addr = fold_build_pointer_plus_hwi (addr_base, delta); |
1193 | addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, |
1194 | NULL, true, GSI_SAME_STMT); |
1195 | } |
1196 | else |
1197 | { |
1198 | /* The step size is non-constant but loop-invariant. We use the |
1199 | heuristic of simply prefetching AHEAD iterations ahead. */ |
1200 | forward = fold_build2 (MULT_EXPR, sizetype, |
1201 | fold_convert (sizetype, ref->group->step), |
1202 | fold_convert (sizetype, size_int (ahead))); |
1203 | addr = fold_build_pointer_plus (addr_base, forward); |
1204 | addr = force_gimple_operand_gsi (&bsi, unshare_expr (addr), true, |
1205 | NULL, true, GSI_SAME_STMT); |
1206 | } |
1207 | |
1208 | if (addr_base != addr |
1209 | && TREE_CODE (addr_base) == SSA_NAME |
1210 | && TREE_CODE (addr) == SSA_NAME) |
1211 | { |
1212 | duplicate_ssa_name_ptr_info (addr, SSA_NAME_PTR_INFO (addr_base)); |
1213 | /* As this isn't a plain copy we have to reset alignment |
1214 | information. */ |
1215 | if (SSA_NAME_PTR_INFO (addr)) |
1216 | mark_ptr_info_alignment_unknown (SSA_NAME_PTR_INFO (addr)); |
1217 | } |
1218 | |
1219 | /* Create the prefetch instruction. */ |
1220 | prefetch = gimple_build_call (builtin_decl_explicit (BUILT_IN_PREFETCH), |
1221 | 3, addr, write_p, local); |
1222 | gsi_insert_before (&bsi, prefetch, GSI_SAME_STMT); |
1223 | } |
1224 | } |
1225 | |
1226 | /* Issue prefetches for the references in GROUPS into loop as decided before. |
1227 | AHEAD is the number of iterations to prefetch ahead. UNROLL_FACTOR is the |
1228 | factor by which LOOP was unrolled. */ |
1229 | |
1230 | static void |
1231 | issue_prefetches (struct mem_ref_group *groups, |
1232 | unsigned unroll_factor, unsigned ahead) |
1233 | { |
1234 | struct mem_ref *ref; |
1235 | |
1236 | for (; groups; groups = groups->next) |
1237 | for (ref = groups->refs; ref; ref = ref->next) |
1238 | if (ref->issue_prefetch_p) |
1239 | issue_prefetch_ref (ref, unroll_factor, ahead); |
1240 | } |
1241 | |
1242 | /* Returns true if REF is a memory write for which a nontemporal store insn |
1243 | can be used. */ |
1244 | |
1245 | static bool |
1246 | nontemporal_store_p (struct mem_ref *ref) |
1247 | { |
1248 | machine_mode mode; |
1249 | enum insn_code code; |
1250 | |
1251 | /* REF must be a write that is not reused. We require it to be independent |
1252 | of all other memory references in the loop, as the nontemporal stores may |
1253 | be reordered with respect to other memory references. */ |
1254 | if (!ref->write_p |
1255 | || !ref->independent_p |
1256 | || ref->reuse_distance < L2_CACHE_SIZE_BYTES) |
1257 | return false; |
1258 | |
1259 | /* Check that we have the storent instruction for the mode. */ |
1260 | mode = TYPE_MODE (TREE_TYPE (ref->mem)); |
1261 | if (mode == BLKmode) |
1262 | return false; |
1263 | |
1264 | code = optab_handler (storent_optab, mode); |
1265 | return code != CODE_FOR_nothing; |
1266 | } |
1267 | |
1268 | /* If REF is a nontemporal store, we mark the corresponding modify statement |
1269 | and return true. Otherwise, we return false. */ |
1270 | |
1271 | static bool |
1272 | mark_nontemporal_store (struct mem_ref *ref) |
1273 | { |
1274 | if (!nontemporal_store_p (ref)) |
1275 | return false; |
1276 | |
1277 | if (dump_file && (dump_flags & TDF_DETAILS)) |
1278 | fprintf (stream: dump_file, format: "Marked reference %u:%u as a nontemporal store.\n" , |
1279 | ref->group->uid, ref->uid); |
1280 | |
1281 | gimple_assign_set_nontemporal_move (ref->stmt, true); |
1282 | ref->storent_p = true; |
1283 | |
1284 | return true; |
1285 | } |
1286 | |
1287 | /* Issue a memory fence instruction after LOOP. */ |
1288 | |
1289 | static void |
1290 | emit_mfence_after_loop (class loop *loop) |
1291 | { |
1292 | auto_vec<edge> exits = get_loop_exit_edges (loop); |
1293 | edge exit; |
1294 | gcall *call; |
1295 | gimple_stmt_iterator bsi; |
1296 | unsigned i; |
1297 | |
1298 | FOR_EACH_VEC_ELT (exits, i, exit) |
1299 | { |
1300 | call = gimple_build_call (FENCE_FOLLOWING_MOVNT, 0); |
1301 | |
1302 | if (!single_pred_p (exit->dest) |
1303 | /* If possible, we prefer not to insert the fence on other paths |
1304 | in cfg. */ |
1305 | && !(exit->flags & EDGE_ABNORMAL)) |
1306 | split_loop_exit_edge (exit); |
1307 | bsi = gsi_after_labels (exit->dest); |
1308 | |
1309 | gsi_insert_before (&bsi, call, GSI_NEW_STMT); |
1310 | } |
1311 | } |
1312 | |
1313 | /* Returns true if we can use storent in loop, false otherwise. */ |
1314 | |
1315 | static bool |
1316 | may_use_storent_in_loop_p (class loop *loop) |
1317 | { |
1318 | bool ret = true; |
1319 | |
1320 | if (loop->inner != NULL) |
1321 | return false; |
1322 | |
1323 | /* If we must issue a mfence insn after using storent, check that there |
1324 | is a suitable place for it at each of the loop exits. */ |
1325 | if (FENCE_FOLLOWING_MOVNT != NULL_TREE) |
1326 | { |
1327 | auto_vec<edge> exits = get_loop_exit_edges (loop); |
1328 | unsigned i; |
1329 | edge exit; |
1330 | |
1331 | FOR_EACH_VEC_ELT (exits, i, exit) |
1332 | if ((exit->flags & EDGE_ABNORMAL) |
1333 | && exit->dest == EXIT_BLOCK_PTR_FOR_FN (cfun)) |
1334 | ret = false; |
1335 | } |
1336 | |
1337 | return ret; |
1338 | } |
1339 | |
1340 | /* Marks nontemporal stores in LOOP. GROUPS contains the description of memory |
1341 | references in the loop. Returns whether we inserted any mfence call. */ |
1342 | |
1343 | static bool |
1344 | mark_nontemporal_stores (class loop *loop, struct mem_ref_group *groups) |
1345 | { |
1346 | struct mem_ref *ref; |
1347 | bool any = false; |
1348 | |
1349 | if (!may_use_storent_in_loop_p (loop)) |
1350 | return false; |
1351 | |
1352 | for (; groups; groups = groups->next) |
1353 | for (ref = groups->refs; ref; ref = ref->next) |
1354 | any |= mark_nontemporal_store (ref); |
1355 | |
1356 | if (any && FENCE_FOLLOWING_MOVNT != NULL_TREE) |
1357 | { |
1358 | emit_mfence_after_loop (loop); |
1359 | return true; |
1360 | } |
1361 | return false; |
1362 | } |
1363 | |
1364 | /* Determines whether we can profitably unroll LOOP FACTOR times, and if |
1365 | this is the case, fill in DESC by the description of number of |
1366 | iterations. */ |
1367 | |
1368 | static bool |
1369 | should_unroll_loop_p (class loop *loop, class tree_niter_desc *desc, |
1370 | unsigned factor) |
1371 | { |
1372 | if (!can_unroll_loop_p (loop, factor, desc)) |
1373 | return false; |
1374 | |
1375 | /* We only consider loops without control flow for unrolling. This is not |
1376 | a hard restriction -- tree_unroll_loop works with arbitrary loops |
1377 | as well; but the unrolling/prefetching is usually more profitable for |
1378 | loops consisting of a single basic block, and we want to limit the |
1379 | code growth. */ |
1380 | if (loop->num_nodes > 2) |
1381 | return false; |
1382 | |
1383 | return true; |
1384 | } |
1385 | |
1386 | /* Determine the factor by which to unroll LOOP, from the information |
1387 | contained in the list of memory references REFS. Description of |
1388 | number of iterations of LOOP is stored to DESC. NINSNS is the number of |
1389 | insns of the LOOP. EST_NITER is the estimated number of iterations of |
1390 | the loop, or -1 if no estimate is available. */ |
1391 | |
1392 | static unsigned |
1393 | determine_unroll_factor (class loop *loop, struct mem_ref_group *refs, |
1394 | unsigned ninsns, class tree_niter_desc *desc, |
1395 | HOST_WIDE_INT est_niter) |
1396 | { |
1397 | unsigned upper_bound; |
1398 | unsigned nfactor, factor, mod_constraint; |
1399 | struct mem_ref_group *agp; |
1400 | struct mem_ref *ref; |
1401 | |
1402 | /* First check whether the loop is not too large to unroll. We ignore |
1403 | PARAM_MAX_UNROLL_TIMES, because for small loops, it prevented us |
1404 | from unrolling them enough to make exactly one cache line covered by each |
1405 | iteration. Also, the goal of PARAM_MAX_UNROLL_TIMES is to prevent |
1406 | us from unrolling the loops too many times in cases where we only expect |
1407 | gains from better scheduling and decreasing loop overhead, which is not |
1408 | the case here. */ |
1409 | upper_bound = param_max_unrolled_insns / ninsns; |
1410 | |
1411 | /* If we unrolled the loop more times than it iterates, the unrolled version |
1412 | of the loop would never be entered. */ |
1413 | if (est_niter >= 0 && est_niter < (HOST_WIDE_INT) upper_bound) |
1414 | upper_bound = est_niter; |
1415 | |
1416 | if (upper_bound <= 1) |
1417 | return 1; |
1418 | |
1419 | /* Choose the factor so that we may prefetch each cache line just once, |
1420 | but bound the unrolling by UPPER_BOUND. */ |
1421 | factor = 1; |
1422 | for (agp = refs; agp; agp = agp->next) |
1423 | for (ref = agp->refs; ref; ref = ref->next) |
1424 | if (should_issue_prefetch_p (ref)) |
1425 | { |
1426 | mod_constraint = ref->prefetch_mod; |
1427 | nfactor = least_common_multiple (mod_constraint, factor); |
1428 | if (nfactor <= upper_bound) |
1429 | factor = nfactor; |
1430 | } |
1431 | |
1432 | if (!should_unroll_loop_p (loop, desc, factor)) |
1433 | return 1; |
1434 | |
1435 | return factor; |
1436 | } |
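| |
| /* For example, if the references selected for prefetching have prefetch_mod |
| 2 and 3, the loop above raises FACTOR to lcm (2, 3) = 6, provided 6 does |
| not exceed the bound derived from param_max_unrolled_insns / ninsns and |
| the estimated number of iterations. */ |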
1437 | |
1438 | /* Returns the total volume of the memory references REFS, taking into account |
1439 | reuses in the innermost loop and cache line size. TODO -- we should also |
1440 | take into account reuses across the iterations of the loops in the loop |
1441 | nest. */ |
1442 | |
1443 | static unsigned |
1444 | volume_of_references (struct mem_ref_group *refs) |
1445 | { |
1446 | unsigned volume = 0; |
1447 | struct mem_ref_group *gr; |
1448 | struct mem_ref *ref; |
1449 | |
1450 | for (gr = refs; gr; gr = gr->next) |
1451 | for (ref = gr->refs; ref; ref = ref->next) |
1452 | { |
1453 | /* Almost always reuses another value? */ |
1454 | if (ref->prefetch_before != PREFETCH_ALL) |
1455 | continue; |
1456 | |
1457 | /* If several iterations access the same cache line, use the size of |
1458 | the line divided by this number. Otherwise, a cache line is |
1459 | accessed in each iteration. TODO -- in the latter case, we should |
1460 | take the size of the reference into account, rounding it up on cache |
1461 | line size multiple. */ |
1462 | volume += param_l1_cache_line_size / ref->prefetch_mod; |
1463 | } |
1464 | return volume; |
1465 | } |
1466 | |
1467 | /* Returns the volume of memory references accessed across VEC iterations of |
1468 | loops, whose sizes are described in the LOOP_SIZES array. N is the number |
1469 | of the loops in the nest (length of VEC and LOOP_SIZES vectors). */ |
1470 | |
1471 | static unsigned |
1472 | volume_of_dist_vector (lambda_vector vec, unsigned *loop_sizes, unsigned n) |
1473 | { |
1474 | unsigned i; |
1475 | |
1476 | for (i = 0; i < n; i++) |
1477 | if (vec[i] != 0) |
1478 | break; |
1479 | |
1480 | if (i == n) |
1481 | return 0; |
1482 | |
1483 | gcc_assert (vec[i] > 0); |
1484 | |
1485 | /* We ignore the parts of the distance vector in subloops, since usually |
1486 | the numbers of iterations are much smaller. */ |
1487 | return loop_sizes[i] * vec[i]; |
1488 | } |
1489 | |
1490 | /* Add the steps of ACCESS_FN multiplied by STRIDE to the array STRIDES |
1491 | at the position corresponding to the loop of the step. N is the depth |
1492 | of the considered loop nest, and LOOP is its innermost loop. */ |
1493 | |
1494 | static void |
1495 | add_subscript_strides (tree access_fn, unsigned stride, |
1496 | HOST_WIDE_INT *strides, unsigned n, class loop *loop) |
1497 | { |
1498 | class loop *aloop; |
1499 | tree step; |
1500 | HOST_WIDE_INT astep; |
1501 | unsigned min_depth = loop_depth (loop) - n; |
1502 | |
1503 | while (TREE_CODE (access_fn) == POLYNOMIAL_CHREC) |
1504 | { |
1505 | aloop = get_chrec_loop (access_fn); |
1506 | step = CHREC_RIGHT (access_fn); |
1507 | access_fn = CHREC_LEFT (access_fn); |
1508 | |
1509 | if ((unsigned) loop_depth (aloop) <= min_depth) |
1510 | continue; |
1511 | |
1512 | if (tree_fits_shwi_p (step)) |
1513 | astep = tree_to_shwi (step); |
1514 | else |
1515 | astep = param_l1_cache_line_size; |
1516 | |
1517 | strides[n - 1 - loop_depth (loop) + loop_depth (aloop)] += astep * stride; |
1518 | |
1519 | } |
1520 | } |
1521 | |
1522 | /* Returns the volume of memory references accessed between two consecutive |
1523 | self-reuses of the reference DR. We consider the subscripts of DR in N |
1524 | loops, and LOOP_SIZES contains the volumes of accesses in each of the |
1525 | loops. LOOP is the innermost loop of the current loop nest. */ |
1526 | |
1527 | static unsigned |
1528 | self_reuse_distance (data_reference_p dr, unsigned *loop_sizes, unsigned n, |
1529 | class loop *loop) |
1530 | { |
1531 | tree stride, access_fn; |
1532 | HOST_WIDE_INT *strides, astride; |
1533 | vec<tree> access_fns; |
1534 | tree ref = DR_REF (dr); |
1535 | unsigned i, ret = ~0u; |
1536 | |
1537 | /* In the following example: |
1538 | |
1539 | for (i = 0; i < N; i++) |
1540 | for (j = 0; j < N; j++) |
1541 | use (a[j][i]); |
1542 | the same cache line is accessed each N steps (except if the change from |
1543 | i to i + 1 crosses the boundary of the cache line). Thus, for self-reuse, |
1544 | we cannot rely purely on the results of the data dependence analysis. |
1545 | |
     Instead, we compute the stride of the reference in each loop, and consider
     the innermost loop in which the stride is less than the cache line size.  */
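
  /* Continuing that example (figures purely illustrative): with 4-byte
     elements, the stride of a[j][i] is 4 * N bytes in the inner j loop and
     4 bytes in the outer i loop.  For large N only the i loop has a stride
     below the cache line size, so the self-reuse distance becomes the data
     volume of one i iteration, i.e. roughly the footprint of the whole
     j loop -- provided that volume is large enough to matter (see the
     NONTEMPORAL_FRACTION test below).  */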
1548 | |
1549 | strides = XCNEWVEC (HOST_WIDE_INT, n); |
1550 | access_fns = DR_ACCESS_FNS (dr); |
1551 | |
1552 | FOR_EACH_VEC_ELT (access_fns, i, access_fn) |
1553 | { |
1554 | /* Keep track of the reference corresponding to the subscript, so that we |
1555 | know its stride. */ |
      while (handled_component_p (ref) && TREE_CODE (ref) != ARRAY_REF)
1557 | ref = TREE_OPERAND (ref, 0); |
1558 | |
1559 | if (TREE_CODE (ref) == ARRAY_REF) |
1560 | { |
1561 | stride = TYPE_SIZE_UNIT (TREE_TYPE (ref)); |
1562 | if (tree_fits_uhwi_p (stride)) |
1563 | astride = tree_to_uhwi (stride); |
1564 | else |
1565 | astride = param_l1_cache_line_size; |
1566 | |
1567 | ref = TREE_OPERAND (ref, 0); |
1568 | } |
1569 | else |
1570 | astride = 1; |
1571 | |
      add_subscript_strides (access_fn, astride, strides, n, loop);
1573 | } |
1574 | |
1575 | for (i = n; i-- > 0; ) |
1576 | { |
1577 | unsigned HOST_WIDE_INT s; |
1578 | |
1579 | s = strides[i] < 0 ? -strides[i] : strides[i]; |
1580 | |
1581 | if (s < (unsigned) param_l1_cache_line_size |
1582 | && (loop_sizes[i] |
1583 | > (unsigned) (L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION))) |
1584 | { |
1585 | ret = loop_sizes[i]; |
1586 | break; |
1587 | } |
1588 | } |
1589 | |
  free (strides);
1591 | return ret; |
1592 | } |
1593 | |
1594 | /* Determines the distance till the first reuse of each reference in REFS |
1595 | in the loop nest of LOOP. NO_OTHER_REFS is true if there are no other |
1596 | memory references in the loop. Return false if the analysis fails. */ |
1597 | |
1598 | static bool |
1599 | determine_loop_nest_reuse (class loop *loop, struct mem_ref_group *refs, |
1600 | bool no_other_refs) |
1601 | { |
1602 | class loop *nest, *aloop; |
1603 | vec<data_reference_p> datarefs = vNULL; |
1604 | vec<ddr_p> dependences = vNULL; |
1605 | struct mem_ref_group *gr; |
1606 | struct mem_ref *ref, *refb; |
1607 | auto_vec<loop_p> vloops; |
1608 | unsigned *loop_data_size; |
1609 | unsigned i, j, n; |
1610 | unsigned volume, dist, adist; |
1611 | HOST_WIDE_INT vol; |
1612 | data_reference_p dr; |
1613 | ddr_p dep; |
1614 | |
1615 | if (loop->inner) |
1616 | return true; |
1617 | |
1618 | /* Find the outermost loop of the loop nest of loop (we require that |
1619 | there are no sibling loops inside the nest). */ |
1620 | nest = loop; |
1621 | while (1) |
1622 | { |
      aloop = loop_outer (nest);
1624 | |
1625 | if (aloop == current_loops->tree_root |
1626 | || aloop->inner->next) |
1627 | break; |
1628 | |
1629 | nest = aloop; |
1630 | } |
1631 | |
1632 | /* For each loop, determine the amount of data accessed in each iteration. |
1633 | We use this to estimate whether the reference is evicted from the |
1634 | cache before its reuse. */ |
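  /* E.g. (figures purely illustrative), for a two-deep nest whose body
     touches 16 bytes per innermost iteration and whose inner loop is
     expected to run 1000 times, this yields LOOP_DATA_SIZE = { 16000, 16 }:
     one iteration of the outer loop streams about 16 kB through the
     cache.  */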
1635 | find_loop_nest (nest, &vloops); |
1636 | n = vloops.length (); |
1637 | loop_data_size = XNEWVEC (unsigned, n); |
1638 | volume = volume_of_references (refs); |
1639 | i = n; |
1640 | while (i-- != 0) |
1641 | { |
1642 | loop_data_size[i] = volume; |
1643 | /* Bound the volume by the L2 cache size, since above this bound, |
1644 | all dependence distances are equivalent. */ |
1645 | if (volume > L2_CACHE_SIZE_BYTES) |
1646 | continue; |
1647 | |
1648 | aloop = vloops[i]; |
1649 | vol = estimated_stmt_executions_int (aloop); |
1650 | if (vol == -1) |
1651 | vol = expected_loop_iterations (aloop); |
1652 | volume *= vol; |
1653 | } |
1654 | |
  /* Prepare the references in the form suitable for data dependence
     analysis.  We ignore unanalyzable data references (the results
     are used just as a heuristic to estimate temporality of the
     references, hence we do not need to worry about correctness).  */
1659 | for (gr = refs; gr; gr = gr->next) |
1660 | for (ref = gr->refs; ref; ref = ref->next) |
1661 | { |
1662 | dr = create_data_ref (loop_preheader_edge (nest), |
			     loop_containing_stmt (ref->stmt),
1664 | ref->mem, ref->stmt, !ref->write_p, false); |
1665 | |
1666 | if (dr) |
1667 | { |
1668 | ref->reuse_distance = volume; |
1669 | dr->aux = ref; |
	    datarefs.safe_push (dr);
1671 | } |
1672 | else |
1673 | no_other_refs = false; |
1674 | } |
1675 | |
1676 | FOR_EACH_VEC_ELT (datarefs, i, dr) |
1677 | { |
      dist = self_reuse_distance (dr, loop_data_size, n, loop);
1679 | ref = (struct mem_ref *) dr->aux; |
1680 | if (ref->reuse_distance > dist) |
1681 | ref->reuse_distance = dist; |
1682 | |
1683 | if (no_other_refs) |
1684 | ref->independent_p = true; |
1685 | } |
1686 | |
1687 | if (!compute_all_dependences (datarefs, &dependences, vloops, true)) |
1688 | return false; |
1689 | |
1690 | FOR_EACH_VEC_ELT (dependences, i, dep) |
1691 | { |
1692 | if (DDR_ARE_DEPENDENT (dep) == chrec_known) |
1693 | continue; |
1694 | |
1695 | ref = (struct mem_ref *) DDR_A (dep)->aux; |
1696 | refb = (struct mem_ref *) DDR_B (dep)->aux; |
1697 | |
1698 | if (DDR_ARE_DEPENDENT (dep) == chrec_dont_know |
1699 | || DDR_COULD_BE_INDEPENDENT_P (dep) |
1700 | || DDR_NUM_DIST_VECTS (dep) == 0) |
1701 | { |
1702 | /* If the dependence cannot be analyzed, assume that there might be |
1703 | a reuse. */ |
1704 | dist = 0; |
1705 | |
1706 | ref->independent_p = false; |
1707 | refb->independent_p = false; |
1708 | } |
1709 | else |
1710 | { |
1711 | /* The distance vectors are normalized to be always lexicographically |
1712 | positive, hence we cannot tell just from them whether DDR_A comes |
1713 | before DDR_B or vice versa. However, it is not important, |
1714 | anyway -- if DDR_A is close to DDR_B, then it is either reused in |
1715 | DDR_B (and it is not nontemporal), or it reuses the value of DDR_B |
1716 | in cache (and marking it as nontemporal would not affect |
1717 | anything). */ |
1718 | |
1719 | dist = volume; |
1720 | for (j = 0; j < DDR_NUM_DIST_VECTS (dep); j++) |
1721 | { |
	      adist = volume_of_dist_vector (DDR_DIST_VECT (dep, j),
					     loop_data_size, n);
1724 | |
1725 | /* If this is a dependence in the innermost loop (i.e., the |
1726 | distances in all superloops are zero) and it is not |
1727 | the trivial self-dependence with distance zero, record that |
1728 | the references are not completely independent. */ |
	      if (lambda_vector_zerop (DDR_DIST_VECT (dep, j), n - 1)
1730 | && (ref != refb |
1731 | || DDR_DIST_VECT (dep, j)[n-1] != 0)) |
1732 | { |
1733 | ref->independent_p = false; |
1734 | refb->independent_p = false; |
1735 | } |
1736 | |
1737 | /* Ignore accesses closer than |
1738 | L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION, |
1739 | so that we use nontemporal prefetches e.g. if single memory |
1740 | location is accessed several times in a single iteration of |
1741 | the loop. */ |
1742 | if (adist < L1_CACHE_SIZE_BYTES / NONTEMPORAL_FRACTION) |
1743 | continue; |
1744 | |
1745 | if (adist < dist) |
1746 | dist = adist; |
1747 | } |
1748 | } |
1749 | |
1750 | if (ref->reuse_distance > dist) |
1751 | ref->reuse_distance = dist; |
1752 | if (refb->reuse_distance > dist) |
1753 | refb->reuse_distance = dist; |
1754 | } |
1755 | |
1756 | free_dependence_relations (dependences); |
1757 | free_data_refs (datarefs); |
  free (loop_data_size);
1759 | |
1760 | if (dump_file && (dump_flags & TDF_DETAILS)) |
1761 | { |
      fprintf (dump_file, "Reuse distances:\n");
1763 | for (gr = refs; gr; gr = gr->next) |
1764 | for (ref = gr->refs; ref; ref = ref->next) |
	  fprintf (dump_file, " reference %u:%u distance %u\n",
1766 | ref->group->uid, ref->uid, ref->reuse_distance); |
1767 | } |
1768 | |
1769 | return true; |
1770 | } |
1771 | |
/* Determine whether or not the trip count to ahead ratio is too small based
   on profitability considerations.
   AHEAD: the iteration ahead distance,
   EST_NITER: the estimated trip count.  */
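
/* For example, assuming TRIP_COUNT_TO_AHEAD_RATIO is 4 (see its definition
   earlier in this file), a loop that must prefetch AHEAD == 8 iterations in
   advance is only worth transforming if its estimated trip count is at
   least 32; in a shorter loop most prefetches would target data that is
   needed only after the loop has already exited.  */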
1776 | |
1777 | static bool |
1778 | trip_count_to_ahead_ratio_too_small_p (unsigned ahead, HOST_WIDE_INT est_niter) |
1779 | { |
1780 | /* Assume trip count to ahead ratio is big enough if the trip count could not |
1781 | be estimated at compile time. */ |
1782 | if (est_niter < 0) |
1783 | return false; |
1784 | |
1785 | if (est_niter < (HOST_WIDE_INT) (TRIP_COUNT_TO_AHEAD_RATIO * ahead)) |
1786 | { |
1787 | if (dump_file && (dump_flags & TDF_DETAILS)) |
	fprintf (dump_file,
		 "Not prefetching -- loop estimated to roll only %d times\n",
1790 | (int) est_niter); |
1791 | return true; |
1792 | } |
1793 | |
1794 | return false; |
1795 | } |
1796 | |
/* Determine whether or not the number of memory references in the loop is
   reasonable based on profitability and compilation time considerations.
1799 | NINSNS: estimated number of instructions in the loop, |
1800 | MEM_REF_COUNT: total number of memory references in the loop. */ |
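
/* As a rough illustration (numbers purely illustrative): a loop body of
   40 insns with 5 memory references has an insn-to-mem ratio of 8, while
   40 insns with 25 references gives a ratio of 1; the latter leaves little
   computation to overlap with the prefetched loads, so prefetching is
   skipped when the ratio falls below
   param_prefetch_min_insn_to_mem_ratio.  */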
1801 | |
1802 | static bool |
1803 | mem_ref_count_reasonable_p (unsigned ninsns, unsigned mem_ref_count) |
1804 | { |
1805 | int insn_to_mem_ratio; |
1806 | |
1807 | if (mem_ref_count == 0) |
1808 | return false; |
1809 | |
1810 | /* Miss rate computation (is_miss_rate_acceptable) and dependence analysis |
1811 | (compute_all_dependences) have high costs based on quadratic complexity. |
1812 | To avoid huge compilation time, we give up prefetching if mem_ref_count |
1813 | is too large. */ |
1814 | if (mem_ref_count > PREFETCH_MAX_MEM_REFS_PER_LOOP) |
1815 | return false; |
1816 | |
1817 | /* Prefetching improves performance by overlapping cache missing |
1818 | memory accesses with CPU operations. If the loop does not have |
1819 | enough CPU operations to overlap with memory operations, prefetching |
1820 | won't give a significant benefit. One approximate way of checking |
1821 | this is to require the ratio of instructions to memory references to |
1822 | be above a certain limit. This approximation works well in practice. |
1823 | TODO: Implement a more precise computation by estimating the time |
1824 | for each CPU or memory op in the loop. Time estimates for memory ops |
1825 | should account for cache misses. */ |
1826 | insn_to_mem_ratio = ninsns / mem_ref_count; |
1827 | |
1828 | if (insn_to_mem_ratio < param_prefetch_min_insn_to_mem_ratio) |
1829 | { |
1830 | if (dump_file && (dump_flags & TDF_DETAILS)) |
	fprintf (dump_file,
		 "Not prefetching -- instruction to memory reference ratio (%d) too small\n",
1833 | insn_to_mem_ratio); |
1834 | return false; |
1835 | } |
1836 | |
1837 | return true; |
1838 | } |
1839 | |
/* Determine whether or not the instruction to prefetch ratio in the loop is
   too small based on profitability considerations.
1842 | NINSNS: estimated number of instructions in the loop, |
1843 | PREFETCH_COUNT: an estimate of the number of prefetches, |
1844 | UNROLL_FACTOR: the factor to unroll the loop if prefetching. */ |
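
/* Illustrative example only: with NINSNS == 10, UNROLL_FACTOR == 4 and
   PREFETCH_COUNT == 8, the unrolled body is estimated at 40 insns issuing
   8 prefetches, an insn-to-prefetch ratio of 5, which is then compared
   against param_min_insn_to_prefetch_ratio.  */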
1845 | |
1846 | static bool |
1847 | insn_to_prefetch_ratio_too_small_p (unsigned ninsns, unsigned prefetch_count, |
1848 | unsigned unroll_factor) |
1849 | { |
1850 | int insn_to_prefetch_ratio; |
1851 | |
1852 | /* Prefetching most likely causes performance degradation when the instruction |
1853 | to prefetch ratio is too small. Too many prefetch instructions in a loop |
1854 | may reduce the I-cache performance. |
1855 | (unroll_factor * ninsns) is used to estimate the number of instructions in |
1856 | the unrolled loop. This implementation is a bit simplistic -- the number |
1857 | of issued prefetch instructions is also affected by unrolling. So, |
1858 | prefetch_mod and the unroll factor should be taken into account when |
1859 | determining prefetch_count. Also, the number of insns of the unrolled |
1860 | loop will usually be significantly smaller than the number of insns of the |
1861 | original loop * unroll_factor (at least the induction variable increases |
1862 | and the exit branches will get eliminated), so it might be better to use |
1863 | tree_estimate_loop_size + estimated_unrolled_size. */ |
1864 | insn_to_prefetch_ratio = (unroll_factor * ninsns) / prefetch_count; |
1865 | if (insn_to_prefetch_ratio < param_min_insn_to_prefetch_ratio) |
1866 | { |
1867 | if (dump_file && (dump_flags & TDF_DETAILS)) |
	fprintf (dump_file,
		 "Not prefetching -- instruction to prefetch ratio (%d) too small\n",
1870 | insn_to_prefetch_ratio); |
1871 | return true; |
1872 | } |
1873 | |
1874 | return false; |
1875 | } |
1876 | |
1877 | |
1878 | /* Issue prefetch instructions for array references in LOOP. Returns |
1879 | true if the LOOP was unrolled and updates NEED_LC_SSA_UPDATE if we need |
1880 | to update SSA for virtual operands and LC SSA for a split edge. */ |
1881 | |
1882 | static bool |
1883 | loop_prefetch_arrays (class loop *loop, bool &need_lc_ssa_update) |
1884 | { |
1885 | struct mem_ref_group *refs; |
1886 | unsigned ahead, ninsns, time, unroll_factor; |
1887 | HOST_WIDE_INT est_niter; |
1888 | class tree_niter_desc desc; |
1889 | bool unrolled = false, no_other_refs; |
1890 | unsigned prefetch_count; |
1891 | unsigned mem_ref_count; |
1892 | |
1893 | if (optimize_loop_nest_for_size_p (loop)) |
1894 | { |
1895 | if (dump_file && (dump_flags & TDF_DETAILS)) |
	fprintf (dump_file, " ignored (cold area)\n");
1897 | return false; |
1898 | } |
1899 | |
1900 | /* FIXME: the time should be weighted by the probabilities of the blocks in |
1901 | the loop body. */ |
1902 | time = tree_num_loop_insns (loop, &eni_time_weights); |
1903 | if (time == 0) |
1904 | return false; |
1905 | |
1906 | ahead = (param_prefetch_latency + time - 1) / time; |
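  /* E.g. if the prefetch latency is 200 units and one loop iteration is
     estimated at 30 units, AHEAD becomes (200 + 29) / 30 == 7, i.e. data
     needed seven iterations from now should be prefetched in the current
     iteration (figures purely illustrative; the latency is a per-target
     parameter).  */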
1907 | est_niter = estimated_stmt_executions_int (loop); |
1908 | if (est_niter == -1) |
1909 | est_niter = likely_max_stmt_executions_int (loop); |
1910 | |
1911 | /* Prefetching is not likely to be profitable if the trip count to ahead |
1912 | ratio is too small. */ |
1913 | if (trip_count_to_ahead_ratio_too_small_p (ahead, est_niter)) |
1914 | return false; |
1915 | |
1916 | ninsns = tree_num_loop_insns (loop, &eni_size_weights); |
1917 | |
1918 | /* Step 1: gather the memory references. */ |
  refs = gather_memory_references (loop, &no_other_refs, &mem_ref_count);
1920 | |
  /* Give up prefetching if the number of memory references in the
     loop is not reasonable based on profitability and compilation time
     considerations.  */
1924 | if (!mem_ref_count_reasonable_p (ninsns, mem_ref_count)) |
1925 | goto fail; |
1926 | |
1927 | /* Step 2: estimate the reuse effects. */ |
  prune_by_reuse (refs);
1929 | |
  if (nothing_to_prefetch_p (refs))
1931 | goto fail; |
1932 | |
1933 | if (!determine_loop_nest_reuse (loop, refs, no_other_refs)) |
1934 | goto fail; |
1935 | |
1936 | /* Step 3: determine unroll factor. */ |
  unroll_factor = determine_unroll_factor (loop, refs, ninsns, &desc,
1938 | est_niter); |
1939 | |
1940 | /* Estimate prefetch count for the unrolled loop. */ |
  prefetch_count = estimate_prefetch_count (refs, unroll_factor);
1942 | if (prefetch_count == 0) |
1943 | goto fail; |
1944 | |
1945 | if (dump_file && (dump_flags & TDF_DETAILS)) |
    fprintf (dump_file, "Ahead %d, unroll factor %d, trip count "
	     HOST_WIDE_INT_PRINT_DEC "\n"
	     "insn count %d, mem ref count %d, prefetch count %d\n",
1949 | ahead, unroll_factor, est_niter, |
1950 | ninsns, mem_ref_count, prefetch_count); |
1951 | |
1952 | /* Prefetching is not likely to be profitable if the instruction to prefetch |
1953 | ratio is too small. */ |
1954 | if (insn_to_prefetch_ratio_too_small_p (ninsns, prefetch_count, |
1955 | unroll_factor)) |
1956 | goto fail; |
1957 | |
  need_lc_ssa_update |= mark_nontemporal_stores (loop, refs);
1959 | |
1960 | /* Step 4: what to prefetch? */ |
  if (!schedule_prefetches (refs, unroll_factor, ahead))
1962 | goto fail; |
1963 | |
1964 | /* Step 5: unroll the loop. TODO -- peeling of first and last few |
1965 | iterations so that we do not issue superfluous prefetches. */ |
1966 | if (unroll_factor != 1) |
1967 | { |
1968 | tree_unroll_loop (loop, unroll_factor, &desc); |
1969 | unrolled = true; |
1970 | } |
1971 | |
1972 | /* Step 6: issue the prefetches. */ |
  issue_prefetches (refs, unroll_factor, ahead);
1974 | |
1975 | fail: |
  release_mem_refs (refs);
1977 | return unrolled; |
1978 | } |
1979 | |
1980 | /* Issue prefetch instructions for array references in loops. */ |
1981 | |
1982 | unsigned int |
1983 | tree_ssa_prefetch_arrays (void) |
1984 | { |
1985 | bool unrolled = false; |
1986 | bool need_lc_ssa_update = false; |
1987 | int todo_flags = 0; |
1988 | |
1989 | if (!targetm.have_prefetch () |
      /* It is possible to ask the compiler for, say, -mtune=i486 -march=pentium4.
	 -mtune=i486 causes PREFETCH_BLOCK to be 0, since this is part of the
	 processor costs and the i486 does not have prefetch, but
	 -march=pentium4 causes targetm.have_prefetch to be true.  Ugh.  */
1994 | || PREFETCH_BLOCK == 0) |
1995 | return 0; |
1996 | |
1997 | if (dump_file && (dump_flags & TDF_DETAILS)) |
1998 | { |
      fprintf (dump_file, "Prefetching parameters:\n");
      fprintf (dump_file, " simultaneous prefetches: %d\n",
	       param_simultaneous_prefetches);
      fprintf (dump_file, " prefetch latency: %d\n", param_prefetch_latency);
      fprintf (dump_file, " prefetch block size: %d\n", PREFETCH_BLOCK);
      fprintf (dump_file, " L1 cache size: %d lines, %d kB\n",
	       L1_CACHE_SIZE_BYTES / param_l1_cache_line_size,
	       param_l1_cache_size);
      fprintf (dump_file, " L1 cache line size: %d\n",
	       param_l1_cache_line_size);
      fprintf (dump_file, " L2 cache size: %d kB\n", param_l2_cache_size);
      fprintf (dump_file, " min insn-to-prefetch ratio: %d \n",
	       param_min_insn_to_prefetch_ratio);
      fprintf (dump_file, " min insn-to-mem ratio: %d \n",
	       param_prefetch_min_insn_to_mem_ratio);
      fprintf (dump_file, "\n");
2015 | } |
2016 | |
2017 | initialize_original_copy_tables (); |
2018 | |
  if (!builtin_decl_explicit_p (BUILT_IN_PREFETCH))
2020 | { |
2021 | tree type = build_function_type_list (void_type_node, |
2022 | const_ptr_type_node, NULL_TREE); |
      tree decl = add_builtin_function ("__builtin_prefetch", type,
					BUILT_IN_PREFETCH, BUILT_IN_NORMAL,
					NULL, NULL_TREE);
2026 | DECL_IS_NOVOPS (decl) = true; |
      set_builtin_decl (BUILT_IN_PREFETCH, decl, false);
2028 | } |
2029 | |
2030 | for (auto loop : loops_list (cfun, LI_FROM_INNERMOST)) |
2031 | { |
2032 | if (dump_file && (dump_flags & TDF_DETAILS)) |
	fprintf (dump_file, "Processing loop %d:\n", loop->num);
2034 | |
2035 | unrolled |= loop_prefetch_arrays (loop, need_lc_ssa_update); |
2036 | |
2037 | if (dump_file && (dump_flags & TDF_DETAILS)) |
	fprintf (dump_file, "\n\n");
2039 | } |
2040 | |
2041 | if (need_lc_ssa_update) |
2042 | rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa_only_virtuals); |
2043 | |
2044 | if (unrolled) |
2045 | { |
2046 | scev_reset (); |
2047 | todo_flags |= TODO_cleanup_cfg; |
2048 | } |
2049 | |
2050 | free_original_copy_tables (); |
2051 | return todo_flags; |
2052 | } |
2053 | |
2054 | /* Prefetching. */ |
2055 | |
2056 | namespace { |
2057 | |
2058 | const pass_data pass_data_loop_prefetch = |
2059 | { |
  GIMPLE_PASS, /* type */
  "aprefetch", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_TREE_PREFETCH, /* tv_id */
  ( PROP_cfg | PROP_ssa ), /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
2069 | }; |
2070 | |
2071 | class pass_loop_prefetch : public gimple_opt_pass |
2072 | { |
2073 | public: |
2074 | pass_loop_prefetch (gcc::context *ctxt) |
2075 | : gimple_opt_pass (pass_data_loop_prefetch, ctxt) |
2076 | {} |
2077 | |
2078 | /* opt_pass methods: */ |
2079 | bool gate (function *) final override |
2080 | { |
2081 | return flag_prefetch_loop_arrays > 0; |
2082 | } |
2083 | unsigned int execute (function *) final override; |
2084 | |
2085 | }; // class pass_loop_prefetch |
2086 | |
2087 | unsigned int |
2088 | pass_loop_prefetch::execute (function *fun) |
2089 | { |
  if (number_of_loops (fun) <= 1)
2091 | return 0; |
2092 | |
2093 | if ((PREFETCH_BLOCK & (PREFETCH_BLOCK - 1)) != 0) |
2094 | { |
2095 | static bool warned = false; |
2096 | |
2097 | if (!warned) |
2098 | { |
	  warning (OPT_Wdisabled_optimization,
		   "%<l1-cache-size%> parameter is not a power of two %d",
		   PREFETCH_BLOCK);
2102 | warned = true; |
2103 | } |
2104 | return 0; |
2105 | } |
2106 | |
2107 | return tree_ssa_prefetch_arrays (); |
2108 | } |
2109 | |
2110 | } // anon namespace |
2111 | |
2112 | gimple_opt_pass * |
2113 | make_pass_loop_prefetch (gcc::context *ctxt) |
2114 | { |
2115 | return new pass_loop_prefetch (ctxt); |
2116 | } |
2117 | |
2118 | |
2119 | |