dl-cacheinfo.h source code [glibc/sysdeps/x86/dl-cacheinfo.h]

/* Initialize x86 cache info.
   Copyright (C) 2020-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
/* Table of known one-byte cache descriptors.  intel_check_word looks up
   each descriptor byte it extracts from a CPUID leaf 2 register in this
   table with bsearch, keyed on IDX, so the entries MUST stay sorted by
   strictly ascending IDX.

   Fields:
     idx       descriptor byte used as the search key.
     assoc     value returned for the *_ASSOC query.
     linesize  value returned for the *_LINESIZE query.
     rel_name  the *_SIZE sysconf name of the described cache, stored
	       relative to _SC_LEVEL1_ICACHE_SIZE (see M below) so that
	       it fits in an unsigned char.
     size      value returned for the *_SIZE query.  */
static const struct intel_02_cache_info
{
  unsigned char idx;
  unsigned char assoc;
  unsigned char linesize;
  unsigned char rel_name;
  unsigned int size;
} intel_02_known [] =
  {
/* Map an _SC_LEVEL* constant to its offset from _SC_LEVEL1_ICACHE_SIZE.  */
#define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE)
    { 0x06,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    8192 },
    { 0x08,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   16384 },
    { 0x09,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
    { 0x0a,  2, 32, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
    { 0x0c,  4, 32, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x0d,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x0e,  6, 64, M(_SC_LEVEL1_DCACHE_SIZE),   24576 },
    { 0x21,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x22,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
    { 0x23,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0x25,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0x29,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0x2c,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
    { 0x30,  8, 64, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
    { 0x39,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x3a,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   196608 },
    { 0x3b,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x3c,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x3d,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   393216 },
    { 0x3e,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x3f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x41,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x42,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x43,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x44,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x45,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x46,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0x47,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE),  3145728 },
    { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE),  4194304 },
    { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  6291456 },
    { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
    { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 16777216 },
    { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE),  6291456 },
    { 0x60,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x66,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
    { 0x67,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x68,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
    { 0x78,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x79,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x7a,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x7b,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x7c,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x7d,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x7f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x80,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x82,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x83,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x84,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x85,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x86,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x87,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0xd0,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
    { 0xd1,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0xd2,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xd6,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0xd7,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xd8,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
    { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 18874368 },
    { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 25165824 },
  };

/* Number of entries in intel_02_known.  */
#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))
100
101	static int
102	intel_02_known_compare (const void p1, const* void *p2)
103	{
104	const struct intel_02_cache_info *i1;
105	const struct intel_02_cache_info *i2;
106
107	i1 = (const struct intel_02_cache_info *) p1;
108	i2 = (const struct intel_02_cache_info *) p2;
109
110	if (i1->idx == i2->idx)
111	return `0`;
112
113	return i1->idx < i2->idx ? -`1` : `1`;
114	}
115
116
117	static long int
118	__attribute__ ((noinline))
119	intel_check_word (int name, unsigned int value, bool *has_level_2,
120	bool *no_level_2_or_3,
121	const struct cpu_features *cpu_features)
122	{
123	if ((value & `0x80000000`) != `0`)
124	/ The register value is reserved. /
125	return `0`;
126
127	/ Fold the name. The _SC_ constants are always in the order SIZE,*
128	ASSOC, LINESIZE. /*
129	int folded_rel_name = (M(name) / `3`) * `3`;
130
131	while (value != `0`)
132	{
133	unsigned int byte = value & `0xff`;
134
135	if (byte == `0x40`)
136	{
137	*no_level_2_or_3 = true;
138
139	if (folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
140	/ No need to look further. /
141	break;
142	}
143	else if (byte == `0xff`)
144	{
145	/ CPUID leaf 0x4 contains all the information. We need to*
146	iterate over it. /*
147	unsigned int eax;
148	unsigned int ebx;
149	unsigned int ecx;
150	unsigned int edx;
151
152	unsigned int round = `0`;
153	while (`1`)
154	{
155	__cpuid_count (`4`, round, eax, ebx, ecx, edx);
156
157	enum { null = `0`, data = `1`, inst = `2`, uni = `3` } type = eax & `0x1f`;
158	if (type == null)
159	/ That was the end. /
160	break;
161
162	unsigned int level = (eax >> `5`) & `0x7`;
163
164	if ((level == `1` && type == data
165	&& folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
166	\|\| (level == `1` && type == inst
167	&& folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
168	\|\| (level == `2` && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
169	\|\| (level == `3` && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
170	\|\| (level == `4` && folded_rel_name == M(_SC_LEVEL4_CACHE_SIZE)))
171	{
172	unsigned int offset = M(name) - folded_rel_name;
173
174	if (offset == `0`)
175	/ Cache size. /
176	return (((ebx >> `22`) + `1`)
177	* (((ebx >> `12`) & `0x3ff`) + `1`)
178	* ((ebx & `0xfff`) + `1`)
179	* (ecx + `1`));
180	if (offset == `1`)
181	return (ebx >> `22`) + `1`;
182
183	assert (offset == `2`);
184	return (ebx & `0xfff`) + `1`;
185	}
186
187	++round;
188	}
189	/ There is no other cache information anywhere else. /
190	break;
191	}
192	else
193	{
194	if (byte == `0x49` && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
195	{
196	/ Intel reused this value. For family 15, model 6 it*
197	specifies the 3rd level cache. Otherwise the 2nd
198	level cache. /*
199	unsigned int family = cpu_features->basic.family;
200	unsigned int model = cpu_features->basic.model;
201
202	if (family == `15` && model == `6`)
203	{
204	/ The level 3 cache is encoded for this model like*
205	the level 2 cache is for other models. Pretend
206	the caller asked for the level 2 cache. /*
207	name = (_SC_LEVEL2_CACHE_SIZE
208	+ (name - _SC_LEVEL3_CACHE_SIZE));
209	folded_rel_name = M(_SC_LEVEL2_CACHE_SIZE);
210	}
211	}
212
213	struct intel_02_cache_info *found;
214	struct intel_02_cache_info search;
215
216	search.idx = byte;
217	found = bsearch (&search, intel_02_known, nintel_02_known,
218	sizeof (intel_02_known[`0`]), intel_02_known_compare);
219	if (found != NULL)
220	{
221	if (found->rel_name == folded_rel_name)
222	{
223	unsigned int offset = M(name) - folded_rel_name;
224
225	if (offset == `0`)
226	/ Cache size. /
227	return found->size;
228	if (offset == `1`)
229	return found->assoc;
230
231	assert (offset == `2`);
232	return found->linesize;
233	}
234
235	if (found->rel_name == M(_SC_LEVEL2_CACHE_SIZE))
236	*has_level_2 = true;
237	}
238	}
239
240	/ Next byte for the next round. /
241	value >>= `8`;
242	}
243
244	/ Nothing found. /
245	return `0`;
246	}
247
248
249	static long int __attribute__ ((noinline))
250	handle_intel (int name, const struct cpu_features *cpu_features)
251	{
252	unsigned int maxidx = cpu_features->basic.max_cpuid;
253
254	/ Return -1 for older CPUs. /
255	if (maxidx < `2`)
256	return -`1`;
257
258	/ OK, we can use the CPUID instruction to get all info about the*
259	caches. /*
260	unsigned int cnt = `0`;
261	unsigned int max = `1`;
262	long int result = `0`;
263	bool no_level_2_or_3 = false;
264	bool has_level_2 = false;
265
266	while (cnt++ < max)
267	{
268	unsigned int eax;
269	unsigned int ebx;
270	unsigned int ecx;
271	unsigned int edx;
272	__cpuid (`2`, eax, ebx, ecx, edx);
273
274	/ The low byte of EAX in the first round contain the number of*
275	rounds we have to make. At least one, the one we are already
276	doing. /*
277	if (cnt == `1`)
278	{
279	max = eax & `0xff`;
280	eax &= `0xffffff00`;
281	}
282
283	/ Process the individual registers' value. /
284	result = intel_check_word (name, value: eax, has_level_2: &has_level_2,
285	no_level_2_or_3: &no_level_2_or_3, cpu_features);
286	if (result != `0`)
287	return result;
288
289	result = intel_check_word (name, value: ebx, has_level_2: &has_level_2,
290	no_level_2_or_3: &no_level_2_or_3, cpu_features);
291	if (result != `0`)
292	return result;
293
294	result = intel_check_word (name, value: ecx, has_level_2: &has_level_2,
295	no_level_2_or_3: &no_level_2_or_3, cpu_features);
296	if (result != `0`)
297	return result;
298
299	result = intel_check_word (name, value: edx, has_level_2: &has_level_2,
300	no_level_2_or_3: &no_level_2_or_3, cpu_features);
301	if (result != `0`)
302	return result;
303	}
304
305	if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
306	&& no_level_2_or_3)
307	return -`1`;
308
309	return `0`;
310	}
311
312
/* Translate the 4-bit associativity FIELD used for the L2 and L3 caches
   into a way count.  SIZE and LINESIZE are the already decoded size and
   line-size byte of the same cache; they are only used for FIELD == 15
   (fully associative), where the result is SIZE / LINESIZE.  A zero
   LINESIZE yields 0 instead of dividing by zero.  Unknown encodings
   yield 0.  */
static long int
amd_decode_assoc (unsigned int field, unsigned int size,
                  unsigned int linesize)
{
  switch (field)
    {
    case 0:
    case 1:
    case 2:
    case 4:
      return field;
    case 6:
      return 8;
    case 8:
      return 16;
    case 10:
      return 32;
    case 11:
      return 48;
    case 12:
      return 64;
    case 13:
      return 96;
    case 14:
      return 128;
    case 15:
      if (linesize == 0)
        return 0;
      return size / linesize;
    default:
      return 0;
    }
}

/* Answer the cache query NAME (an _SC_LEVEL* sysconf constant) from
   the extended CPUID leaves 0x80000005 (level 1 caches) and 0x80000006
   (level 2 and 3 caches).

   Returns the requested value, or 0 when NAME refers to a level 4
   cache, when the required leaf is not supported, or when the
   associativity field of the L2/L3 cache is zero.  */
static long int __attribute__ ((noinline))
handle_amd (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  __cpuid (0x80000000, eax, ebx, ecx, edx);

  /* No level 4 cache (yet).  */
  if (name > _SC_LEVEL3_CACHE_LINESIZE)
    return 0;

  /* Leaf 0x80000005 for L1 queries, 0x80000006 for L2 and L3.  */
  unsigned int fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
  if (eax < fn)
    return 0;

  __cpuid (fn, eax, ebx, ecx, edx);

  if (name < _SC_LEVEL1_DCACHE_SIZE)
    {
      /* Instruction cache query: the cases below decode ECX, so map
         the name onto its data cache counterpart and decode EDX.  */
      name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE;
      ecx = edx;
    }

  switch (name)
    {
    case _SC_LEVEL1_DCACHE_SIZE:
      /* Bits 24-31 moved to bits 10-17, i.e. multiplied by 1024.  */
      return (ecx >> 14) & 0x3fc00;

    case _SC_LEVEL1_DCACHE_ASSOC:
      ecx >>= 16;
      if ((ecx & 0xff) == 0xff)
        /* Fully associative.  The value returned is the size field
           scaled by 1024.  */
        return (ecx << 2) & 0x3fc00;
      return ecx & 0xff;

    case _SC_LEVEL1_DCACHE_LINESIZE:
      return ecx & 0xff;

    case _SC_LEVEL2_CACHE_SIZE:
      return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;

    case _SC_LEVEL2_CACHE_ASSOC:
      return amd_decode_assoc ((ecx >> 12) & 0xf,
                               (ecx >> 6) & 0x3fffc00, ecx & 0xff);

    case _SC_LEVEL2_CACHE_LINESIZE:
      return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;

    case _SC_LEVEL3_CACHE_SIZE:
      return (edx & 0xf000) == 0 ? 0 : (edx & 0x3ffc0000) << 1;

    case _SC_LEVEL3_CACHE_ASSOC:
      return amd_decode_assoc ((edx >> 12) & 0xf,
                               (edx & 0x3ffc0000) << 1, edx & 0xff);

    case _SC_LEVEL3_CACHE_LINESIZE:
      return (edx & 0xf000) == 0 ? 0 : edx & 0xff;

    default:
      assert (! "cannot happen");
    }
  return -1;
}
428
429
/* Answer the cache query NAME (an _SC_LEVEL* sysconf constant) by
   enumerating CPUID leaf 4 sub-leaves until one reports cache type 0.
   Only level 1 data, level 1 instruction, level 2 and level 3 caches
   are matched.  Returns the requested size, associativity or line
   size, or 0 when no sub-leaf describes the requested cache.  */
static long int __attribute__ ((noinline))
handle_zhaoxin (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  /* Fold NAME down to the *_SIZE constant of its cache; the _SC_
     constants come in SIZE, ASSOC, LINESIZE triples.  */
  int folded_rel_name = (M(name) / 3) * 3;

  unsigned int round = 0;
  while (1)
    {
      __cpuid_count (4, round, eax, ebx, ecx, edx);

      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
      if (type == null)
        /* No more caches to enumerate.  */
        break;

      unsigned int level = (eax >> 5) & 0x7;

      if ((level == 1 && type == data
           && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
          || (level == 1 && type == inst
              && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
          || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
          || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)))
        {
          /* 0 = SIZE, 1 = ASSOC, 2 = LINESIZE.  */
          unsigned int offset = M(name) - folded_rel_name;

          if (offset == 0)
            /* Cache size.  */
            return (((ebx >> 22) + 1)
                    * (((ebx >> 12) & 0x3ff) + 1)
                    * ((ebx & 0xfff) + 1)
                    * (ecx + 1));
          if (offset == 1)
            return (ebx >> 22) + 1;

          assert (offset == 2);
          return (ebx & 0xfff) + 1;
        }

      ++round;
    }

  /* Nothing found.  */
  return 0;
}
479
480	static void
481	get_common_cache_info (long int shared_ptr, long* int * shared_per_thread_ptr, unsigned int *threads_ptr,
482	long int core)
483	{
484	unsigned int eax;
485	unsigned int ebx;
486	unsigned int ecx;
487	unsigned int edx;
488
489	/ Number of logical processors sharing L2 cache. /
490	int threads_l2;
491
492	/ Number of logical processors sharing L3 cache. /
493	int threads_l3;
494
495	const struct cpu_features *cpu_features = __get_cpu_features ();
496	int max_cpuid = cpu_features->basic.max_cpuid;
497	unsigned int family = cpu_features->basic.family;
498	unsigned int model = cpu_features->basic.model;
499	long int shared = *shared_ptr;
500	long int shared_per_thread = *shared_per_thread_ptr;
501	unsigned int threads = *threads_ptr;
502	bool inclusive_cache = true;
503	bool support_count_mask = true;
504
505	/ Try L3 first. /
506	unsigned int level = `3`;
507
508	if (cpu_features->basic.kind == arch_kind_zhaoxin && family == `6`)
509	support_count_mask = false;
510
511	if (shared <= `0`)
512	{
513	/ Try L2 otherwise. /
514	level = `2`;
515	shared = core;
516	shared_per_thread = core;
517	threads_l2 = `0`;
518	threads_l3 = -`1`;
519	}
520	else
521	{
522	threads_l2 = `0`;
523	threads_l3 = `0`;
524	}
525
526	/ A value of 0 for the HTT bit indicates there is only a single*
527	logical processor. /*
528	if (HAS_CPU_FEATURE (HTT))
529	{
530	/ Figure out the number of logical threads that share the*
531	highest cache level. /*
532	if (max_cpuid >= `4`)
533	{
534	int i = `0`;
535
536	/ Query until cache level 2 and 3 are enumerated. /
537	int check = `0x1` \| (threads_l3 == `0`) << `1`;
538	do
539	{
540	__cpuid_count (`4`, i++, eax, ebx, ecx, edx);
541
542	/ There seems to be a bug in at least some Pentium Ds*
543	which sometimes fail to iterate all cache parameters.
544	Do not loop indefinitely here, stop in this case and
545	assume there is no such information. /*
546	if (cpu_features->basic.kind == arch_kind_intel
547	&& (eax & `0x1f`) == `0` )
548	goto intel_bug_no_cache_info;
549
550	switch ((eax >> `5`) & `0x7`)
551	{
552	default:
553	break;
554	case `2`:
555	if ((check & `0x1`))
556	{
557	/ Get maximum number of logical processors*
558	sharing L2 cache. /*
559	threads_l2 = (eax >> `14`) & `0x3ff`;
560	check &= ~`0x1`;
561	}
562	break;
563	case `3`:
564	if ((check & (`0x1` << `1`)))
565	{
566	/ Get maximum number of logical processors*
567	sharing L3 cache. /*
568	threads_l3 = (eax >> `14`) & `0x3ff`;
569
570	/ Check if L2 and L3 caches are inclusive. /
571	inclusive_cache = (edx & `0x2`) != `0`;
572	check &= ~(`0x1` << `1`);
573	}
574	break;
575	}
576	}
577	while (check);
578
579	/ If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum*
580	numbers of addressable IDs for logical processors sharing
581	the cache, instead of the maximum number of threads
582	sharing the cache. /*
583	if (max_cpuid >= `11` && support_count_mask)
584	{
585	/ Find the number of logical processors shipped in*
586	one core and apply count mask. /*
587	i = `0`;
588
589	/ Count SMT only if there is L3 cache. Always count*
590	core if there is no L3 cache. /*
591	int count = ((threads_l2 > `0` && level == `3`)
592	\| ((threads_l3 > `0`
593	\|\| (threads_l2 > `0` && level == `2`)) << `1`));
594
595	while (count)
596	{
597	__cpuid_count (`11`, i++, eax, ebx, ecx, edx);
598
599	int shipped = ebx & `0xff`;
600	int type = ecx & `0xff00`;
601	if (shipped == `0` \|\| type == `0`)
602	break;
603	else if (type == `0x100`)
604	{
605	/ Count SMT. /
606	if ((count & `0x1`))
607	{
608	int count_mask;
609
610	/ Compute count mask. /
611	asm ("bsr %1, %0"
612	: "=r" (count_mask) : "g" (threads_l2));
613	count_mask = ~(-`1` << (count_mask + `1`));
614	threads_l2 = (shipped - `1`) & count_mask;
615	count &= ~`0x1`;
616	}
617	}
618	else if (type == `0x200`)
619	{
620	/ Count core. /
621	if ((count & (`0x1` << `1`)))
622	{
623	int count_mask;
624	int threads_core
625	= (level == `2` ? threads_l2 : threads_l3);
626
627	/ Compute count mask. /
628	asm ("bsr %1, %0"
629	: "=r" (count_mask) : "g" (threads_core));
630	count_mask = ~(-`1` << (count_mask + `1`));
631	threads_core = (shipped - `1`) & count_mask;
632	if (level == `2`)
633	threads_l2 = threads_core;
634	else
635	threads_l3 = threads_core;
636	count &= ~(`0x1` << `1`);
637	}
638	}
639	}
640	}
641	if (threads_l2 > `0`)
642	threads_l2 += `1`;
643	if (threads_l3 > `0`)
644	threads_l3 += `1`;
645	if (level == `2`)
646	{
647	if (threads_l2)
648	{
649	threads = threads_l2;
650	if (cpu_features->basic.kind == arch_kind_intel
651	&& threads > `2`
652	&& family == `6`)
653	switch (model)
654	{
655	case `0x37`:
656	case `0x4a`:
657	case `0x4d`:
658	case `0x5a`:
659	case `0x5d`:
660	/ Silvermont has L2 cache shared by 2 cores. /
661	threads = `2`;
662	break;
663	default:
664	break;
665	}
666	}
667	}
668	else if (threads_l3)
669	threads = threads_l3;
670	}
671	else
672	{
673	intel_bug_no_cache_info:
674	/ Assume that all logical threads share the highest cache*
675	level. /*
676	threads = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> `16`)
677	& `0xff`);
678	}
679	/ Get per-thread size of highest level cache. /
680	if (shared_per_thread > `0` && threads > `0`)
681	shared_per_thread /= threads;
682	}
683
684	/ Account for non-inclusive L2 and L3 caches. /
685	if (!inclusive_cache)
686	{
687	long int core_per_thread = threads_l2 > `0` ? (core / threads_l2) : core;
688	shared_per_thread += core_per_thread;
689	shared += core;
690	}
691
692	*shared_ptr = shared;
693	*shared_per_thread_ptr = shared_per_thread;
694	*threads_ptr = threads;
695	}
696
697	static void
698	dl_init_cacheinfo (struct cpu_features *cpu_features)
699	{
700	/ Find out what brand of processor. /
701	unsigned int ebx;
702	unsigned int ecx;
703	unsigned int edx;
704	int max_cpuid_ex;
705	long int data = -`1`;
706	long int shared = -`1`;
707	long int shared_per_thread = -`1`;
708	long int core = -`1`;
709	unsigned int threads = `0`;
710	unsigned long int level1_icache_size = -`1`;
711	unsigned long int level1_icache_linesize = -`1`;
712	unsigned long int level1_dcache_size = -`1`;
713	unsigned long int level1_dcache_assoc = -`1`;
714	unsigned long int level1_dcache_linesize = -`1`;
715	unsigned long int level2_cache_size = -`1`;
716	unsigned long int level2_cache_assoc = -`1`;
717	unsigned long int level2_cache_linesize = -`1`;
718	unsigned long int level3_cache_size = -`1`;
719	unsigned long int level3_cache_assoc = -`1`;
720	unsigned long int level3_cache_linesize = -`1`;
721	unsigned long int level4_cache_size = -`1`;
722
723	if (cpu_features->basic.kind == arch_kind_intel)
724	{
725	data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
726	core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
727	shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
728	shared_per_thread = shared;
729
730	level1_icache_size
731	= handle_intel (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
732	level1_icache_linesize
733	= handle_intel (_SC_LEVEL1_ICACHE_LINESIZE, cpu_features);
734	level1_dcache_size = data;
735	level1_dcache_assoc
736	= handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
737	level1_dcache_linesize
738	= handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
739	level2_cache_size = core;
740	level2_cache_assoc
741	= handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
742	level2_cache_linesize
743	= handle_intel (_SC_LEVEL2_CACHE_LINESIZE, cpu_features);
744	level3_cache_size = shared;
745	level3_cache_assoc
746	= handle_intel (_SC_LEVEL3_CACHE_ASSOC, cpu_features);
747	level3_cache_linesize
748	= handle_intel (_SC_LEVEL3_CACHE_LINESIZE, cpu_features);
749	level4_cache_size
750	= handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);
751
752	get_common_cache_info (shared_ptr: &shared, shared_per_thread_ptr: &shared_per_thread, threads_ptr: &threads, core);
753	}
754	else if (cpu_features->basic.kind == arch_kind_zhaoxin)
755	{
756	data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
757	core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
758	shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
759	shared_per_thread = shared;
760
761	level1_icache_size = handle_zhaoxin (_SC_LEVEL1_ICACHE_SIZE);
762	level1_icache_linesize = handle_zhaoxin (_SC_LEVEL1_ICACHE_LINESIZE);
763	level1_dcache_size = data;
764	level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
765	level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
766	level2_cache_size = core;
767	level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
768	level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
769	level3_cache_size = shared;
770	level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
771	level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);
772
773	get_common_cache_info (shared_ptr: &shared, shared_per_thread_ptr: &shared_per_thread, threads_ptr: &threads, core);
774	}
775	else if (cpu_features->basic.kind == arch_kind_amd)
776	{
777	data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
778	core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
779	shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
780	shared_per_thread = shared;
781
782	level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
783	level1_icache_linesize = handle_amd (_SC_LEVEL1_ICACHE_LINESIZE);
784	level1_dcache_size = data;
785	level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
786	level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
787	level2_cache_size = core;
788	level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
789	level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
790	level3_cache_size = shared;
791	level3_cache_assoc = handle_amd (_SC_LEVEL3_CACHE_ASSOC);
792	level3_cache_linesize = handle_amd (_SC_LEVEL3_CACHE_LINESIZE);
793
794	/ Get maximum extended function. /
795	__cpuid (`0x80000000`, max_cpuid_ex, ebx, ecx, edx);
796
797	if (shared <= `0`)
798	{
799	/ No shared L3 cache. All we have is the L2 cache. /
800	shared = core;
801	shared_per_thread = core;
802	}
803	else
804	{
805	/ Figure out the number of logical threads that share L3. /
806	if (max_cpuid_ex >= `0x80000008`)
807	{
808	/ Get width of APIC ID. /
809	__cpuid (`0x80000008`, max_cpuid_ex, ebx, ecx, edx);
810	threads = `1` << ((ecx >> `12`) & `0x0f`);
811	}
812
813	if (threads == `0` \|\| cpu_features->basic.family >= `0x17`)
814	{
815	/ If APIC ID width is not available, use logical*
816	processor count. /*
817	__cpuid (`0x00000001`, max_cpuid_ex, ebx, ecx, edx);
818
819	if ((edx & (`1` << `28`)) != `0`)
820	threads = (ebx >> `16`) & `0xff`;
821	}
822
823	/ Cap usage of highest cache level to the number of*
824	supported threads. /*
825	if (threads > `0`)
826	shared_per_thread /= threads;
827
828	/ Get shared cache per ccx for Zen architectures. /
829	if (cpu_features->basic.family >= `0x17`)
830	{
831	unsigned int eax;
832
833	/ Get number of threads share the L3 cache in CCX. /
834	__cpuid_count (`0x8000001D`, `0x3`, eax, ebx, ecx, edx);
835
836	unsigned int threads_per_ccx = ((eax >> `14`) & `0xfff`) + `1`;
837	shared_per_thread *= threads_per_ccx;
838	}
839	else
840	{
841	/ Account for exclusive L2 and L3 caches. /
842	shared += core;
843	shared_per_thread += core;
844	}
845	}
846	}
847
848	cpu_features->level1_icache_size = level1_icache_size;
849	cpu_features->level1_icache_linesize = level1_icache_linesize;
850	cpu_features->level1_dcache_size = level1_dcache_size;
851	cpu_features->level1_dcache_assoc = level1_dcache_assoc;
852	cpu_features->level1_dcache_linesize = level1_dcache_linesize;
853	cpu_features->level2_cache_size = level2_cache_size;
854	cpu_features->level2_cache_assoc = level2_cache_assoc;
855	cpu_features->level2_cache_linesize = level2_cache_linesize;
856	cpu_features->level3_cache_size = level3_cache_size;
857	cpu_features->level3_cache_assoc = level3_cache_assoc;
858	cpu_features->level3_cache_linesize = level3_cache_linesize;
859	cpu_features->level4_cache_size = level4_cache_size;
860
861	/ The default setting for the non_temporal threshold is 1/4 of size*
862	of the chip's cache. For most Intel and AMD processors with an
863	initial release date between 2017 and 2023, a thread's typical
864	share of the cache is from 18-64MB. Using the 1/4 L3 is meant to
865	estimate the point where non-temporal stores begin out-competing
866	REP MOVSB. As well the point where the fact that non-temporal
867	stores are forced back to main memory would already occurred to the
868	majority of the lines in the copy. Note, concerns about the
869	entire L3 cache being evicted by the copy are mostly alleviated
870	by the fact that modern HW detects streaming patterns and
871	provides proper LRU hints so that the maximum thrashing
872	capped at 1/associativity. /*
873	unsigned long int non_temporal_threshold = shared / `4`;
874
875	/ If the computed non_temporal_threshold <= 3/4 * per-thread L3, we most*
876	likely have incorrect/incomplete cache info in which case, default to
877	3/4 per-thread L3 to avoid regressions. /
878	unsigned long int non_temporal_threshold_lowbound
879	= shared_per_thread * `3` / `4`;
880	if (non_temporal_threshold < non_temporal_threshold_lowbound)
881	non_temporal_threshold = non_temporal_threshold_lowbound;
882
883	/ If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run*
884	a higher risk of actually thrashing the cache as they don't have a HW LRU
885	hint. As well, their performance in highly parallel situations is
886	noticeably worse. /*
887	if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
888	non_temporal_threshold = non_temporal_threshold_lowbound;
889	/ SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of*
890	'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
891	if that operation cannot overflow. Minimum of 0x4040 (16448) because the
892	L(large_memset_4x) loops need 64-byte to cache align and enough space for
893	at least 1 iteration of 4x PAGE_SIZE unrolled loop. Both values are
894	reflected in the manual. /*
895	unsigned long int maximum_non_temporal_threshold = SIZE_MAX >> `4`;
896	unsigned long int minimum_non_temporal_threshold = `0x4040`;
897	if (non_temporal_threshold < minimum_non_temporal_threshold)
898	non_temporal_threshold = minimum_non_temporal_threshold;
899	else if (non_temporal_threshold > maximum_non_temporal_threshold)
900	non_temporal_threshold = maximum_non_temporal_threshold;
901
902	#if HAVE_TUNABLES
903	/ NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. /
904	unsigned int minimum_rep_movsb_threshold;
905	#endif
906	/ NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for*
907	VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB
908	threshold is 2048 (VEC_SIZE / 16). /
909	unsigned int rep_movsb_threshold;
910	if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
911	&& !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
912	{
913	rep_movsb_threshold = `4096` * (`64` / `16`);
914	#if HAVE_TUNABLES
915	minimum_rep_movsb_threshold = `64` * `8`;
916	#endif
917	}
918	else if (CPU_FEATURE_PREFERRED_P (cpu_features,
919	AVX_Fast_Unaligned_Load))
920	{
921	rep_movsb_threshold = `4096` * (`32` / `16`);
922	#if HAVE_TUNABLES
923	minimum_rep_movsb_threshold = `32` * `8`;
924	#endif
925	}
926	else
927	{
928	rep_movsb_threshold = `2048` * (`16` / `16`);
929	#if HAVE_TUNABLES
930	minimum_rep_movsb_threshold = `16` * `8`;
931	#endif
932	}
933	/ NB: The default REP MOVSB threshold is 2112 on processors with fast*
934	short REP MOVSB (FSRM). /*
935	if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
936	rep_movsb_threshold = `2112`;
937
938	/ The default threshold to use Enhanced REP STOSB. /
939	unsigned long int rep_stosb_threshold = `2048`;
940
941	#if HAVE_TUNABLES
942	long int tunable_size;
943
944	tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL);
945	/ NB: Ignore the default value 0. /
946	if (tunable_size != `0`)
947	data = tunable_size;
948
949	tunable_size = TUNABLE_GET (x86_shared_cache_size, long int, NULL);
950	/ NB: Ignore the default value 0. /
951	if (tunable_size != `0`)
952	shared = tunable_size;
953
954	tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
955	if (tunable_size > minimum_non_temporal_threshold
956	&& tunable_size <= maximum_non_temporal_threshold)
957	non_temporal_threshold = tunable_size;
958
959	tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
960	if (tunable_size > minimum_rep_movsb_threshold)
961	rep_movsb_threshold = tunable_size;
962
963	/ NB: The default value of the x86_rep_stosb_threshold tunable is the*
964	same as the default value of __x86_rep_stosb_threshold and the
965	minimum value is fixed. /*
966	rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
967	long int, NULL);
968
969	TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, `0`, SIZE_MAX);
970	TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, `0`, SIZE_MAX);
971	TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
972	minimum_non_temporal_threshold,
973	maximum_non_temporal_threshold);
974	TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
975	minimum_rep_movsb_threshold, SIZE_MAX);
976	TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, `1`,
977	SIZE_MAX);
978	#endif
979
980	unsigned long int rep_movsb_stop_threshold;
981	/ ERMS feature is implemented from AMD Zen3 architecture and it is*
982	performing poorly for data above L2 cache size. Henceforth, adding
983	an upper bound threshold parameter to limit the usage of Enhanced
984	REP MOVSB operations and setting its value to L2 cache size. /*
985	if (cpu_features->basic.kind == arch_kind_amd)
986	rep_movsb_stop_threshold = core;
987	/ Setting the upper bound of ERMS to the computed value of*
988	non-temporal threshold for architectures other than AMD. /*
989	else
990	rep_movsb_stop_threshold = non_temporal_threshold;
991
992	cpu_features->data_cache_size = data;
993	cpu_features->shared_cache_size = shared;
994	cpu_features->non_temporal_threshold = non_temporal_threshold;
995	cpu_features->rep_movsb_threshold = rep_movsb_threshold;
996	cpu_features->rep_stosb_threshold = rep_stosb_threshold;
997	cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
998	}
999

source code of glibc/sysdeps/x86/dl-cacheinfo.h