/* Initialize x86 cache info.
   Copyright (C) 2020-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

static const struct intel_02_cache_info
{
  unsigned char idx;
  unsigned char assoc;
  unsigned char linesize;
  unsigned char rel_name;
  unsigned int size;
} intel_02_known [] =
  {
#define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE)
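    /* M() rebases the _SC_LEVEL*_CACHE_* constants so that rel_name is a
       small offset from _SC_LEVEL1_ICACHE_SIZE and fits in an unsigned
       char.  */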
    { 0x06,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    8192 },
    { 0x08,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   16384 },
    { 0x09,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
    { 0x0a,  2, 32, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
    { 0x0c,  4, 32, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x0d,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x0e,  6, 64, M(_SC_LEVEL1_DCACHE_SIZE),   24576 },
    { 0x21,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x22,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
    { 0x23,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0x25,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0x29,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0x2c,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
    { 0x30,  8, 64, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
    { 0x39,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x3a,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   196608 },
    { 0x3b,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x3c,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x3d,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   393216 },
    { 0x3e,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x3f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x41,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x42,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x43,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x44,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x45,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x46,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0x47,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE),  3145728 },
    { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE),  4194304 },
    { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  6291456 },
    { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
    { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 16777216 },
    { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE),  6291456 },
    { 0x60,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x66,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
    { 0x67,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x68,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
    { 0x78,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x79,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x7a,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x7b,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x7c,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x7d,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x7f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x80,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x82,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x83,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x84,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x85,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x86,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x87,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0xd0,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
    { 0xd1,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0xd2,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xd6,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0xd7,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xd8,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
    { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 18874368 },
    { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 25165824 },
  };

#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))
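
/* intel_02_known is kept sorted by the idx field so that the bsearch call
   in intel_check_word can look descriptors up using
   intel_02_known_compare.  */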

static int
intel_02_known_compare (const void *p1, const void *p2)
{
  const struct intel_02_cache_info *i1;
  const struct intel_02_cache_info *i2;

  i1 = (const struct intel_02_cache_info *) p1;
  i2 = (const struct intel_02_cache_info *) p2;

  if (i1->idx == i2->idx)
    return 0;

  return i1->idx < i2->idx ? -1 : 1;
}


static long int
__attribute__ ((noinline))
intel_check_word (int name, unsigned int value, bool *has_level_2,
                  bool *no_level_2_or_3,
                  const struct cpu_features *cpu_features)
{
  if ((value & 0x80000000) != 0)
    /* The register value is reserved.  */
    return 0;

  /* Fold the name.  The _SC_ constants are always in the order SIZE,
     ASSOC, LINESIZE.  */
  int folded_rel_name = (M(name) / 3) * 3;

  while (value != 0)
    {
      unsigned int byte = value & 0xff;

      if (byte == 0x40)
        {
          *no_level_2_or_3 = true;

          if (folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
            /* No need to look further.  */
            break;
        }
      else if (byte == 0xff)
        {
          /* CPUID leaf 0x4 contains all the information.  We need to
             iterate over it.  */
          unsigned int eax;
          unsigned int ebx;
          unsigned int ecx;
          unsigned int edx;

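          /* Deterministic cache parameters (CPUID leaf 4): EAX[4:0] is the
             cache type, EAX[7:5] the cache level, EBX[31:22] ways - 1,
             EBX[21:12] physical line partitions - 1, EBX[11:0] the line
             size - 1 and ECX the number of sets - 1.  */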
          unsigned int round = 0;
          while (1)
            {
              __cpuid_count (4, round, eax, ebx, ecx, edx);

              enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
              if (type == null)
                /* That was the end.  */
                break;

              unsigned int level = (eax >> 5) & 0x7;

              if ((level == 1 && type == data
                   && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
                  || (level == 1 && type == inst
                      && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
                  || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
                  || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
                  || (level == 4 && folded_rel_name == M(_SC_LEVEL4_CACHE_SIZE)))
                {
                  unsigned int offset = M(name) - folded_rel_name;

                  if (offset == 0)
                    /* Cache size.  */
                    return (((ebx >> 22) + 1)
                            * (((ebx >> 12) & 0x3ff) + 1)
                            * ((ebx & 0xfff) + 1)
                            * (ecx + 1));
                  if (offset == 1)
                    return (ebx >> 22) + 1;

                  assert (offset == 2);
                  return (ebx & 0xfff) + 1;
                }

              ++round;
            }
          /* There is no other cache information anywhere else.  */
          break;
        }
      else
        {
          if (byte == 0x49 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
            {
              /* Intel reused this value.  For family 15, model 6 it
                 specifies the 3rd level cache.  Otherwise the 2nd
                 level cache.  */
              unsigned int family = cpu_features->basic.family;
              unsigned int model = cpu_features->basic.model;

              if (family == 15 && model == 6)
                {
                  /* The level 3 cache is encoded for this model like
                     the level 2 cache is for other models.  Pretend
                     the caller asked for the level 2 cache.  */
                  name = (_SC_LEVEL2_CACHE_SIZE
                          + (name - _SC_LEVEL3_CACHE_SIZE));
                  folded_rel_name = M(_SC_LEVEL2_CACHE_SIZE);
                }
            }

          struct intel_02_cache_info *found;
          struct intel_02_cache_info search;

          search.idx = byte;
          found = bsearch (&search, intel_02_known, nintel_02_known,
                           sizeof (intel_02_known[0]), intel_02_known_compare);
          if (found != NULL)
            {
              if (found->rel_name == folded_rel_name)
                {
                  unsigned int offset = M(name) - folded_rel_name;

                  if (offset == 0)
                    /* Cache size.  */
                    return found->size;
                  if (offset == 1)
                    return found->assoc;

                  assert (offset == 2);
                  return found->linesize;
                }

              if (found->rel_name == M(_SC_LEVEL2_CACHE_SIZE))
                *has_level_2 = true;
            }
        }

      /* Next byte for the next round.  */
      value >>= 8;
    }

  /* Nothing found.  */
  return 0;
}


static long int __attribute__ ((noinline))
handle_intel (int name, const struct cpu_features *cpu_features)
{
  unsigned int maxidx = cpu_features->basic.max_cpuid;

  /* Return -1 for older CPUs.  */
  if (maxidx < 2)
    return -1;

  /* OK, we can use the CPUID instruction to get all info about the
     caches.  */
  unsigned int cnt = 0;
  unsigned int max = 1;
  long int result = 0;
  bool no_level_2_or_3 = false;
  bool has_level_2 = false;

  while (cnt++ < max)
    {
      unsigned int eax;
      unsigned int ebx;
      unsigned int ecx;
      unsigned int edx;
      __cpuid (2, eax, ebx, ecx, edx);

      /* The low byte of EAX in the first round contains the number of
         rounds we have to make.  At least one, the one we are already
         doing.  */
      if (cnt == 1)
        {
          max = eax & 0xff;
          eax &= 0xffffff00;
        }

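      /* CPUID leaf 2 packs up to four one-byte cache/TLB descriptors into
         each of EAX, EBX, ECX and EDX; a register whose bit 31 is set
         carries no valid information, which intel_check_word tests for.  */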
      /* Process the individual registers' value.  */
      result = intel_check_word (name, eax, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, ebx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, ecx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, edx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;
    }

  if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
      && no_level_2_or_3)
    return -1;

  return 0;
}


static long int __attribute__ ((noinline))
handle_amd (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  __cpuid (0x80000000, eax, ebx, ecx, edx);

  /* No level 4 cache (yet).  */
  if (name > _SC_LEVEL3_CACHE_LINESIZE)
    return 0;

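  /* CPUID leaf 0x80000005 reports the L1 caches and leaf 0x80000006 the
     L2 and L3 caches, so pick the leaf from the requested cache level.  */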
  unsigned int fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
  if (eax < fn)
    return 0;

  __cpuid (fn, eax, ebx, ecx, edx);

  if (name < _SC_LEVEL1_DCACHE_SIZE)
    {
      name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE;
      ecx = edx;
    }

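  /* For leaf 0x80000005, ECX describes the L1 data cache and EDX the L1
     instruction cache (size in KB in bits 31:24, associativity in bits
     23:16, line size in bits 7:0); the remapping above lets the L1I
     queries reuse the L1D cases.  For leaf 0x80000006, ECX describes the
     L2 cache (size in KB in bits 31:16, encoded associativity in bits
     15:12, line size in bits 7:0) and EDX the L3 cache (size in 512 KB
     units in bits 31:18, with the same associativity and line size
     fields).  */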
  switch (name)
    {
    case _SC_LEVEL1_DCACHE_SIZE:
      return (ecx >> 14) & 0x3fc00;

    case _SC_LEVEL1_DCACHE_ASSOC:
      ecx >>= 16;
      if ((ecx & 0xff) == 0xff)
        /* Fully associative.  */
        return (ecx << 2) & 0x3fc00;
      return ecx & 0xff;

    case _SC_LEVEL1_DCACHE_LINESIZE:
      return ecx & 0xff;

    case _SC_LEVEL2_CACHE_SIZE:
      return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;

    case _SC_LEVEL2_CACHE_ASSOC:
      switch ((ecx >> 12) & 0xf)
        {
        case 0:
        case 1:
        case 2:
        case 4:
          return (ecx >> 12) & 0xf;
        case 6:
          return 8;
        case 8:
          return 16;
        case 10:
          return 32;
        case 11:
          return 48;
        case 12:
          return 64;
        case 13:
          return 96;
        case 14:
          return 128;
        case 15:
          return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff);
        default:
          return 0;
        }
      /* NOTREACHED */

    case _SC_LEVEL2_CACHE_LINESIZE:
      return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;

    case _SC_LEVEL3_CACHE_SIZE:
      return (edx & 0xf000) == 0 ? 0 : (edx & 0x3ffc0000) << 1;

    case _SC_LEVEL3_CACHE_ASSOC:
      switch ((edx >> 12) & 0xf)
        {
        case 0:
        case 1:
        case 2:
        case 4:
          return (edx >> 12) & 0xf;
        case 6:
          return 8;
        case 8:
          return 16;
        case 10:
          return 32;
        case 11:
          return 48;
        case 12:
          return 64;
        case 13:
          return 96;
        case 14:
          return 128;
        case 15:
          return ((edx & 0x3ffc0000) << 1) / (edx & 0xff);
        default:
          return 0;
        }
      /* NOTREACHED */

    case _SC_LEVEL3_CACHE_LINESIZE:
      return (edx & 0xf000) == 0 ? 0 : edx & 0xff;

    default:
      assert (! "cannot happen");
    }
  return -1;
}


static long int __attribute__ ((noinline))
handle_zhaoxin (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  int folded_rel_name = (M(name) / 3) * 3;

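  /* The loop below walks CPUID leaf 4 using the same register layout that
     intel_check_word decodes above.  */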
  unsigned int round = 0;
  while (1)
    {
      __cpuid_count (4, round, eax, ebx, ecx, edx);

      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
      if (type == null)
        break;

      unsigned int level = (eax >> 5) & 0x7;

      if ((level == 1 && type == data
           && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
          || (level == 1 && type == inst
              && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
          || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
          || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)))
        {
          unsigned int offset = M(name) - folded_rel_name;

          if (offset == 0)
            /* Cache size.  */
            return (((ebx >> 22) + 1)
                    * (((ebx >> 12) & 0x3ff) + 1)
                    * ((ebx & 0xfff) + 1)
                    * (ecx + 1));
          if (offset == 1)
            return (ebx >> 22) + 1;

          assert (offset == 2);
          return (ebx & 0xfff) + 1;
        }

      ++round;
    }

  /* Nothing found.  */
  return 0;
}

static void
get_common_cache_info (long int *shared_ptr, long int *shared_per_thread_ptr,
                       unsigned int *threads_ptr, long int core)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  /* Number of logical processors sharing L2 cache.  */
  int threads_l2;

  /* Number of logical processors sharing L3 cache.  */
  int threads_l3;

  const struct cpu_features *cpu_features = __get_cpu_features ();
  int max_cpuid = cpu_features->basic.max_cpuid;
  unsigned int family = cpu_features->basic.family;
  unsigned int model = cpu_features->basic.model;
  long int shared = *shared_ptr;
  long int shared_per_thread = *shared_per_thread_ptr;
  unsigned int threads = *threads_ptr;
  bool inclusive_cache = true;
  bool support_count_mask = true;

  /* Try L3 first.  */
  unsigned int level = 3;

  if (cpu_features->basic.kind == arch_kind_zhaoxin && family == 6)
    support_count_mask = false;

  if (shared <= 0)
    {
      /* Try L2 otherwise.  */
      level = 2;
      shared = core;
      shared_per_thread = core;
      threads_l2 = 0;
      threads_l3 = -1;
    }
  else
    {
      threads_l2 = 0;
      threads_l3 = 0;
    }

  /* A value of 0 for the HTT bit indicates there is only a single
     logical processor.  */
  if (HAS_CPU_FEATURE (HTT))
    {
      /* Figure out the number of logical threads that share the
         highest cache level.  */
      if (max_cpuid >= 4)
        {
          int i = 0;

          /* Query until cache level 2 and 3 are enumerated.  */
          int check = 0x1 | (threads_l3 == 0) << 1;
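          /* Bit 0 of CHECK tracks the pending L2 query; bit 1 tracks the
             pending L3 query and is only set when an L3 cache was detected
             (threads_l3 == 0 rather than -1).  */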
          do
            {
              __cpuid_count (4, i++, eax, ebx, ecx, edx);

              /* There seems to be a bug in at least some Pentium Ds
                 which sometimes fail to iterate all cache parameters.
                 Do not loop indefinitely here, stop in this case and
                 assume there is no such information.  */
              if (cpu_features->basic.kind == arch_kind_intel
                  && (eax & 0x1f) == 0)
                goto intel_bug_no_cache_info;

              switch ((eax >> 5) & 0x7)
                {
                default:
                  break;
                case 2:
                  if ((check & 0x1))
                    {
                      /* Get maximum number of logical processors
                         sharing L2 cache.  */
                      threads_l2 = (eax >> 14) & 0x3ff;
                      check &= ~0x1;
                    }
                  break;
                case 3:
                  if ((check & (0x1 << 1)))
                    {
                      /* Get maximum number of logical processors
                         sharing L3 cache.  */
                      threads_l3 = (eax >> 14) & 0x3ff;

                      /* Check if L2 and L3 caches are inclusive.  */
                      inclusive_cache = (edx & 0x2) != 0;
                      check &= ~(0x1 << 1);
                    }
                  break;
                }
            }
          while (check);

          /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
             numbers of addressable IDs for logical processors sharing
             the cache, instead of the maximum number of threads
             sharing the cache.  */
          if (max_cpuid >= 11 && support_count_mask)
            {
              /* Find the number of logical processors shipped in
                 one core and apply count mask.  */
              i = 0;

              /* Count SMT only if there is L3 cache.  Always count
                 core if there is no L3 cache.  */
              int count = ((threads_l2 > 0 && level == 3)
                           | ((threads_l3 > 0
                               || (threads_l2 > 0 && level == 2)) << 1));

              while (count)
                {
                  __cpuid_count (11, i++, eax, ebx, ecx, edx);

                  int shipped = ebx & 0xff;
                  int type = ecx & 0xff00;
                  if (shipped == 0 || type == 0)
                    break;
                  else if (type == 0x100)
                    {
                      /* Count SMT.  */
                      if ((count & 0x1))
                        {
                          int count_mask;

                          /* Compute count mask.  */
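                          /* BSR gives the index of the highest set bit of
                             THREADS_L2, so the mask covers the full width
                             of the addressable-ID field and (shipped - 1)
                             is truncated to that width.  */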
                          asm ("bsr %1, %0"
                               : "=r" (count_mask) : "g" (threads_l2));
                          count_mask = ~(-1 << (count_mask + 1));
                          threads_l2 = (shipped - 1) & count_mask;
                          count &= ~0x1;
                        }
                    }
                  else if (type == 0x200)
                    {
                      /* Count core.  */
                      if ((count & (0x1 << 1)))
                        {
                          int count_mask;
                          int threads_core
                            = (level == 2 ? threads_l2 : threads_l3);

                          /* Compute count mask.  */
                          asm ("bsr %1, %0"
                               : "=r" (count_mask) : "g" (threads_core));
                          count_mask = ~(-1 << (count_mask + 1));
                          threads_core = (shipped - 1) & count_mask;
                          if (level == 2)
                            threads_l2 = threads_core;
                          else
                            threads_l3 = threads_core;
                          count &= ~(0x1 << 1);
                        }
                    }
                }
            }
          if (threads_l2 > 0)
            threads_l2 += 1;
          if (threads_l3 > 0)
            threads_l3 += 1;
          if (level == 2)
            {
              if (threads_l2)
                {
                  threads = threads_l2;
                  if (cpu_features->basic.kind == arch_kind_intel
                      && threads > 2
                      && family == 6)
                    switch (model)
                      {
                      case 0x37:
                      case 0x4a:
                      case 0x4d:
                      case 0x5a:
                      case 0x5d:
                        /* Silvermont has L2 cache shared by 2 cores.  */
                        threads = 2;
                        break;
                      default:
                        break;
                      }
                }
            }
          else if (threads_l3)
            threads = threads_l3;
        }
      else
        {
        intel_bug_no_cache_info:
          /* Assume that all logical threads share the highest cache
             level.  */
          threads = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16)
                     & 0xff);
        }
      /* Get per-thread size of highest level cache.  */
      if (shared_per_thread > 0 && threads > 0)
        shared_per_thread /= threads;
    }

  /* Account for non-inclusive L2 and L3 caches.  */
  if (!inclusive_cache)
    {
      long int core_per_thread = threads_l2 > 0 ? (core / threads_l2) : core;
      shared_per_thread += core_per_thread;
      shared += core;
    }

  *shared_ptr = shared;
  *shared_per_thread_ptr = shared_per_thread;
  *threads_ptr = threads;
}

static void
dl_init_cacheinfo (struct cpu_features *cpu_features)
{
  /* Find out what brand of processor.  */
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  int max_cpuid_ex;
  long int data = -1;
  long int shared = -1;
  long int shared_per_thread = -1;
  long int core = -1;
  unsigned int threads = 0;
  unsigned long int level1_icache_size = -1;
  unsigned long int level1_icache_linesize = -1;
  unsigned long int level1_dcache_size = -1;
  unsigned long int level1_dcache_assoc = -1;
  unsigned long int level1_dcache_linesize = -1;
  unsigned long int level2_cache_size = -1;
  unsigned long int level2_cache_assoc = -1;
  unsigned long int level2_cache_linesize = -1;
  unsigned long int level3_cache_size = -1;
  unsigned long int level3_cache_assoc = -1;
  unsigned long int level3_cache_linesize = -1;
  unsigned long int level4_cache_size = -1;

  if (cpu_features->basic.kind == arch_kind_intel)
    {
      data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
      core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
      shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
      shared_per_thread = shared;

      level1_icache_size
        = handle_intel (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
      level1_icache_linesize
        = handle_intel (_SC_LEVEL1_ICACHE_LINESIZE, cpu_features);
      level1_dcache_size = data;
      level1_dcache_assoc
        = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
      level1_dcache_linesize
        = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
      level2_cache_size = core;
      level2_cache_assoc
        = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
      level2_cache_linesize
        = handle_intel (_SC_LEVEL2_CACHE_LINESIZE, cpu_features);
      level3_cache_size = shared;
      level3_cache_assoc
        = handle_intel (_SC_LEVEL3_CACHE_ASSOC, cpu_features);
      level3_cache_linesize
        = handle_intel (_SC_LEVEL3_CACHE_LINESIZE, cpu_features);
      level4_cache_size
        = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);

      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
    }
  else if (cpu_features->basic.kind == arch_kind_zhaoxin)
    {
      data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
      core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
      shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
      shared_per_thread = shared;

      level1_icache_size = handle_zhaoxin (_SC_LEVEL1_ICACHE_SIZE);
      level1_icache_linesize = handle_zhaoxin (_SC_LEVEL1_ICACHE_LINESIZE);
      level1_dcache_size = data;
      level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
      level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
      level2_cache_size = core;
      level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
      level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
      level3_cache_size = shared;
      level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
      level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);

      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
    }
  else if (cpu_features->basic.kind == arch_kind_amd)
    {
      data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
      core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
      shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
      shared_per_thread = shared;

      level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
      level1_icache_linesize = handle_amd (_SC_LEVEL1_ICACHE_LINESIZE);
      level1_dcache_size = data;
      level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
      level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
      level2_cache_size = core;
      level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
      level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
      level3_cache_size = shared;
      level3_cache_assoc = handle_amd (_SC_LEVEL3_CACHE_ASSOC);
      level3_cache_linesize = handle_amd (_SC_LEVEL3_CACHE_LINESIZE);

      /* Get maximum extended function.  */
      __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);

      if (shared <= 0)
        {
          /* No shared L3 cache.  All we have is the L2 cache.  */
          shared = core;
          shared_per_thread = core;
        }
      else
        {
          /* Figure out the number of logical threads that share L3.  */
          if (max_cpuid_ex >= 0x80000008)
            {
              /* Get width of APIC ID.  */
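              /* CPUID 0x80000008 ECX[15:12] is the APIC/core ID size
                 field; 1 << size is used here as the number of logical
                 processors that can share the L3 cache.  */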
              __cpuid (0x80000008, max_cpuid_ex, ebx, ecx, edx);
              threads = 1 << ((ecx >> 12) & 0x0f);
            }

          if (threads == 0 || cpu_features->basic.family >= 0x17)
            {
              /* If the APIC ID width is not available, use the logical
                 processor count.  */
              __cpuid (0x00000001, max_cpuid_ex, ebx, ecx, edx);

              if ((edx & (1 << 28)) != 0)
                threads = (ebx >> 16) & 0xff;
            }

          /* Cap usage of the highest cache level to the number of
             supported threads.  */
          if (threads > 0)
            shared_per_thread /= threads;

          /* Get the shared cache size per CCX for Zen architectures.  */
          if (cpu_features->basic.family >= 0x17)
            {
              unsigned int eax;

              /* Get the number of threads that share the L3 cache in
                 a CCX.  */
              __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx);

              unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1;
              shared_per_thread *= threads_per_ccx;
            }
          else
            {
              /* Account for exclusive L2 and L3 caches.  */
              shared += core;
              shared_per_thread += core;
            }
        }
    }

  cpu_features->level1_icache_size = level1_icache_size;
  cpu_features->level1_icache_linesize = level1_icache_linesize;
  cpu_features->level1_dcache_size = level1_dcache_size;
  cpu_features->level1_dcache_assoc = level1_dcache_assoc;
  cpu_features->level1_dcache_linesize = level1_dcache_linesize;
  cpu_features->level2_cache_size = level2_cache_size;
  cpu_features->level2_cache_assoc = level2_cache_assoc;
  cpu_features->level2_cache_linesize = level2_cache_linesize;
  cpu_features->level3_cache_size = level3_cache_size;
  cpu_features->level3_cache_assoc = level3_cache_assoc;
  cpu_features->level3_cache_linesize = level3_cache_linesize;
  cpu_features->level4_cache_size = level4_cache_size;

  /* The default setting for the non_temporal threshold is 1/4 of the size
     of the chip's cache.  For most Intel and AMD processors with an
     initial release date between 2017 and 2023, a thread's typical share
     of the cache is from 18-64 MB.  Using 1/4 of the L3 is meant to
     estimate the point where non-temporal stores begin to out-compete
     REP MOVSB.  It is also roughly the point where, for the majority of
     the lines in the copy, the cost of non-temporal stores being forced
     back to main memory would already have been paid.  Note, concerns
     about the entire L3 cache being evicted by the copy are mostly
     alleviated by the fact that modern HW detects streaming patterns and
     provides proper LRU hints so that the maximum thrashing is capped at
     1/associativity.  */
  unsigned long int non_temporal_threshold = shared / 4;

  /* If the computed non_temporal_threshold is below 3/4 of the per-thread
     L3 size, we most likely have incorrect/incomplete cache info, in which
     case default to 3/4 of the per-thread L3 to avoid regressions.  */
  unsigned long int non_temporal_threshold_lowbound
    = shared_per_thread * 3 / 4;
  if (non_temporal_threshold < non_temporal_threshold_lowbound)
    non_temporal_threshold = non_temporal_threshold_lowbound;

  /* If there is no ERMS, use the per-thread L3 chunking.  Normal cacheable
     stores run a higher risk of actually thrashing the cache as they don't
     have a HW LRU hint.  In addition, their performance in highly parallel
     situations is noticeably worse.  */
  if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
    non_temporal_threshold = non_temporal_threshold_lowbound;
  /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value
     of 'x86_non_temporal_threshold' by LOG_4X_MEMCPY_THRESH (4), and it is
     best if that operation cannot overflow.  The minimum of 0x4040 (16448)
     is because the L(large_memset_4x) loops need 64 bytes to cache align
     and enough space for at least one iteration of the 4x PAGE_SIZE
     unrolled loop.  Both values are reflected in the manual.  */
  unsigned long int maximum_non_temporal_threshold = SIZE_MAX >> 4;
  unsigned long int minimum_non_temporal_threshold = 0x4040;
  if (non_temporal_threshold < minimum_non_temporal_threshold)
    non_temporal_threshold = minimum_non_temporal_threshold;
  else if (non_temporal_threshold > maximum_non_temporal_threshold)
    non_temporal_threshold = maximum_non_temporal_threshold;

#if HAVE_TUNABLES
  /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
  unsigned int minimum_rep_movsb_threshold;
#endif
  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
     threshold is 2048 * (VEC_SIZE / 16).  */
  unsigned int rep_movsb_threshold;
  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
      && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
    {
      rep_movsb_threshold = 4096 * (64 / 16);
#if HAVE_TUNABLES
      minimum_rep_movsb_threshold = 64 * 8;
#endif
    }
  else if (CPU_FEATURE_PREFERRED_P (cpu_features,
                                    AVX_Fast_Unaligned_Load))
    {
      rep_movsb_threshold = 4096 * (32 / 16);
#if HAVE_TUNABLES
      minimum_rep_movsb_threshold = 32 * 8;
#endif
    }
  else
    {
      rep_movsb_threshold = 2048 * (16 / 16);
#if HAVE_TUNABLES
      minimum_rep_movsb_threshold = 16 * 8;
#endif
    }
  /* NB: The default REP MOVSB threshold is 2112 on processors with fast
     short REP MOVSB (FSRM).  */
  if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
    rep_movsb_threshold = 2112;

  /* The default threshold to use Enhanced REP STOSB.  */
  unsigned long int rep_stosb_threshold = 2048;

#if HAVE_TUNABLES
  long int tunable_size;

  tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL);
  /* NB: Ignore the default value 0.  */
  if (tunable_size != 0)
    data = tunable_size;

  tunable_size = TUNABLE_GET (x86_shared_cache_size, long int, NULL);
  /* NB: Ignore the default value 0.  */
  if (tunable_size != 0)
    shared = tunable_size;

  tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
  if (tunable_size > minimum_non_temporal_threshold
      && tunable_size <= maximum_non_temporal_threshold)
    non_temporal_threshold = tunable_size;

  tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
  if (tunable_size > minimum_rep_movsb_threshold)
    rep_movsb_threshold = tunable_size;

  /* NB: The default value of the x86_rep_stosb_threshold tunable is the
     same as the default value of __x86_rep_stosb_threshold and the
     minimum value is fixed.  */
  rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
                                     long int, NULL);

  TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
                           minimum_non_temporal_threshold,
                           maximum_non_temporal_threshold);
  TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
                           minimum_rep_movsb_threshold, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
                           SIZE_MAX);
#endif

  unsigned long int rep_movsb_stop_threshold;
  /* The ERMS feature is implemented starting with the AMD Zen3
     architecture, and it performs poorly for data above the L2 cache
     size.  Therefore, add an upper bound threshold to limit the use of
     Enhanced REP MOVSB operations and set it to the L2 cache size.  */
  if (cpu_features->basic.kind == arch_kind_amd)
    rep_movsb_stop_threshold = core;
  /* For other architectures, set the upper bound of ERMS to the computed
     value of the non-temporal threshold.  */
  else
    rep_movsb_stop_threshold = non_temporal_threshold;

  cpu_features->data_cache_size = data;
  cpu_features->shared_cache_size = shared;
  cpu_features->non_temporal_threshold = non_temporal_threshold;
  cpu_features->rep_movsb_threshold = rep_movsb_threshold;
  cpu_features->rep_stosb_threshold = rep_stosb_threshold;
  cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
}
