/* Initialize x86 cache info.
   Copyright (C) 2020-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

static const struct intel_02_cache_info
{
  unsigned char idx;
  unsigned char assoc;
  unsigned char linesize;
  unsigned char rel_name;
  unsigned int size;
} intel_02_known [] =
  {
#define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE)
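/* The rel_name field stores a cache attribute name relative to
   _SC_LEVEL1_ICACHE_SIZE so that it fits into an unsigned char; the
   _SC_* constants for each level are consecutive in the order SIZE,
   ASSOC, LINESIZE, which intel_check_word below relies on.  */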
    { 0x06,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    8192 },
    { 0x08,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   16384 },
    { 0x09,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
    { 0x0a,  2, 32, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
    { 0x0c,  4, 32, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x0d,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x0e,  6, 64, M(_SC_LEVEL1_DCACHE_SIZE),   24576 },
    { 0x21,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x22,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
    { 0x23,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0x25,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0x29,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0x2c,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
    { 0x30,  8, 64, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
    { 0x39,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x3a,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   196608 },
    { 0x3b,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x3c,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x3d,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   393216 },
    { 0x3e,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x3f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x41,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x42,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x43,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x44,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x45,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x46,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0x47,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE),  3145728 },
    { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE),  4194304 },
    { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  6291456 },
    { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
    { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 16777216 },
    { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE),  6291456 },
    { 0x60,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x66,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
    { 0x67,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x68,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
    { 0x78,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x79,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x7a,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x7b,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x7c,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x7d,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x7f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x80,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x82,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x83,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x84,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x85,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x86,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x87,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0xd0,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
    { 0xd1,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0xd2,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xd6,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0xd7,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xd8,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
    { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 18874368 },
    { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 25165824 },
  };
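
/* For illustration: descriptor byte 0x2c in the table above denotes a
   32 KiB, 8-way set associative L1 data cache with 64-byte lines;
   intel_check_word looks descriptor bytes up here via bsearch.  */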

#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))

static int
intel_02_known_compare (const void *p1, const void *p2)
{
  const struct intel_02_cache_info *i1;
  const struct intel_02_cache_info *i2;

  i1 = (const struct intel_02_cache_info *) p1;
  i2 = (const struct intel_02_cache_info *) p2;

  if (i1->idx == i2->idx)
    return 0;

  return i1->idx < i2->idx ? -1 : 1;
}


static long int
__attribute__ ((noinline))
intel_check_word (int name, unsigned int value, bool *has_level_2,
                  bool *no_level_2_or_3,
                  const struct cpu_features *cpu_features)
{
  if ((value & 0x80000000) != 0)
    /* The register value is reserved.  */
    return 0;

  /* Fold the name.  The _SC_ constants are always in the order SIZE,
     ASSOC, LINESIZE.  */
  int folded_rel_name = (M(name) / 3) * 3;
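
  /* For example, a request for _SC_LEVEL2_CACHE_LINESIZE folds to
     M(_SC_LEVEL2_CACHE_SIZE); the offset of 2 computed below then
     selects the line size rather than the size or associativity.  */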
  while (value != 0)
    {
      unsigned int byte = value & 0xff;

      if (byte == 0x40)
        {
          *no_level_2_or_3 = true;

          if (folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
            /* No need to look further.  */
            break;
        }
      else if (byte == 0xff)
        {
          /* CPUID leaf 0x4 contains all the information.  We need to
             iterate over it.  */
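          /* Each subleaf of CPUID leaf 0x4 describes one cache as
               size = ways * partitions * line size * sets
                    = ((EBX[31:22] + 1) * (EBX[21:12] + 1)
                       * (EBX[11:0] + 1) * (ECX + 1)).
             Purely as an illustration (not values from any particular
             CPU), EBX == 0x01c0003f and ECX == 0xff decode to
             8 * 1 * 64 * 256 = 131072 bytes, i.e. a 128 KiB cache.  */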
          unsigned int eax;
          unsigned int ebx;
          unsigned int ecx;
          unsigned int edx;

          unsigned int round = 0;
          while (1)
            {
              __cpuid_count (4, round, eax, ebx, ecx, edx);

              enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
              if (type == null)
                /* That was the end.  */
                break;

              unsigned int level = (eax >> 5) & 0x7;

              if ((level == 1 && type == data
                   && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
                  || (level == 1 && type == inst
                      && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
                  || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
                  || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
                  || (level == 4 && folded_rel_name == M(_SC_LEVEL4_CACHE_SIZE)))
                {
                  unsigned int offset = M(name) - folded_rel_name;

                  if (offset == 0)
                    /* Cache size.  */
                    return (((ebx >> 22) + 1)
                            * (((ebx >> 12) & 0x3ff) + 1)
                            * ((ebx & 0xfff) + 1)
                            * (ecx + 1));
                  if (offset == 1)
                    return (ebx >> 22) + 1;

                  assert (offset == 2);
                  return (ebx & 0xfff) + 1;
                }

              ++round;
            }
          /* There is no other cache information anywhere else.  */
          break;
        }
      else
        {
          if (byte == 0x49 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
            {
              /* Intel reused this value.  For family 15, model 6 it
                 specifies the 3rd level cache.  Otherwise the 2nd
                 level cache.  */
              unsigned int family = cpu_features->basic.family;
              unsigned int model = cpu_features->basic.model;

              if (family == 15 && model == 6)
                {
                  /* The level 3 cache is encoded for this model like
                     the level 2 cache is for other models.  Pretend
                     the caller asked for the level 2 cache.  */
                  name = (_SC_LEVEL2_CACHE_SIZE
                          + (name - _SC_LEVEL3_CACHE_SIZE));
                  folded_rel_name = M(_SC_LEVEL2_CACHE_SIZE);
                }
            }

          struct intel_02_cache_info *found;
          struct intel_02_cache_info search;

          search.idx = byte;
          found = bsearch (&search, intel_02_known, nintel_02_known,
                           sizeof (intel_02_known[0]), intel_02_known_compare);
          if (found != NULL)
            {
              if (found->rel_name == folded_rel_name)
                {
                  unsigned int offset = M(name) - folded_rel_name;

                  if (offset == 0)
                    /* Cache size.  */
                    return found->size;
                  if (offset == 1)
                    return found->assoc;

                  assert (offset == 2);
                  return found->linesize;
                }

              if (found->rel_name == M(_SC_LEVEL2_CACHE_SIZE))
                *has_level_2 = true;
            }
        }

      /* Next byte for the next round.  */
      value >>= 8;
    }

  /* Nothing found.  */
  return 0;
}


static long int __attribute__ ((noinline))
handle_intel (int name, const struct cpu_features *cpu_features)
{
  unsigned int maxidx = cpu_features->basic.max_cpuid;

  /* Return -1 for older CPUs.  */
  if (maxidx < 2)
    return -1;

  /* OK, we can use the CPUID instruction to get all info about the
     caches.  */
  unsigned int cnt = 0;
  unsigned int max = 1;
  long int result = 0;
  bool no_level_2_or_3 = false;
  bool has_level_2 = false;

  while (cnt++ < max)
    {
      unsigned int eax;
      unsigned int ebx;
      unsigned int ecx;
      unsigned int edx;
      __cpuid (2, eax, ebx, ecx, edx);

      /* The low byte of EAX in the first round contains the number of
         rounds we have to make.  At least one, the one we are already
         doing.  */
      if (cnt == 1)
        {
          max = eax & 0xff;
          eax &= 0xffffff00;
        }

      /* Process the individual registers' value.  */
      result = intel_check_word (name, eax, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, ebx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, ecx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;

      result = intel_check_word (name, edx, &has_level_2,
                                 &no_level_2_or_3, cpu_features);
      if (result != 0)
        return result;
    }

  if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
      && no_level_2_or_3)
    return -1;

  return 0;
}


static long int __attribute__ ((noinline))
handle_amd (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  __cpuid (0x80000000, eax, ebx, ecx, edx);

  /* No level 4 cache (yet).  */
  if (name > _SC_LEVEL3_CACHE_LINESIZE)
    return 0;

  unsigned int fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
  if (eax < fn)
    return 0;

  __cpuid (fn, eax, ebx, ecx, edx);

  if (name < _SC_LEVEL1_DCACHE_SIZE)
    {
      name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE;
      ecx = edx;
    }

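  /* Leaf 0x80000005 reports the L1 caches (EDX: instruction, ECX: data;
     bits 31:24 size in KiB, 23:16 associativity, 7:0 line size in
     bytes); leaf 0x80000006 reports L2 in ECX (bits 31:16 size in KiB)
     and L3 in EDX (bits 31:18 size in 512 KiB units), each with a 4-bit
     associativity encoding in bits 15:12 decoded below.  This summary
     only restates what the bit manipulation below extracts.  */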
  switch (name)
    {
    case _SC_LEVEL1_DCACHE_SIZE:
      return (ecx >> 14) & 0x3fc00;

    case _SC_LEVEL1_DCACHE_ASSOC:
      ecx >>= 16;
      if ((ecx & 0xff) == 0xff)
        /* Fully associative.  */
        return (ecx << 2) & 0x3fc00;
      return ecx & 0xff;

    case _SC_LEVEL1_DCACHE_LINESIZE:
      return ecx & 0xff;

    case _SC_LEVEL2_CACHE_SIZE:
      return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;

    case _SC_LEVEL2_CACHE_ASSOC:
      switch ((ecx >> 12) & 0xf)
        {
        case 0:
        case 1:
        case 2:
        case 4:
          return (ecx >> 12) & 0xf;
        case 6:
          return 8;
        case 8:
          return 16;
        case 10:
          return 32;
        case 11:
          return 48;
        case 12:
          return 64;
        case 13:
          return 96;
        case 14:
          return 128;
        case 15:
          return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff);
        default:
          return 0;
        }
      /* NOTREACHED */

    case _SC_LEVEL2_CACHE_LINESIZE:
      return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;

    case _SC_LEVEL3_CACHE_SIZE:
      return (edx & 0xf000) == 0 ? 0 : (edx & 0x3ffc0000) << 1;

    case _SC_LEVEL3_CACHE_ASSOC:
      switch ((edx >> 12) & 0xf)
        {
        case 0:
        case 1:
        case 2:
        case 4:
          return (edx >> 12) & 0xf;
        case 6:
          return 8;
        case 8:
          return 16;
        case 10:
          return 32;
        case 11:
          return 48;
        case 12:
          return 64;
        case 13:
          return 96;
        case 14:
          return 128;
        case 15:
          return ((edx & 0x3ffc0000) << 1) / (edx & 0xff);
        default:
          return 0;
        }
      /* NOTREACHED */

    case _SC_LEVEL3_CACHE_LINESIZE:
      return (edx & 0xf000) == 0 ? 0 : edx & 0xff;

    default:
      assert (! "cannot happen");
    }
  return -1;
}


static long int __attribute__ ((noinline))
handle_zhaoxin (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  int folded_rel_name = (M(name) / 3) * 3;

  unsigned int round = 0;
  while (1)
    {
      __cpuid_count (4, round, eax, ebx, ecx, edx);

      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
      if (type == null)
        break;

      unsigned int level = (eax >> 5) & 0x7;

      if ((level == 1 && type == data
           && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
          || (level == 1 && type == inst
              && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
          || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
          || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)))
        {
          unsigned int offset = M(name) - folded_rel_name;

          if (offset == 0)
            /* Cache size.  */
            return (((ebx >> 22) + 1)
                    * (((ebx >> 12) & 0x3ff) + 1)
                    * ((ebx & 0xfff) + 1)
                    * (ecx + 1));
          if (offset == 1)
            return (ebx >> 22) + 1;

          assert (offset == 2);
          return (ebx & 0xfff) + 1;
        }

      ++round;
    }

  /* Nothing found.  */
  return 0;
}

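/* Determine how many logical processors share the highest cache level
   (L3, or L2 if no L3 was detected) and scale *SHARED_PTR, the size of
   that cache, down to one thread's share of it, accounting for
   non-inclusive L2/L3 caches.  CORE is the detected L2 cache size;
   *THREADS_PTR returns the number of threads used for the scaling.  */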
static void
get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
                       long int core)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  /* Number of logical processors sharing L2 cache.  */
  int threads_l2;

  /* Number of logical processors sharing L3 cache.  */
  int threads_l3;

  const struct cpu_features *cpu_features = __get_cpu_features ();
  int max_cpuid = cpu_features->basic.max_cpuid;
  unsigned int family = cpu_features->basic.family;
  unsigned int model = cpu_features->basic.model;
  long int shared = *shared_ptr;
  unsigned int threads = *threads_ptr;
  bool inclusive_cache = true;
  bool support_count_mask = true;

  /* Try L3 first.  */
  unsigned int level = 3;

  if (cpu_features->basic.kind == arch_kind_zhaoxin && family == 6)
    support_count_mask = false;

  if (shared <= 0)
    {
      /* Try L2 otherwise.  */
      level = 2;
      shared = core;
      threads_l2 = 0;
      threads_l3 = -1;
    }
  else
    {
      threads_l2 = 0;
      threads_l3 = 0;
    }

  /* A value of 0 for the HTT bit indicates there is only a single
     logical processor.  */
  if (HAS_CPU_FEATURE (HTT))
    {
      /* Figure out the number of logical threads that share the
         highest cache level.  */
      if (max_cpuid >= 4)
        {
          int i = 0;

          /* Query until cache level 2 and 3 are enumerated.  */
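          /* CHECK is a bit mask of levels still to find: bit 0 for L2,
             bit 1 for L3 (only when an L3 cache was detected above,
             i.e. THREADS_L3 is 0 rather than -1).  */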
          int check = 0x1 | (threads_l3 == 0) << 1;
          do
            {
              __cpuid_count (4, i++, eax, ebx, ecx, edx);

              /* There seems to be a bug in at least some Pentium Ds
                 which sometimes fail to iterate all cache parameters.
                 Do not loop indefinitely here, stop in this case and
                 assume there is no such information.  */
              if (cpu_features->basic.kind == arch_kind_intel
                  && (eax & 0x1f) == 0)
                goto intel_bug_no_cache_info;

              switch ((eax >> 5) & 0x7)
                {
                default:
                  break;
                case 2:
                  if ((check & 0x1))
                    {
                      /* Get maximum number of logical processors
                         sharing L2 cache.  */
                      threads_l2 = (eax >> 14) & 0x3ff;
                      check &= ~0x1;
                    }
                  break;
                case 3:
                  if ((check & (0x1 << 1)))
                    {
                      /* Get maximum number of logical processors
                         sharing L3 cache.  */
                      threads_l3 = (eax >> 14) & 0x3ff;

                      /* Check if L2 and L3 caches are inclusive.  */
                      inclusive_cache = (edx & 0x2) != 0;
                      check &= ~(0x1 << 1);
                    }
                  break;
                }
            }
          while (check);

          /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
             numbers of addressable IDs for logical processors sharing
             the cache, instead of the maximum number of threads
             sharing the cache.  */
          if (max_cpuid >= 11 && support_count_mask)
            {
              /* Find the number of logical processors shipped in
                 one core and apply count mask.  */
              i = 0;

              /* Count SMT only if there is L3 cache.  Always count
                 core if there is no L3 cache.  */
              int count = ((threads_l2 > 0 && level == 3)
                           | ((threads_l3 > 0
                               || (threads_l2 > 0 && level == 2)) << 1));

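              /* The BSR-based code below turns each maximum addressable
                 ID into a mask of the ID bits and applies it to the
                 logical processor count from leaf 11.  As a purely
                 hypothetical example, THREADS_L2 == 3 gives
                 COUNT_MASK == 0x3, so SHIPPED == 2 SMT threads yields
                 THREADS_L2 == (2 - 1) & 0x3 == 1, incremented to 2
                 further down.  */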
              while (count)
                {
                  __cpuid_count (11, i++, eax, ebx, ecx, edx);

                  int shipped = ebx & 0xff;
                  int type = ecx & 0xff00;
                  if (shipped == 0 || type == 0)
                    break;
                  else if (type == 0x100)
                    {
                      /* Count SMT.  */
                      if ((count & 0x1))
                        {
                          int count_mask;

                          /* Compute count mask.  */
                          asm ("bsr %1, %0"
                               : "=r" (count_mask) : "g" (threads_l2));
                          count_mask = ~(-1 << (count_mask + 1));
                          threads_l2 = (shipped - 1) & count_mask;
                          count &= ~0x1;
                        }
                    }
                  else if (type == 0x200)
                    {
                      /* Count core.  */
                      if ((count & (0x1 << 1)))
                        {
                          int count_mask;
                          int threads_core
                            = (level == 2 ? threads_l2 : threads_l3);

                          /* Compute count mask.  */
                          asm ("bsr %1, %0"
                               : "=r" (count_mask) : "g" (threads_core));
                          count_mask = ~(-1 << (count_mask + 1));
                          threads_core = (shipped - 1) & count_mask;
                          if (level == 2)
                            threads_l2 = threads_core;
                          else
                            threads_l3 = threads_core;
                          count &= ~(0x1 << 1);
                        }
                    }
                }
            }
          if (threads_l2 > 0)
            threads_l2 += 1;
          if (threads_l3 > 0)
            threads_l3 += 1;
          if (level == 2)
            {
              if (threads_l2)
                {
                  threads = threads_l2;
                  if (cpu_features->basic.kind == arch_kind_intel
                      && threads > 2
                      && family == 6)
                    switch (model)
                      {
                      case 0x37:
                      case 0x4a:
                      case 0x4d:
                      case 0x5a:
                      case 0x5d:
                        /* Silvermont has L2 cache shared by 2 cores.  */
                        threads = 2;
                        break;
                      default:
                        break;
                      }
                }
            }
          else if (threads_l3)
            threads = threads_l3;
        }
      else
        {
        intel_bug_no_cache_info:
          /* Assume that all logical threads share the highest cache
             level.  */
          threads
            = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16)
               & 0xff);
        }

      /* Cap usage of highest cache level to the number of supported
         threads.  */
      if (shared > 0 && threads > 0)
        shared /= threads;
    }

  /* Account for non-inclusive L2 and L3 caches.  */
  if (!inclusive_cache)
    {
      if (threads_l2 > 0)
        core /= threads_l2;
      shared += core;
    }

  *shared_ptr = shared;
  *threads_ptr = threads;
}

static void
dl_init_cacheinfo (struct cpu_features *cpu_features)
{
  /* Find out what brand of processor.  */
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  int max_cpuid_ex;
  long int data = -1;
  long int shared = -1;
  long int core = -1;
  unsigned int threads = 0;
  unsigned long int level1_icache_size = -1;
  unsigned long int level1_icache_linesize = -1;
  unsigned long int level1_dcache_size = -1;
  unsigned long int level1_dcache_assoc = -1;
  unsigned long int level1_dcache_linesize = -1;
  unsigned long int level2_cache_size = -1;
  unsigned long int level2_cache_assoc = -1;
  unsigned long int level2_cache_linesize = -1;
  unsigned long int level3_cache_size = -1;
  unsigned long int level3_cache_assoc = -1;
  unsigned long int level3_cache_linesize = -1;
  unsigned long int level4_cache_size = -1;

  if (cpu_features->basic.kind == arch_kind_intel)
    {
      data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
      core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
      shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);

      level1_icache_size
        = handle_intel (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
      level1_icache_linesize
        = handle_intel (_SC_LEVEL1_ICACHE_LINESIZE, cpu_features);
      level1_dcache_size = data;
      level1_dcache_assoc
        = handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
      level1_dcache_linesize
        = handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
      level2_cache_size = core;
      level2_cache_assoc
        = handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
      level2_cache_linesize
        = handle_intel (_SC_LEVEL2_CACHE_LINESIZE, cpu_features);
      level3_cache_size = shared;
      level3_cache_assoc
        = handle_intel (_SC_LEVEL3_CACHE_ASSOC, cpu_features);
      level3_cache_linesize
        = handle_intel (_SC_LEVEL3_CACHE_LINESIZE, cpu_features);
      level4_cache_size
        = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);

      get_common_cache_info (&shared, &threads, core);
    }
  else if (cpu_features->basic.kind == arch_kind_zhaoxin)
    {
      data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
      core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
      shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);

      level1_icache_size = handle_zhaoxin (_SC_LEVEL1_ICACHE_SIZE);
      level1_icache_linesize = handle_zhaoxin (_SC_LEVEL1_ICACHE_LINESIZE);
      level1_dcache_size = data;
      level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
      level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
      level2_cache_size = core;
      level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
      level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
      level3_cache_size = shared;
      level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
      level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);

      get_common_cache_info (&shared, &threads, core);
    }
  else if (cpu_features->basic.kind == arch_kind_amd)
    {
      data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
      core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
      shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);

      level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
      level1_icache_linesize = handle_amd (_SC_LEVEL1_ICACHE_LINESIZE);
      level1_dcache_size = data;
      level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
      level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
      level2_cache_size = core;
      level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
      level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
      level3_cache_size = shared;
      level3_cache_assoc = handle_amd (_SC_LEVEL3_CACHE_ASSOC);
      level3_cache_linesize = handle_amd (_SC_LEVEL3_CACHE_LINESIZE);

      /* Get maximum extended function.  */
      __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);

      if (shared <= 0)
        /* No shared L3 cache.  All we have is the L2 cache.  */
        shared = core;
      else
        {
          /* Figure out the number of logical threads that share L3.  */
          if (max_cpuid_ex >= 0x80000008)
            {
              /* Get width of APIC ID.  */
              __cpuid (0x80000008, max_cpuid_ex, ebx, ecx, edx);
              threads = 1 << ((ecx >> 12) & 0x0f);
            }

          if (threads == 0 || cpu_features->basic.family >= 0x17)
            {
              /* If APIC ID width is not available, use logical
                 processor count.  */
              __cpuid (0x00000001, max_cpuid_ex, ebx, ecx, edx);

              if ((edx & (1 << 28)) != 0)
                threads = (ebx >> 16) & 0xff;
            }

          /* Cap usage of highest cache level to the number of
             supported threads.  */
          if (threads > 0)
            shared /= threads;

          /* Get shared cache per ccx for Zen architectures.  */
          if (cpu_features->basic.family >= 0x17)
            {
              unsigned int eax;

              /* Get the number of threads that share the L3 cache in a
                 CCX.  */
              __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx);

              unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1;
              shared *= threads_per_ccx;
            }
          else
            {
              /* Account for exclusive L2 and L3 caches.  */
              shared += core;
            }
        }
    }

  cpu_features->level1_icache_size = level1_icache_size;
  cpu_features->level1_icache_linesize = level1_icache_linesize;
  cpu_features->level1_dcache_size = level1_dcache_size;
  cpu_features->level1_dcache_assoc = level1_dcache_assoc;
  cpu_features->level1_dcache_linesize = level1_dcache_linesize;
  cpu_features->level2_cache_size = level2_cache_size;
  cpu_features->level2_cache_assoc = level2_cache_assoc;
  cpu_features->level2_cache_linesize = level2_cache_linesize;
  cpu_features->level3_cache_size = level3_cache_size;
  cpu_features->level3_cache_assoc = level3_cache_assoc;
  cpu_features->level3_cache_linesize = level3_cache_linesize;
  cpu_features->level4_cache_size = level4_cache_size;

  /* The default setting for the non_temporal threshold is 3/4 of one
     thread's share of the chip's cache.  For most Intel and AMD processors
     with an initial release date between 2017 and 2020, a thread's typical
     share of the cache is from 500 KBytes to 2 MBytes.  Using the 3/4
     threshold leaves 125 KBytes to 500 KBytes of the thread's data
     in cache after a maximum temporal copy, which will maintain
     in cache a reasonable portion of the thread's stack and other
     active data.  If the threshold is set higher than one thread's
     share of the cache, it has a substantial risk of negatively
     impacting the performance of other threads running on the chip.  */
  unsigned long int non_temporal_threshold = shared * 3 / 4;
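  /* For example, a 2 MiB per-thread share computed above yields a
     default non-temporal threshold of 1.5 MiB.  */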

#if HAVE_TUNABLES
  /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
  unsigned int minimum_rep_movsb_threshold;
#endif
  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
     threshold is 2048 * (VEC_SIZE / 16).  */
  unsigned int rep_movsb_threshold;
  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
      && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
    {
      rep_movsb_threshold = 4096 * (64 / 16);
#if HAVE_TUNABLES
      minimum_rep_movsb_threshold = 64 * 8;
#endif
    }
  else if (CPU_FEATURE_PREFERRED_P (cpu_features,
                                    AVX_Fast_Unaligned_Load))
    {
      rep_movsb_threshold = 4096 * (32 / 16);
#if HAVE_TUNABLES
      minimum_rep_movsb_threshold = 32 * 8;
#endif
    }
  else
    {
      rep_movsb_threshold = 2048 * (16 / 16);
#if HAVE_TUNABLES
      minimum_rep_movsb_threshold = 16 * 8;
#endif
    }
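  /* With the defaults above this comes to 16384 bytes when AVX512 is
     usable, 8192 bytes with AVX fast unaligned loads, and 2048 bytes
     otherwise; the matching minimums are 512, 256 and 128 bytes.  */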
  /* NB: The default REP MOVSB threshold is 2112 on processors with fast
     short REP MOVSB (FSRM).  */
  if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
    rep_movsb_threshold = 2112;

  /* The default threshold to use Enhanced REP STOSB.  */
  unsigned long int rep_stosb_threshold = 2048;

#if HAVE_TUNABLES
  long int tunable_size;

  tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL);
  /* NB: Ignore the default value 0.  */
  if (tunable_size != 0)
    data = tunable_size;

  tunable_size = TUNABLE_GET (x86_shared_cache_size, long int, NULL);
  /* NB: Ignore the default value 0.  */
  if (tunable_size != 0)
    shared = tunable_size;

  tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
  /* NB: Ignore the default value 0.  */
  if (tunable_size != 0)
    non_temporal_threshold = tunable_size;

  tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
  if (tunable_size > minimum_rep_movsb_threshold)
    rep_movsb_threshold = tunable_size;

  /* NB: The default value of the x86_rep_stosb_threshold tunable is the
     same as the default value of __x86_rep_stosb_threshold and the
     minimum value is fixed.  */
  rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
                                     long int, NULL);

  TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
  /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of
     'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
     if that operation cannot overflow.  Minimum of 0x4040 (16448) because the
     L(large_memset_4x) loops need 64 bytes to cache align and enough space
     for at least one iteration of a 4x PAGE_SIZE unrolled loop.  Both values
     are reflected in the manual.  */
  TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
                           0x4040, SIZE_MAX >> 4);
  TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
                           minimum_rep_movsb_threshold, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
                           SIZE_MAX);
#endif

  unsigned long int rep_movsb_stop_threshold;
  /* The ERMS feature is implemented starting with the AMD Zen3
     architecture and it performs poorly for data above the L2 cache
     size.  Hence add an upper bound threshold to limit the use of
     Enhanced REP MOVSB operations and set it to the L2 cache size.  */
  if (cpu_features->basic.kind == arch_kind_amd)
    rep_movsb_stop_threshold = core;
  /* For architectures other than AMD, set the upper bound of ERMS to
     the computed non-temporal threshold.  */
  else
    rep_movsb_stop_threshold = non_temporal_threshold;

  cpu_features->data_cache_size = data;
  cpu_features->shared_cache_size = shared;
  cpu_features->non_temporal_threshold = non_temporal_threshold;
  cpu_features->rep_movsb_threshold = rep_movsb_threshold;
  cpu_features->rep_stosb_threshold = rep_stosb_threshold;
  cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
}