/* Initialize x86 cache info.
   Copyright (C) 2020-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

static const struct intel_02_cache_info
{
  unsigned char idx;
  unsigned char assoc;
  unsigned char linesize;
  unsigned char rel_name;
  unsigned int size;
} intel_02_known [] =
  {
#define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE)
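    /* The rel_name member stores M(_SC_*), i.e. the _SC_* constant's
       offset from _SC_LEVEL1_ICACHE_SIZE, so that it fits into an
       unsigned char.  */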
    { 0x06,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    8192 },
    { 0x08,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   16384 },
    { 0x09,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
    { 0x0a,  2, 32, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
    { 0x0c,  4, 32, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x0d,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x0e,  6, 64, M(_SC_LEVEL1_DCACHE_SIZE),   24576 },
    { 0x21,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x22,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
    { 0x23,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0x25,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0x29,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0x2c,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
    { 0x30,  8, 64, M(_SC_LEVEL1_ICACHE_SIZE),   32768 },
    { 0x39,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x3a,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   196608 },
    { 0x3b,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x3c,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x3d,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),   393216 },
    { 0x3e,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x3f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x41,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x42,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x43,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x44,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x45,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x46,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0x47,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE),  3145728 },
    { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE),  4194304 },
    { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  6291456 },
    { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
    { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE), 16777216 },
    { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE),  6291456 },
    { 0x60,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x66,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    8192 },
    { 0x67,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   16384 },
    { 0x68,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),   32768 },
    { 0x78,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x79,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   131072 },
    { 0x7a,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x7b,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x7c,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x7d,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x7f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x80,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x82,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   262144 },
    { 0x83,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x84,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0x85,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),  2097152 },
    { 0x86,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),   524288 },
    { 0x87,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),  1048576 },
    { 0xd0,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   524288 },
    { 0xd1,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0xd2,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xd6,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  1048576 },
    { 0xd7,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xd8,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  2097152 },
    { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  4194304 },
    { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  8388608 },
    { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 12582912 },
    { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 18874368 },
    { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE), 25165824 },
  };

#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))
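/* The table above must stay sorted by IDX: intel_check_word looks
   entries up with bsearch using intel_02_known_compare.  */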

static int
intel_02_known_compare (const void *p1, const void *p2)
{
  const struct intel_02_cache_info *i1;
  const struct intel_02_cache_info *i2;

  i1 = (const struct intel_02_cache_info *) p1;
  i2 = (const struct intel_02_cache_info *) p2;

  if (i1->idx == i2->idx)
    return 0;

  return i1->idx < i2->idx ? -1 : 1;
}


static long int
__attribute__ ((noinline))
intel_check_word (int name, unsigned int value, bool *has_level_2,
		  bool *no_level_2_or_3,
		  const struct cpu_features *cpu_features)
{
  if ((value & 0x80000000) != 0)
    /* The register value is reserved.  */
    return 0;

  /* Fold the name.  The _SC_ constants are always in the order SIZE,
     ASSOC, LINESIZE.  */
  int folded_rel_name = (M(name) / 3) * 3;

  while (value != 0)
    {
      unsigned int byte = value & 0xff;

      if (byte == 0x40)
	{
	  *no_level_2_or_3 = true;

	  if (folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
	    /* No need to look further.  */
	    break;
	}
      else if (byte == 0xff)
	{
	  /* CPUID leaf 0x4 contains all the information.  We need to
	     iterate over it.  */
	  unsigned int eax;
	  unsigned int ebx;
	  unsigned int ecx;
	  unsigned int edx;

	  unsigned int round = 0;
	  while (1)
	    {
	      __cpuid_count (4, round, eax, ebx, ecx, edx);

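	      /* Each leaf 4 subleaf reports EAX[4:0] cache type,
		 EAX[7:5] cache level, EBX[31:22] ways-1, EBX[21:12]
		 partitions-1, EBX[11:0] line size-1 and ECX sets-1;
		 the size computed below is the product of the four
		 decoded fields.  */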
	      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
	      if (type == null)
		/* That was the end.  */
		break;

	      unsigned int level = (eax >> 5) & 0x7;

	      if ((level == 1 && type == data
		   && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
		  || (level == 1 && type == inst
		      && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
		  || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
		  || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
		  || (level == 4 && folded_rel_name == M(_SC_LEVEL4_CACHE_SIZE)))
		{
		  unsigned int offset = M(name) - folded_rel_name;

		  if (offset == 0)
		    /* Cache size.  */
		    return (((ebx >> 22) + 1)
			    * (((ebx >> 12) & 0x3ff) + 1)
			    * ((ebx & 0xfff) + 1)
			    * (ecx + 1));
		  if (offset == 1)
		    return (ebx >> 22) + 1;

		  assert (offset == 2);
		  return (ebx & 0xfff) + 1;
		}

	      ++round;
	    }
	  /* There is no other cache information anywhere else.  */
	  break;
	}
      else
	{
	  if (byte == 0x49 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
	    {
	      /* Intel reused this value.  For family 15, model 6 it
		 specifies the 3rd level cache.  Otherwise the 2nd
		 level cache.  */
	      unsigned int family = cpu_features->basic.family;
	      unsigned int model = cpu_features->basic.model;

	      if (family == 15 && model == 6)
		{
		  /* The level 3 cache is encoded for this model like
		     the level 2 cache is for other models.  Pretend
		     the caller asked for the level 2 cache.  */
		  name = (_SC_LEVEL2_CACHE_SIZE
			  + (name - _SC_LEVEL3_CACHE_SIZE));
		  folded_rel_name = M(_SC_LEVEL2_CACHE_SIZE);
		}
	    }

	  struct intel_02_cache_info *found;
	  struct intel_02_cache_info search;

	  search.idx = byte;
	  found = bsearch (&search, intel_02_known, nintel_02_known,
			   sizeof (intel_02_known[0]), intel_02_known_compare);
	  if (found != NULL)
	    {
	      if (found->rel_name == folded_rel_name)
		{
		  unsigned int offset = M(name) - folded_rel_name;

		  if (offset == 0)
		    /* Cache size.  */
		    return found->size;
		  if (offset == 1)
		    return found->assoc;

		  assert (offset == 2);
		  return found->linesize;
		}

	      if (found->rel_name == M(_SC_LEVEL2_CACHE_SIZE))
		*has_level_2 = true;
	    }
	}

      /* Next byte for the next round.  */
      value >>= 8;
    }

  /* Nothing found.  */
  return 0;
}


static long int __attribute__ ((noinline))
handle_intel (int name, const struct cpu_features *cpu_features)
{
  unsigned int maxidx = cpu_features->basic.max_cpuid;

  /* Return -1 for older CPUs.  */
  if (maxidx < 2)
    return -1;

  /* OK, we can use the CPUID instruction to get all info about the
     caches.  */
  unsigned int cnt = 0;
  unsigned int max = 1;
  long int result = 0;
  bool no_level_2_or_3 = false;
  bool has_level_2 = false;

  while (cnt++ < max)
    {
      unsigned int eax;
      unsigned int ebx;
      unsigned int ecx;
      unsigned int edx;
      __cpuid (2, eax, ebx, ecx, edx);

      /* The low byte of EAX in the first round contains the number of
	 rounds we have to make.  At least one, the one we are already
	 doing.  */
      if (cnt == 1)
	{
	  max = eax & 0xff;
	  eax &= 0xffffff00;
	}

      /* Process the individual registers' value.  */
      result = intel_check_word (name, eax, &has_level_2,
				 &no_level_2_or_3, cpu_features);
      if (result != 0)
	return result;

      result = intel_check_word (name, ebx, &has_level_2,
				 &no_level_2_or_3, cpu_features);
      if (result != 0)
	return result;

      result = intel_check_word (name, ecx, &has_level_2,
				 &no_level_2_or_3, cpu_features);
      if (result != 0)
	return result;

      result = intel_check_word (name, edx, &has_level_2,
				 &no_level_2_or_3, cpu_features);
      if (result != 0)
	return result;
    }

  if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
      && no_level_2_or_3)
    return -1;

  return 0;
}


static long int __attribute__ ((noinline))
handle_amd (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  __cpuid (0x80000000, eax, ebx, ecx, edx);

  /* No level 4 cache (yet).  */
  if (name > _SC_LEVEL3_CACHE_LINESIZE)
    return 0;

  unsigned int fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
  if (eax < fn)
    return 0;

  __cpuid (fn, eax, ebx, ecx, edx);

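  /* Leaf 0x80000005 reports the L1 data cache in ECX and the L1
     instruction cache in EDX; leaf 0x80000006 reports L2 in ECX and
     L3 in EDX.  For an instruction cache request, remap NAME to the
     corresponding data cache constant and decode EDX instead.  */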
  if (name < _SC_LEVEL1_DCACHE_SIZE)
    {
      name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE;
      ecx = edx;
    }

  switch (name)
    {
    case _SC_LEVEL1_DCACHE_SIZE:
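      /* ECX[31:24] is the L1 data cache size in KB; (ecx >> 14)
	 & 0x3fc00 is the same field shifted into bytes.  */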
      return (ecx >> 14) & 0x3fc00;

    case _SC_LEVEL1_DCACHE_ASSOC:
      ecx >>= 16;
      if ((ecx & 0xff) == 0xff)
	/* Fully associative.  */
	return (ecx << 2) & 0x3fc00;
      return ecx & 0xff;

    case _SC_LEVEL1_DCACHE_LINESIZE:
      return ecx & 0xff;

    case _SC_LEVEL2_CACHE_SIZE:
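      /* ECX[15:12] is the L2 associativity code (0 means the L2 cache
	 is disabled); ECX[31:16] is the L2 size in KB, shifted here
	 into bytes.  */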
      return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;

    case _SC_LEVEL2_CACHE_ASSOC:
      switch ((ecx >> 12) & 0xf)
	{
	case 0:
	case 1:
	case 2:
	case 4:
	  return (ecx >> 12) & 0xf;
	case 6:
	  return 8;
	case 8:
	  return 16;
	case 10:
	  return 32;
	case 11:
	  return 48;
	case 12:
	  return 64;
	case 13:
	  return 96;
	case 14:
	  return 128;
	case 15:
	  return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff);
	default:
	  return 0;
	}
      /* NOTREACHED */

    case _SC_LEVEL2_CACHE_LINESIZE:
      return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;

    case _SC_LEVEL3_CACHE_SIZE:
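      /* EDX[15:12] is the L3 associativity code (0 means no L3 cache);
	 EDX[31:18] is the L3 size in 512 KB units, shifted here into
	 bytes.  */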
      return (edx & 0xf000) == 0 ? 0 : (edx & 0x3ffc0000) << 1;

    case _SC_LEVEL3_CACHE_ASSOC:
      switch ((edx >> 12) & 0xf)
	{
	case 0:
	case 1:
	case 2:
	case 4:
	  return (edx >> 12) & 0xf;
	case 6:
	  return 8;
	case 8:
	  return 16;
	case 10:
	  return 32;
	case 11:
	  return 48;
	case 12:
	  return 64;
	case 13:
	  return 96;
	case 14:
	  return 128;
	case 15:
	  return ((edx & 0x3ffc0000) << 1) / (edx & 0xff);
	default:
	  return 0;
	}
      /* NOTREACHED */

    case _SC_LEVEL3_CACHE_LINESIZE:
      return (edx & 0xf000) == 0 ? 0 : edx & 0xff;

    default:
      assert (! "cannot happen");
    }
  return -1;
}


static long int __attribute__ ((noinline))
handle_zhaoxin (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  int folded_rel_name = (M(name) / 3) * 3;

  unsigned int round = 0;
  while (1)
    {
      __cpuid_count (4, round, eax, ebx, ecx, edx);

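      /* Decode the leaf 4 fields exactly as in the CPUID leaf 4 path
         of intel_check_word above.  */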
      enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
      if (type == null)
        break;

      unsigned int level = (eax >> 5) & 0x7;

      if ((level == 1 && type == data
        && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
        || (level == 1 && type == inst
            && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
        || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
        || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE)))
        {
          unsigned int offset = M(name) - folded_rel_name;

          if (offset == 0)
            /* Cache size.  */
            return (((ebx >> 22) + 1)
                * (((ebx >> 12) & 0x3ff) + 1)
                * ((ebx & 0xfff) + 1)
                * (ecx + 1));
          if (offset == 1)
            return (ebx >> 22) + 1;

          assert (offset == 2);
          return (ebx & 0xfff) + 1;
        }

      ++round;
    }

  /* Nothing found.  */
  return 0;
}

static void
get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
                long int core)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;

  /* Number of logical processors sharing L2 cache.  */
  int threads_l2;

  /* Number of logical processors sharing L3 cache.  */
  int threads_l3;

  const struct cpu_features *cpu_features = __get_cpu_features ();
  int max_cpuid = cpu_features->basic.max_cpuid;
  unsigned int family = cpu_features->basic.family;
  unsigned int model = cpu_features->basic.model;
  long int shared = *shared_ptr;
  unsigned int threads = *threads_ptr;
  bool inclusive_cache = true;
  bool support_count_mask = true;

  /* Try L3 first.  */
  unsigned int level = 3;

  if (cpu_features->basic.kind == arch_kind_zhaoxin && family == 6)
    support_count_mask = false;

  if (shared <= 0)
    {
      /* Try L2 otherwise.  */
      level  = 2;
      shared = core;
      threads_l2 = 0;
      threads_l3 = -1;
    }
  else
    {
      threads_l2 = 0;
      threads_l3 = 0;
    }

  /* A value of 0 for the HTT bit indicates there is only a single
     logical processor.  */
  if (HAS_CPU_FEATURE (HTT))
    {
      /* Figure out the number of logical threads that share the
         highest cache level.  */
      if (max_cpuid >= 4)
        {
          int i = 0;

          /* Query until cache level 2 and 3 are enumerated.  */
          int check = 0x1 | (threads_l3 == 0) << 1;
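          /* Bit 0 of CHECK is cleared once the L2 sharing count has
             been read, bit 1 once the L3 sharing count has been read;
             bit 1 is only set when an L3 cache was found above.  */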
          do
            {
              __cpuid_count (4, i++, eax, ebx, ecx, edx);

              /* There seems to be a bug in at least some Pentium Ds
                 which sometimes fail to iterate all cache parameters.
                 Do not loop indefinitely here, stop in this case and
                 assume there is no such information.  */
              if (cpu_features->basic.kind == arch_kind_intel
                  && (eax & 0x1f) == 0 )
                goto intel_bug_no_cache_info;

              switch ((eax >> 5) & 0x7)
                {
                  default:
                    break;
                  case 2:
                    if ((check & 0x1))
                      {
                        /* Get maximum number of logical processors
                           sharing L2 cache.  */
                        threads_l2 = (eax >> 14) & 0x3ff;
                        check &= ~0x1;
                      }
                    break;
                  case 3:
                    if ((check & (0x1 << 1)))
                      {
                        /* Get maximum number of logical processors
                           sharing L3 cache.  */
                        threads_l3 = (eax >> 14) & 0x3ff;

                        /* Check if L2 and L3 caches are inclusive.  */
                        inclusive_cache = (edx & 0x2) != 0;
                        check &= ~(0x1 << 1);
                      }
                    break;
                }
            }
          while (check);

          /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
             numbers of addressable IDs for logical processors sharing
             the cache, instead of the maximum number of threads
             sharing the cache.  */
          if (max_cpuid >= 11 && support_count_mask)
            {
              /* Find the number of logical processors shipped in
                 one core and apply count mask.  */
              i = 0;

              /* Count SMT only if there is L3 cache.  Always count
                 core if there is no L3 cache.  */
              int count = ((threads_l2 > 0 && level == 3)
                           | ((threads_l3 > 0
                               || (threads_l2 > 0 && level == 2)) << 1));

              while (count)
                {
                  __cpuid_count (11, i++, eax, ebx, ecx, edx);

                  int shipped = ebx & 0xff;
                  int type = ecx & 0xff00;
                  if (shipped == 0 || type == 0)
                    break;
                  else if (type == 0x100)
                    {
                      /* Count SMT.  */
                      if ((count & 0x1))
                        {
                          int count_mask;

                          /* Compute count mask.  */
                          asm ("bsr %1, %0"
                               : "=r" (count_mask) : "g" (threads_l2));
                          count_mask = ~(-1 << (count_mask + 1));
                          threads_l2 = (shipped - 1) & count_mask;
                          count &= ~0x1;
                        }
                    }
                  else if (type == 0x200)
                    {
                      /* Count core.  */
                      if ((count & (0x1 << 1)))
                        {
                          int count_mask;
                          int threads_core
                            = (level == 2 ? threads_l2 : threads_l3);

                          /* Compute count mask.  */
                          asm ("bsr %1, %0"
                               : "=r" (count_mask) : "g" (threads_core));
                          count_mask = ~(-1 << (count_mask + 1));
                          threads_core = (shipped - 1) & count_mask;
                          if (level == 2)
                            threads_l2 = threads_core;
                          else
                            threads_l3 = threads_core;
                          count &= ~(0x1 << 1);
                        }
                    }
                }
            }
          if (threads_l2 > 0)
            threads_l2 += 1;
          if (threads_l3 > 0)
            threads_l3 += 1;
          if (level == 2)
            {
              if (threads_l2)
                {
                  threads = threads_l2;
                  if (cpu_features->basic.kind == arch_kind_intel
                      && threads > 2
                      && family == 6)
                    switch (model)
                      {
                        case 0x37:
                        case 0x4a:
                        case 0x4d:
                        case 0x5a:
                        case 0x5d:
                          /* Silvermont has L2 cache shared by 2 cores.  */
                          threads = 2;
                          break;
                        default:
                          break;
                      }
                }
            }
          else if (threads_l3)
            threads = threads_l3;
        }
      else
        {
intel_bug_no_cache_info:
          /* Assume that all logical threads share the highest cache
             level.  */
          threads
            = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16)
	       & 0xff);
        }

        /* Cap usage of highest cache level to the number of supported
           threads.  */
        if (shared > 0 && threads > 0)
          shared /= threads;
    }

  /* Account for non-inclusive L2 and L3 caches.  */
  if (!inclusive_cache)
    {
      if (threads_l2 > 0)
        core /= threads_l2;
      shared += core;
    }

  *shared_ptr = shared;
  *threads_ptr = threads;
}

static void
dl_init_cacheinfo (struct cpu_features *cpu_features)
{
  /* Find out what brand of processor.  */
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  int max_cpuid_ex;
  long int data = -1;
  long int shared = -1;
  long int core = -1;
  unsigned int threads = 0;
  unsigned long int level1_icache_size = -1;
  unsigned long int level1_icache_linesize = -1;
  unsigned long int level1_dcache_size = -1;
  unsigned long int level1_dcache_assoc = -1;
  unsigned long int level1_dcache_linesize = -1;
  unsigned long int level2_cache_size = -1;
  unsigned long int level2_cache_assoc = -1;
  unsigned long int level2_cache_linesize = -1;
  unsigned long int level3_cache_size = -1;
  unsigned long int level3_cache_assoc = -1;
  unsigned long int level3_cache_linesize = -1;
  unsigned long int level4_cache_size = -1;

  if (cpu_features->basic.kind == arch_kind_intel)
    {
      data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
      core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
      shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);

      level1_icache_size
	= handle_intel (_SC_LEVEL1_ICACHE_SIZE, cpu_features);
      level1_icache_linesize
	= handle_intel (_SC_LEVEL1_ICACHE_LINESIZE, cpu_features);
      level1_dcache_size = data;
      level1_dcache_assoc
	= handle_intel (_SC_LEVEL1_DCACHE_ASSOC, cpu_features);
      level1_dcache_linesize
	= handle_intel (_SC_LEVEL1_DCACHE_LINESIZE, cpu_features);
      level2_cache_size = core;
      level2_cache_assoc
	= handle_intel (_SC_LEVEL2_CACHE_ASSOC, cpu_features);
      level2_cache_linesize
	= handle_intel (_SC_LEVEL2_CACHE_LINESIZE, cpu_features);
      level3_cache_size = shared;
      level3_cache_assoc
	= handle_intel (_SC_LEVEL3_CACHE_ASSOC, cpu_features);
      level3_cache_linesize
	= handle_intel (_SC_LEVEL3_CACHE_LINESIZE, cpu_features);
      level4_cache_size
	= handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features);

      get_common_cache_info (&shared, &threads, core);
    }
  else if (cpu_features->basic.kind == arch_kind_zhaoxin)
    {
      data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
      core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
      shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);

      level1_icache_size = handle_zhaoxin (_SC_LEVEL1_ICACHE_SIZE);
      level1_icache_linesize = handle_zhaoxin (_SC_LEVEL1_ICACHE_LINESIZE);
      level1_dcache_size = data;
      level1_dcache_assoc = handle_zhaoxin (_SC_LEVEL1_DCACHE_ASSOC);
      level1_dcache_linesize = handle_zhaoxin (_SC_LEVEL1_DCACHE_LINESIZE);
      level2_cache_size = core;
      level2_cache_assoc = handle_zhaoxin (_SC_LEVEL2_CACHE_ASSOC);
      level2_cache_linesize = handle_zhaoxin (_SC_LEVEL2_CACHE_LINESIZE);
      level3_cache_size = shared;
      level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC);
      level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE);

      get_common_cache_info (&shared, &threads, core);
    }
  else if (cpu_features->basic.kind == arch_kind_amd)
    {
      data  = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
      core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
      shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);

      level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE);
      level1_icache_linesize = handle_amd (_SC_LEVEL1_ICACHE_LINESIZE);
      level1_dcache_size = data;
      level1_dcache_assoc = handle_amd (_SC_LEVEL1_DCACHE_ASSOC);
      level1_dcache_linesize = handle_amd (_SC_LEVEL1_DCACHE_LINESIZE);
      level2_cache_size = core;
      level2_cache_assoc = handle_amd (_SC_LEVEL2_CACHE_ASSOC);
      level2_cache_linesize = handle_amd (_SC_LEVEL2_CACHE_LINESIZE);
      level3_cache_size = shared;
      level3_cache_assoc = handle_amd (_SC_LEVEL3_CACHE_ASSOC);
      level3_cache_linesize = handle_amd (_SC_LEVEL3_CACHE_LINESIZE);

      /* Get maximum extended function. */
      __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);

      if (shared <= 0)
	/* No shared L3 cache.  All we have is the L2 cache.  */
	shared = core;
      else
	{
	  /* Figure out the number of logical threads that share L3.  */
	  if (max_cpuid_ex >= 0x80000008)
	    {
	      /* Get width of APIC ID.  */
	      __cpuid (0x80000008, max_cpuid_ex, ebx, ecx, edx);
	      threads = 1 << ((ecx >> 12) & 0x0f);
	    }

	  if (threads == 0 || cpu_features->basic.family >= 0x17)
	    {
	      /* If APIC ID width is not available, use logical
		 processor count.  */
	      __cpuid (0x00000001, max_cpuid_ex, ebx, ecx, edx);

	      if ((edx & (1 << 28)) != 0)
		threads = (ebx >> 16) & 0xff;
	    }

	  /* Cap usage of highest cache level to the number of
	     supported threads.  */
	  if (threads > 0)
	    shared /= threads;

	  /* Get shared cache per ccx for Zen architectures.  */
	  if (cpu_features->basic.family >= 0x17)
	    {
	      unsigned int eax;

	      /* Get the number of threads sharing the L3 cache in a CCX.  */
	      __cpuid_count (0x8000001D, 0x3, eax, ebx, ecx, edx);

	      unsigned int threads_per_ccx = ((eax >> 14) & 0xfff) + 1;
	      shared *= threads_per_ccx;
	    }
	  else
	    {
	      /* Account for exclusive L2 and L3 caches.  */
	      shared += core;
            }
	}
    }

  cpu_features->level1_icache_size = level1_icache_size;
  cpu_features->level1_icache_linesize = level1_icache_linesize;
  cpu_features->level1_dcache_size = level1_dcache_size;
  cpu_features->level1_dcache_assoc = level1_dcache_assoc;
  cpu_features->level1_dcache_linesize = level1_dcache_linesize;
  cpu_features->level2_cache_size = level2_cache_size;
  cpu_features->level2_cache_assoc = level2_cache_assoc;
  cpu_features->level2_cache_linesize = level2_cache_linesize;
  cpu_features->level3_cache_size = level3_cache_size;
  cpu_features->level3_cache_assoc = level3_cache_assoc;
  cpu_features->level3_cache_linesize = level3_cache_linesize;
  cpu_features->level4_cache_size = level4_cache_size;

  /* The default setting for the non_temporal threshold is 3/4 of one
     thread's share of the chip's cache. For most Intel and AMD processors
     with an initial release date between 2017 and 2020, a thread's typical
     share of the cache is from 500 KBytes to 2 MBytes. Using the 3/4
     threshold leaves 125 KBytes to 500 KBytes of the thread's data
     in cache after a maximum temporal copy, which keeps a reasonable
     portion of the thread's stack and other active data in cache. If
     the threshold is set higher than one thread's share of the cache,
     it has a substantial risk of negatively impacting the performance
     of other threads running on the chip. */
  unsigned long int non_temporal_threshold = shared * 3 / 4;

#if HAVE_TUNABLES
  /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
  unsigned int minimum_rep_movsb_threshold;
#endif
  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
     threshold is 2048 * (VEC_SIZE / 16).  */
  unsigned int rep_movsb_threshold;
  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
      && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
    {
      rep_movsb_threshold = 4096 * (64 / 16);
#if HAVE_TUNABLES
      minimum_rep_movsb_threshold = 64 * 8;
#endif
    }
  else if (CPU_FEATURE_PREFERRED_P (cpu_features,
				    AVX_Fast_Unaligned_Load))
    {
      rep_movsb_threshold = 4096 * (32 / 16);
#if HAVE_TUNABLES
      minimum_rep_movsb_threshold = 32 * 8;
#endif
    }
  else
    {
      rep_movsb_threshold = 2048 * (16 / 16);
#if HAVE_TUNABLES
      minimum_rep_movsb_threshold = 16 * 8;
#endif
    }
  /* NB: The default REP MOVSB threshold is 2112 on processors with fast
     short REP MOVSB (FSRM).  */
  if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
    rep_movsb_threshold = 2112;

  /* The default threshold to use Enhanced REP STOSB.  */
  unsigned long int rep_stosb_threshold = 2048;

#if HAVE_TUNABLES
  long int tunable_size;

  tunable_size = TUNABLE_GET (x86_data_cache_size, long int, NULL);
  /* NB: Ignore the default value 0.  */
  if (tunable_size != 0)
    data = tunable_size;

  tunable_size = TUNABLE_GET (x86_shared_cache_size, long int, NULL);
  /* NB: Ignore the default value 0.  */
  if (tunable_size != 0)
    shared = tunable_size;

  tunable_size = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
  /* NB: Ignore the default value 0.  */
  if (tunable_size != 0)
    non_temporal_threshold = tunable_size;

  tunable_size = TUNABLE_GET (x86_rep_movsb_threshold, long int, NULL);
  if (tunable_size > minimum_rep_movsb_threshold)
    rep_movsb_threshold = tunable_size;

  /* NB: The default value of the x86_rep_stosb_threshold tunable is the
     same as the default value of __x86_rep_stosb_threshold and the
     minimum value is fixed.  */
  rep_stosb_threshold = TUNABLE_GET (x86_rep_stosb_threshold,
				     long int, NULL);

  TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
  /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of
     'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
     if that operation cannot overflow. Minimum of 0x4040 (16448) because the
     L(large_memset_4x) loops need 64-byte to cache align and enough space for
     at least 1 iteration of 4x PAGE_SIZE unrolled loop.  Both values are
     reflected in the manual.  */
  TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
			   0x4040, SIZE_MAX >> 4);
  TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
			   minimum_rep_movsb_threshold, SIZE_MAX);
  TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
			   SIZE_MAX);
#endif

  unsigned long int rep_movsb_stop_threshold;
  /* The ERMS feature is implemented starting with the AMD Zen3
     architecture and performs poorly for data above the L2 cache size.
     Hence, add an upper bound threshold to limit the use of Enhanced
     REP MOVSB operations and set it to the L2 cache size.  */
  if (cpu_features->basic.kind == arch_kind_amd)
    rep_movsb_stop_threshold = core;
  /* Set the upper bound of ERMS to the computed value of the
     non-temporal threshold for architectures other than AMD.  */
  else
    rep_movsb_stop_threshold = non_temporal_threshold;

  cpu_features->data_cache_size = data;
  cpu_features->shared_cache_size = shared;
  cpu_features->non_temporal_threshold = non_temporal_threshold;
  cpu_features->rep_movsb_threshold = rep_movsb_threshold;
  cpu_features->rep_stosb_threshold = rep_stosb_threshold;
  cpu_features->rep_movsb_stop_threshold = rep_movsb_stop_threshold;
}