/* Copyright (C) 2002-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <assert.h>
#include <errno.h>
#include <signal.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <dl-sysdep.h>
#include <dl-tls.h>
#include <tls.h>
#include <list.h>
#include <lowlevellock.h>
#include <futex-internal.h>
#include <kernel-features.h>
#include <nptl-stack.h>
#include <libc-lock.h>
#include <tls-internal.h>

/* Default alignment of stack.  */
#ifndef STACK_ALIGN
# define STACK_ALIGN __alignof__ (long double)
#endif

/* Default value for minimal stack size after allocating thread
   descriptor and guard.  */
#ifndef MINIMAL_REST_STACK
# define MINIMAL_REST_STACK	4096
#endif


/* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
   a stack.  Use it when possible.  */
#ifndef MAP_STACK
# define MAP_STACK 0
#endif

/* Get a stack from the cache.  We have to match by size since some
   blocks might be too small or far too large.  */
static struct pthread *
get_cached_stack (size_t *sizep, void **memp)
{
  size_t size = *sizep;
  struct pthread *result = NULL;
  list_t *entry;

  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);

  /* Search the cache for a matching entry.  We search for the
     smallest stack which has at least the required size.  Note that
     in normal situations the size of all allocated stacks is the
     same.  At the very least there are only a few different sizes.
     Therefore this loop will exit early most of the time with an
     exact match.  */
  list_for_each (entry, &GL (dl_stack_cache))
    {
      struct pthread *curr;

      curr = list_entry (entry, struct pthread, list);
      if (__nptl_stack_in_use (curr) && curr->stackblock_size >= size)
	{
	  if (curr->stackblock_size == size)
	    {
	      result = curr;
	      break;
	    }

	  if (result == NULL
	      || result->stackblock_size > curr->stackblock_size)
	    result = curr;
	}
    }

  if (__builtin_expect (result == NULL, 0)
      /* Make sure the size difference is not excessive.  If it is,
	 we do not use the block.  */
      || __builtin_expect (result->stackblock_size > 4 * size, 0))
    {
      /* Release the lock.  */
      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);

      return NULL;
    }

  /* Don't allow setxid until cloned.  */
  result->setxid_futex = -1;

  /* Dequeue the entry.  */
  __nptl_stack_list_del (&result->list);

  /* And add to the list of stacks in use.  */
  __nptl_stack_list_add (&result->list, &GL (dl_stack_used));

  /* And decrease the cache size.  */
  GL (dl_stack_cache_actsize) -= result->stackblock_size;

  /* Release the lock early.  */
  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);

  /* Report size and location of the stack to the caller.  */
  *sizep = result->stackblock_size;
  *memp = result->stackblock;

  /* Cancellation handling is back to the default.  */
  result->cancelhandling = 0;
  result->cleanup = NULL;
  result->setup_failed = 0;

  /* No pending event.  */
  result->nextevent = NULL;

  result->exiting = false;
  __libc_lock_init (result->exit_lock);
  memset (&result->tls_state, 0, sizeof result->tls_state);

  /* Clear the DTV.  */
  dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
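  /* dtv[-1].counter records how many slots the vector has; free any
     dynamically allocated TLS blocks the old slots still point to
     before wiping the vector.  */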
  for (size_t cnt = 0; cnt < dtv[-1].counter; ++cnt)
    free (dtv[1 + cnt].pointer.to_free);
  memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));

  /* Re-initialize the TLS.  */
  _dl_allocate_tls_init (TLS_TPADJ (result), true);

  return result;
}

/* Return the guard page position on the allocated stack.  */
static inline char *
__attribute ((always_inline))
guard_position (void *mem, size_t size, size_t guardsize, struct pthread *pd,
		size_t pagesize_m1)
{
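  /* With a separate register stack the guard sits near the middle of
     the block, so that a single guard region separates the normal
     stack from the register stack's backing store.  Otherwise the
     guard goes at the end the stack grows towards: the start of the
     block for a downward-growing stack, or just below the thread
     descriptor for an upward-growing stack.  */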
#ifdef NEED_SEPARATE_REGISTER_STACK
  return mem + (((size - guardsize) / 2) & ~pagesize_m1);
#elif _STACK_GROWS_DOWN
  return mem;
#elif _STACK_GROWS_UP
  return (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
#endif
}

/* For a stack allocated with PROT_NONE, set up the required portions
   with 'prot' flags based on the guard page position.  */
static inline int
setup_stack_prot (char *mem, size_t size, char *guard, size_t guardsize,
		  const int prot)
{
  char *guardend = guard + guardsize;
#if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
  /* As defined at guard_position, for architectures with a
     downward-growing stack the guard page is always at the start of
     the allocated area.  */
  if (__mprotect (guardend, size - guardsize, prot) != 0)
    return errno;
#else
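  /* Here the guard lies inside the mapping (in the middle for a
     separate register stack, or just below the thread descriptor for
     an upward-growing stack), so two mprotect calls are needed: one
     for the region below the guard and one for the region above it.  */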
  size_t mprots1 = (uintptr_t) guard - (uintptr_t) mem;
  if (__mprotect (mem, mprots1, prot) != 0)
    return errno;
  size_t mprots2 = ((uintptr_t) mem + size) - (uintptr_t) guardend;
  if (__mprotect (guardend, mprots2, prot) != 0)
    return errno;
#endif
  return 0;
}

/* Advise the kernel that the unused memory of the stack can be
   reclaimed.  Everything is released except for the part of the
   stack still in use (plus a small margin) and the space used for
   the TCB itself.  */
static __always_inline void
advise_stack_range (void *mem, size_t size, uintptr_t pd, size_t guardsize)
{
  uintptr_t sp = (uintptr_t) CURRENT_STACK_FRAME;
  size_t pagesize_m1 = __getpagesize () - 1;
#if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
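  /* The stack grows down, so everything between the start of the
     block and the current stack pointer is unused; release it, but
     keep a PTHREAD_STACK_MIN margin below SP.  */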
  size_t freesize = (sp - (uintptr_t) mem) & ~pagesize_m1;
  assert (freesize < size);
  if (freesize > PTHREAD_STACK_MIN)
    __madvise (mem, freesize - PTHREAD_STACK_MIN, MADV_DONTNEED);
#else
  /* Page aligned start of memory to free (higher than or equal
     to current sp plus the minimum stack size).  */
  uintptr_t freeblock = (sp + PTHREAD_STACK_MIN + pagesize_m1) & ~pagesize_m1;
  uintptr_t free_end = (pd - guardsize) & ~pagesize_m1;
  if (free_end > freeblock)
    {
      size_t freesize = free_end - freeblock;
      assert (freesize < size);
      __madvise ((void*) freeblock, freesize, MADV_DONTNEED);
    }
#endif
}

/* Returns a usable stack for a new thread either by allocating a
   new stack or reusing a cached stack of sufficient size.
   ATTR must be non-NULL and point to a valid pthread_attr.
   PDP must be non-NULL.  */
static int
allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
		void **stack, size_t *stacksize)
{
  struct pthread *pd;
  size_t size;
  size_t pagesize_m1 = __getpagesize () - 1;
  size_t tls_static_size_for_stack = __nptl_tls_static_size_for_stack ();
  size_t tls_static_align_m1 = GLRO (dl_tls_static_align) - 1;

  assert (powerof2 (pagesize_m1 + 1));
  assert (TCB_ALIGNMENT >= STACK_ALIGN);

  /* Get the stack size from the attribute if it is set.  Otherwise we
     use the default we determined at start time.  */
  if (attr->stacksize != 0)
    size = attr->stacksize;
  else
    {
      lll_lock (__default_pthread_attr_lock, LLL_PRIVATE);
      size = __default_pthread_attr.internal.stacksize;
      lll_unlock (__default_pthread_attr_lock, LLL_PRIVATE);
    }

  /* Get memory for the stack.  */
  if (__glibc_unlikely (attr->flags & ATTR_FLAG_STACKADDR))
    {
      uintptr_t adj;
      char *stackaddr = (char *) attr->stackaddr;

      /* Assume the same layout as the _STACK_GROWS_DOWN case, with struct
	 pthread at the top of the stack block.  Later we adjust the guard
	 location and stack address to match the _STACK_GROWS_UP case.  */
      if (_STACK_GROWS_UP)
	stackaddr += attr->stacksize;

      /* If the user also specified the size of the stack make sure it
	 is large enough.  */
      if (attr->stacksize != 0
	  && attr->stacksize < (tls_static_size_for_stack
				+ MINIMAL_REST_STACK))
	return EINVAL;

      /* Adjust stack size for alignment of the TLS block.  */
#if TLS_TCB_AT_TP
      adj = ((uintptr_t) stackaddr - TLS_TCB_SIZE)
	    & tls_static_align_m1;
      assert (size > adj + TLS_TCB_SIZE);
#elif TLS_DTV_AT_TP
      adj = ((uintptr_t) stackaddr - tls_static_size_for_stack)
	    & tls_static_align_m1;
      assert (size > adj);
#endif

      /* The user provided some memory.  Let's hope it matches the
	 size...  We do not allocate guard pages if the user provided
	 the stack.  It is the user's responsibility to do this if it
	 is wanted.  */
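      /* ADJ is the misalignment computed above; subtracting it below
	 places the TCB (TLS_TCB_AT_TP) or the static TLS block
	 (TLS_DTV_AT_TP) at an address aligned to the static TLS
	 alignment requirement.  */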
#if TLS_TCB_AT_TP
      pd = (struct pthread *) ((uintptr_t) stackaddr
			       - TLS_TCB_SIZE - adj);
#elif TLS_DTV_AT_TP
      pd = (struct pthread *) (((uintptr_t) stackaddr
				- tls_static_size_for_stack - adj)
			       - TLS_PRE_TCB_SIZE);
#endif

      /* The user provided stack memory needs to be cleared.  */
      memset (pd, '\0', sizeof (struct pthread));

      /* The first TSD block is included in the TCB.  */
      pd->specific[0] = pd->specific_1stblock;

      /* Remember the stack-related values.  */
      pd->stackblock = (char *) stackaddr - size;
      pd->stackblock_size = size;

      /* This is a user-provided stack.  It will not be queued in the
	 stack cache nor will the memory (except the TLS memory) be freed.  */
      pd->user_stack = true;

      /* This is at least the second thread.  */
      pd->header.multiple_threads = 1;

#ifdef NEED_DL_SYSINFO
      SETUP_THREAD_SYSINFO (pd);
#endif

      /* Don't allow setxid until cloned.  */
      pd->setxid_futex = -1;

      /* Allocate the DTV for this thread.  */
      if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
	{
	  /* Something went wrong.  */
	  assert (errno == ENOMEM);
	  return errno;
	}


      /* Prepare to modify global data.  */
      lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);

      /* And add to the list of stacks in use.  */
      list_add (&pd->list, &GL (dl_stack_user));

      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
    }
  else
    {
      /* Allocate some anonymous memory.  If possible use the cache.  */
      size_t guardsize;
      size_t reported_guardsize;
      size_t reqsize;
      void *mem;
      const int prot = (PROT_READ | PROT_WRITE
			| ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));

      /* Adjust the stack size for alignment.  */
      size &= ~tls_static_align_m1;
      assert (size != 0);

      /* Make sure the size of the stack is enough for the guard and,
	 possibly, the thread descriptor.  On some targets there is
	 a minimum guard size requirement, ARCH_MIN_GUARD_SIZE, so
	 internally enforce it (unless the guard was disabled), but
	 report the original guard size for backward compatibility:
	 before POSIX 2008 the guardsize was specified to be one page
	 by default, which is observable via pthread_attr_getguardsize
	 and pthread_getattr_np.  */
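      /* Purely as an illustration: a 4 KiB guard request on a target
	 whose ARCH_MIN_GUARD_SIZE happened to be 64 KiB would be
	 enlarged to 64 KiB below, while reported_guardsize would keep
	 the original 4 KiB value.  */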
      guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
      reported_guardsize = guardsize;
      if (guardsize > 0 && guardsize < ARCH_MIN_GUARD_SIZE)
	guardsize = ARCH_MIN_GUARD_SIZE;
      if (guardsize < attr->guardsize || size + guardsize < guardsize)
	/* Arithmetic overflow.  */
	return EINVAL;
      size += guardsize;
      if (__builtin_expect (size < ((guardsize + tls_static_size_for_stack
				     + MINIMAL_REST_STACK + pagesize_m1)
				    & ~pagesize_m1),
			    0))
	/* The stack is too small (or the guard too large).  */
	return EINVAL;

      /* Try to get a stack from the cache.  */
      reqsize = size;
      pd = get_cached_stack (&size, &mem);
      if (pd == NULL)
	{
	  /* If a guard page is required, avoid committing memory by first
	     allocating with PROT_NONE and then setting the required
	     permissions on everything except the guard area.  */
	  mem = __mmap (NULL, size, (guardsize == 0) ? prot : PROT_NONE,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);

	  if (__glibc_unlikely (mem == MAP_FAILED))
	    return errno;

	  /* SIZE is guaranteed to be greater than zero.
	     So we can never get a null pointer back from mmap.  */
	  assert (mem != NULL);

	  /* Place the thread descriptor at the end of the stack.  */
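	  /* In both layouts the address is rounded down to the static
	     TLS alignment so that the TCB / static TLS block placed
	     at the top of the mapping is properly aligned.  */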
#if TLS_TCB_AT_TP
	  pd = (struct pthread *) ((((uintptr_t) mem + size)
				    - TLS_TCB_SIZE)
				   & ~tls_static_align_m1);
#elif TLS_DTV_AT_TP
	  pd = (struct pthread *) ((((uintptr_t) mem + size
				    - tls_static_size_for_stack)
				    & ~tls_static_align_m1)
				   - TLS_PRE_TCB_SIZE);
#endif

	  /* Now mprotect the required region excluding the guard area.  */
	  if (__glibc_likely (guardsize > 0))
	    {
	      char *guard = guard_position (mem, size, guardsize, pd,
					    pagesize_m1);
	      if (setup_stack_prot (mem, size, guard, guardsize, prot) != 0)
		{
		  __munmap (mem, size);
		  return errno;
		}
	    }

	  /* Remember the stack-related values.  */
	  pd->stackblock = mem;
	  pd->stackblock_size = size;
	  /* Record the guard size of the newly allocated stack to
	     avoid an extra mprotect in the guard resize code below.  */
	  pd->guardsize = guardsize;

	  /* We allocated the first block of the thread-specific data
	     array.  This address will not change for the lifetime of
	     this descriptor.  */
	  pd->specific[0] = pd->specific_1stblock;

	  /* This is at least the second thread.  */
	  pd->header.multiple_threads = 1;

#ifdef NEED_DL_SYSINFO
	  SETUP_THREAD_SYSINFO (pd);
#endif

	  /* Don't allow setxid until cloned.  */
	  pd->setxid_futex = -1;

	  /* Allocate the DTV for this thread.  */
	  if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
	    {
	      /* Something went wrong.  */
	      assert (errno == ENOMEM);

	      /* Free the stack memory we just allocated.  */
	      (void) __munmap (mem, size);

	      return errno;
	    }


	  /* Prepare to modify global data.  */
	  lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);

	  /* And add to the list of stacks in use.  */
	  __nptl_stack_list_add (&pd->list, &GL (dl_stack_used));

	  lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);


	  /* There might have been a race.  Another thread might have
	     caused the stacks to get exec permission while this new
	     stack was prepared.  Detect if this was possible and
	     change the permission if necessary.  */
	  if (__builtin_expect ((GL(dl_stack_flags) & PF_X) != 0
				&& (prot & PROT_EXEC) == 0, 0))
	    {
	      int err = __nptl_change_stack_perm (pd);
	      if (err != 0)
		{
		  /* Free the stack memory we just allocated.  */
		  (void) __munmap (mem, size);

		  return err;
		}
	    }


	  /* Note that all of the stack and the thread descriptor are
	     zeroed.  This means we do not have to initialize fields
	     whose initial value is zero.  This is specifically true
	     for the 'tid' field, which is always set back to zero
	     once the stack is not used anymore, and for the
	     'guardsize' field, which will be read next.  */
	}

      /* Create or resize the guard area if necessary.  */
      if (__glibc_unlikely (guardsize > pd->guardsize))
	{
	  char *guard = guard_position (mem, size, guardsize, pd,
					pagesize_m1);
	  if (__mprotect (guard, guardsize, PROT_NONE) != 0)
	    {
	    mprot_error:
	      lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);

	      /* Remove the thread from the list.  */
	      __nptl_stack_list_del (&pd->list);

	      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);

	      /* Get rid of the TLS block we allocated.  */
	      _dl_deallocate_tls (TLS_TPADJ (pd), false);

	      /* Free the stack memory regardless of whether the size
		 of the cache is over the limit or not.  If this piece
		 of memory caused problems we had better not use it
		 anymore.  We also ignore possible errors; there is
		 nothing we could do.  */
	      (void) __munmap (mem, size);

	      return errno;
	    }

	  pd->guardsize = guardsize;
	}
      else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
				 0))
	{
	  /* The old guard area is too large.  */

#ifdef NEED_SEPARATE_REGISTER_STACK
	  char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
	  char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);

	  if (oldguard < guard
	      && __mprotect (oldguard, guard - oldguard, prot) != 0)
	    goto mprot_error;

	  if (__mprotect (guard + guardsize,
			oldguard + pd->guardsize - guard - guardsize,
			prot) != 0)
	    goto mprot_error;
#elif _STACK_GROWS_DOWN
	  if (__mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
			prot) != 0)
	    goto mprot_error;
#elif _STACK_GROWS_UP
	  char *new_guard = (char *)(((uintptr_t) pd - guardsize)
				     & ~pagesize_m1);
	  char *old_guard = (char *)(((uintptr_t) pd - pd->guardsize)
				     & ~pagesize_m1);
	  /* The guard size difference might be > 0, but once rounded
	     to the nearest page the size difference might be zero.  */
	  if (new_guard > old_guard
	      && __mprotect (old_guard, new_guard - old_guard, prot) != 0)
	    goto mprot_error;
#endif

	  pd->guardsize = guardsize;
	}
      /* pthread_getattr_np() needs to report the guard size requested
	 in the attribute, regardless of how large the guard actually
	 in use is.  */
      pd->reported_guardsize = reported_guardsize;
    }

  /* Initialize the lock.  We have to do this unconditionally since the
     stillborn thread could be canceled while the lock is taken.  */
  pd->lock = LLL_LOCK_INITIALIZER;

  /* The robust mutex lists also need to be initialized
     unconditionally because the cleanup for the previous stack owner
     might have happened in the kernel.  */
  pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
				  - offsetof (pthread_mutex_t,
					      __data.__list.__next));
  pd->robust_head.list_op_pending = NULL;
#if __PTHREAD_MUTEX_HAVE_PREV
  pd->robust_prev = &pd->robust_head;
#endif
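  /* An empty robust list is represented by the head pointing to
     itself; this is what the kernel expects to find via the
     set_robust_list registration.  */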
  pd->robust_head.list = &pd->robust_head;

  /* We place the thread descriptor at the end of the stack.  */
  *pdp = pd;

  void *stacktop;

#if TLS_TCB_AT_TP
  /* The stack begins before the TCB and the static TLS block.  */
  stacktop = ((char *) (pd + 1) - tls_static_size_for_stack);
#elif TLS_DTV_AT_TP
  stacktop = (char *) (pd - 1);
#endif

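  /* Report the usable stack range to the caller: it starts at the
     beginning of the stack block and ends where the thread descriptor
     and static TLS area carved out at the top begin.  */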
  *stacksize = stacktop - pd->stackblock;
  *stack = pd->stackblock;

  return 0;
}