1 /* Copyright (C) 2002-2022 Free Software Foundation, Inc.
2    This file is part of the GNU C Library.
3 
4    The GNU C Library is free software; you can redistribute it and/or
5    modify it under the terms of the GNU Lesser General Public
6    License as published by the Free Software Foundation; either
7    version 2.1 of the License, or (at your option) any later version.
8 
9    The GNU C Library is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12    Lesser General Public License for more details.
13 
14    You should have received a copy of the GNU Lesser General Public
15    License along with the GNU C Library; if not, see
16    <https://www.gnu.org/licenses/>.  */
17 
18 #include <ctype.h>
19 #include <errno.h>
20 #include <stdbool.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <stdint.h>
24 #include "pthreadP.h"
25 #include <hp-timing.h>
26 #include <ldsodefs.h>
27 #include <atomic.h>
28 #include <libc-diag.h>
29 #include <libc-internal.h>
30 #include <resolv.h>
31 #include <kernel-features.h>
32 #include <default-sched.h>
33 #include <futex-internal.h>
34 #include <tls-setup.h>
35 #include <rseq-internal.h>
36 #include "libioP.h"
37 #include <sys/single_threaded.h>
38 #include <version.h>
39 #include <clone_internal.h>
40 #include <futex-internal.h>
41 
42 #include <shlib-compat.h>
43 
44 #include <stap-probe.h>
45 
46 
/* Globally enabled events.  In this file the mask is OR-ed with a
   thread's own event mask (eventbuf.eventmask) to decide whether a
   TD_CREATE or TD_DEATH event has to be reported.  */
td_thr_events_t __nptl_threads_events;
libc_hidden_proto (__nptl_threads_events)
libc_hidden_data_def (__nptl_threads_events)
51 
/* Pointer to descriptor with the last event.  Descriptors with a
   pending event are pushed here with a compare-and-exchange loop and
   chained to the previous value through their nextevent member.  */
struct pthread *__nptl_last_event;
libc_hidden_proto (__nptl_last_event)
libc_hidden_data_def (__nptl_last_event)
56 
#ifdef SHARED
/* This variable is used to access _rtld_global from libthread_db.  If
   GDB loads libpthread before ld.so, it is not possible to resolve
   _rtld_global directly during libpthread initialization.  It is only
   defined in shared builds and is statically initialized with the
   address of _rtld_global.  */
struct rtld_global *__nptl_rtld_global = &_rtld_global;
#endif
63 
/* Version of the library, used in libthread_db to detect mismatches.
   The array holds the VERSION string this file was built with.  */
const char __nptl_version[] = VERSION;
66 
67 /* This performs the initialization necessary when going from
68    single-threaded to multi-threaded mode for the first time.  */
69 static void
late_init(void)70 late_init (void)
71 {
72   struct sigaction sa;
73   __sigemptyset (&sa.sa_mask);
74 
75   /* Install the handle to change the threads' uid/gid.  Use
76      SA_ONSTACK because the signal may be sent to threads that are
77      running with custom stacks.  (This is less likely for
78      SIGCANCEL.)  */
79   sa.sa_sigaction = __nptl_setxid_sighandler;
80   sa.sa_flags = SA_ONSTACK | SA_SIGINFO | SA_RESTART;
81   (void) __libc_sigaction (SIGSETXID, &sa, NULL);
82 
83   /* The parent process might have left the signals blocked.  Just in
84      case, unblock it.  We reuse the signal mask in the sigaction
85      structure.  It is already cleared.  */
86   __sigaddset (&sa.sa_mask, SIGCANCEL);
87   __sigaddset (&sa.sa_mask, SIGSETXID);
88   INTERNAL_SYSCALL_CALL (rt_sigprocmask, SIG_UNBLOCK, &sa.sa_mask,
89 			 NULL, __NSIG_BYTES);
90 }
91 
92 /* Code to allocate and deallocate a stack.  */
93 #include "allocatestack.c"
94 
95 /* CONCURRENCY NOTES:
96 
97    Understanding who is the owner of the 'struct pthread' or 'PD'
98    (refers to the value of the 'struct pthread *pd' function argument)
99    is critically important in determining exactly which operations are
100    allowed and which are not and when, particularly when it comes to the
101    implementation of pthread_create, pthread_join, pthread_detach, and
102    other functions which all operate on PD.
103 
104    The owner of PD is responsible for freeing the final resources
105    associated with PD, and may examine the memory underlying PD at any
106    point in time until it frees it back to the OS or to reuse by the
107    runtime.
108 
109    The thread which calls pthread_create is called the creating thread.
110    The creating thread begins as the owner of PD.
111 
112    During startup the new thread may examine PD in coordination with the
113    owner thread (which may be itself).
114 
115    The four cases of ownership transfer are:
116 
117    (1) Ownership of PD is released to the process (all threads may use it)
118        after the new thread starts in a joinable state
119        i.e. pthread_create returns a usable pthread_t.
120 
121    (2) Ownership of PD is released to the new thread starting in a detached
122        state.
123 
124    (3) Ownership of PD is dynamically released to a running thread via
125        pthread_detach.
126 
127    (4) Ownership of PD is acquired by the thread which calls pthread_join.
128 
129    Implementation notes:
130 
131    The PD->stopped_start and thread_ran variables are used to determine
132    exactly which of the four ownership states we are in and therefore
133    what actions can be taken.  For example after (2) we cannot read or
134    write from PD anymore since the thread may no longer exist and the
135    memory may be unmapped.
136 
137    It is important to point out that PD->lock is being used both
138    similar to a one-shot semaphore and subsequently as a mutex.  The
139    lock is taken in the parent to force the child to wait, and then the
140    child releases the lock.  However, this semaphore-like effect is used
141    only for synchronizing the parent and child.  After startup the lock
142    is used like a mutex to create a critical section during which a
143    single owner modifies the thread parameters.
144 
145    The most complicated cases happen during thread startup:
146 
147    (a) If the created thread is in a detached (PTHREAD_CREATE_DETACHED),
148        or joinable (default PTHREAD_CREATE_JOINABLE) state and
149        STOPPED_START is true, then the creating thread has ownership of
150        PD until the PD->lock is released by pthread_create.  If any
151        errors occur we are in states (c) or (d) below.
152 
   (b) If the created thread is in a detached state
       (PTHREAD_CREATE_DETACHED), and STOPPED_START is false, then the
155        creating thread has ownership of PD until it invokes the OS
156        kernel's thread creation routine.  If this routine returns
157        without error, then the created thread owns PD; otherwise, see
158        (c) or (d) below.
159 
   (c) If either a joinable or detached thread setup failed and THREAD_RAN
       is true, then the creating thread releases ownership to the new thread,
       the created thread sees the failed setup through the PD->setup_failed
       member, releases the PD ownership, and exits.  The creating thread is
       then responsible for cleaning up the allocated resources.  THREAD_RAN
       is local to the creating thread and indicates whether thread creation
       or setup has failed.

   (d) If the thread creation failed and THREAD_RAN is false (meaning
       ARCH_CLONE has failed), then the creating thread retains ownership
       of PD and must clean up the allocated resources.  No waiting for the
       new thread is required because it never started.
172 
173    The nptl_db interface:
174 
175    The interface with nptl_db requires that we enqueue PD into a linked
176    list and then call a function which the debugger will trap.  The PD
177    will then be dequeued and control returned to the thread.  The caller
178    at the time must have ownership of PD and such ownership remains
179    after control returns to thread. The enqueued PD is removed from the
180    linked list by the nptl_db callback td_thr_event_getmsg.  The debugger
181    must ensure that the thread does not resume execution, otherwise
182    ownership of PD may be lost and examining PD will not be possible.
183 
184    Note that the GNU Debugger as of (December 10th 2015) commit
185    c2c2a31fdb228d41ce3db62b268efea04bd39c18 no longer uses
186    td_thr_event_getmsg and several other related nptl_db interfaces. The
187    principal reason for this is that nptl_db does not support non-stop
188    mode where other threads can run concurrently and modify runtime
189    structures currently in use by the debugger and the nptl_db
190    interface.
191 
192    Axioms:
193 
194    * The create_thread function can never set stopped_start to false.
195    * The created thread can read stopped_start but never write to it.
196    * The variable thread_ran is set some time after the OS thread
197      creation routine returns, how much time after the thread is created
198      is unspecified, but it should be as quickly as possible.
199 
200 */
201 
202 /* CREATE THREAD NOTES:
203 
204    create_thread must initialize PD->stopped_start.  It should be true
205    if the STOPPED_START parameter is true, or if create_thread needs the
206    new thread to synchronize at startup for some other implementation
207    reason.  If STOPPED_START will be true, then create_thread is obliged
208    to lock PD->lock before starting the thread.  Then pthread_create
   unlocks PD->lock which synchronizes-with start_thread in the
   child thread which does an acquire/release of PD->lock as the last
   action before calling the user entry point.  The goal of all of this
   is to ensure that the required initial thread attributes are applied
   (by the creating thread) before the new thread runs user code.  Note
   that the functions pthread_getschedparam, pthread_setschedparam,
   pthread_setschedprio, __pthread_tpp_change_priority, and
   __pthread_current_priority reuse the same lock, PD->lock, for a
   similar purpose e.g. synchronizing the setting of similar thread
   attributes.  These functions are never called before the thread is
   created, so don't participate in startup synchronization, but given
220    that the lock is present already and in the unlocked state, reusing
221    it saves space.
222 
223    The return value is zero for success or an errno code for failure.
224    If the return value is ENOMEM, that will be translated to EAGAIN,
225    so create_thread need not do that.  On failure, *THREAD_RAN should
226    be set to true iff the thread actually started up but before calling
227    the user code (*PD->start_routine).  */
228 
229 static int _Noreturn start_thread (void *arg);
230 
create_thread(struct pthread * pd,const struct pthread_attr * attr,bool * stopped_start,void * stackaddr,size_t stacksize,bool * thread_ran)231 static int create_thread (struct pthread *pd, const struct pthread_attr *attr,
232 			  bool *stopped_start, void *stackaddr,
233 			  size_t stacksize, bool *thread_ran)
234 {
235   /* Determine whether the newly created threads has to be started
236      stopped since we have to set the scheduling parameters or set the
237      affinity.  */
238   bool need_setaffinity = (attr != NULL && attr->extension != NULL
239 			   && attr->extension->cpuset != 0);
240   if (attr != NULL
241       && (__glibc_unlikely (need_setaffinity)
242 	  || __glibc_unlikely ((attr->flags & ATTR_FLAG_NOTINHERITSCHED) != 0)))
243     *stopped_start = true;
244 
245   pd->stopped_start = *stopped_start;
246   if (__glibc_unlikely (*stopped_start))
247     lll_lock (pd->lock, LLL_PRIVATE);
248 
249   /* We rely heavily on various flags the CLONE function understands:
250 
251      CLONE_VM, CLONE_FS, CLONE_FILES
252 	These flags select semantics with shared address space and
253 	file descriptors according to what POSIX requires.
254 
255      CLONE_SIGHAND, CLONE_THREAD
256 	This flag selects the POSIX signal semantics and various
257 	other kinds of sharing (itimers, POSIX timers, etc.).
258 
259      CLONE_SETTLS
260 	The sixth parameter to CLONE determines the TLS area for the
261 	new thread.
262 
263      CLONE_PARENT_SETTID
264 	The kernels writes the thread ID of the newly created thread
265 	into the location pointed to by the fifth parameters to CLONE.
266 
267 	Note that it would be semantically equivalent to use
268 	CLONE_CHILD_SETTID but it is be more expensive in the kernel.
269 
270      CLONE_CHILD_CLEARTID
271 	The kernels clears the thread ID of a thread that has called
272 	sys_exit() in the location pointed to by the seventh parameter
273 	to CLONE.
274 
275      The termination signal is chosen to be zero which means no signal
276      is sent.  */
277   const int clone_flags = (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SYSVSEM
278 			   | CLONE_SIGHAND | CLONE_THREAD
279 			   | CLONE_SETTLS | CLONE_PARENT_SETTID
280 			   | CLONE_CHILD_CLEARTID
281 			   | 0);
282 
283   TLS_DEFINE_INIT_TP (tp, pd);
284 
285   struct clone_args args =
286     {
287       .flags = clone_flags,
288       .pidfd = (uintptr_t) &pd->tid,
289       .parent_tid = (uintptr_t) &pd->tid,
290       .child_tid = (uintptr_t) &pd->tid,
291       .stack = (uintptr_t) stackaddr,
292       .stack_size = stacksize,
293       .tls = (uintptr_t) tp,
294     };
295   int ret = __clone_internal (&args, &start_thread, pd);
296   if (__glibc_unlikely (ret == -1))
297     return errno;
298 
299   /* It's started now, so if we fail below, we'll have to let it clean itself
300      up.  */
301   *thread_ran = true;
302 
303   /* Now we have the possibility to set scheduling parameters etc.  */
304   if (attr != NULL)
305     {
306       /* Set the affinity mask if necessary.  */
307       if (need_setaffinity)
308 	{
309 	  assert (*stopped_start);
310 
311 	  int res = INTERNAL_SYSCALL_CALL (sched_setaffinity, pd->tid,
312 					   attr->extension->cpusetsize,
313 					   attr->extension->cpuset);
314 	  if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (res)))
315 	    return INTERNAL_SYSCALL_ERRNO (res);
316 	}
317 
318       /* Set the scheduling parameters.  */
319       if ((attr->flags & ATTR_FLAG_NOTINHERITSCHED) != 0)
320 	{
321 	  assert (*stopped_start);
322 
323 	  int res = INTERNAL_SYSCALL_CALL (sched_setscheduler, pd->tid,
324 					   pd->schedpolicy, &pd->schedparam);
325 	  if (__glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (res)))
326 	    return INTERNAL_SYSCALL_ERRNO (res);
327 	}
328     }
329 
330   return 0;
331 }
332 
/* Local function to start thread and handle cleanup.

   This is the function handed to __clone_internal by create_thread, so
   it is the first code every new thread runs.  ARG is the thread
   descriptor (struct pthread *).  The function synchronizes with the
   creating thread if required, performs the per-thread initialization,
   runs the user-supplied start routine, tears the thread down again and
   finally issues the exit system call.  It never returns.  */
static int _Noreturn
start_thread (void *arg)
{
  struct pthread *pd = arg;

  /* We are either in (a) or (b), and in either case we either own PD already
     (2) or are about to own PD (1), and so our only restriction would be that
     we can't free PD until we know we have ownership (see CONCURRENCY NOTES
     above).  */
  if (pd->stopped_start)
    {
      bool setup_failed = false;

      /* Get the lock the parent locked to force synchronization.  */
      lll_lock (pd->lock, LLL_PRIVATE);

      /* We have ownership of PD now.  For detached threads with a setup
	 failure we set it as joinable so the creating thread can
	 synchronously join and free any resources prior to returning to
	 the pthread_create caller.  */
      setup_failed = pd->setup_failed == 1;
      if (setup_failed)
	pd->joinid = NULL;

      /* And give it up right away.  */
      lll_unlock (pd->lock, LLL_PRIVATE);

      /* On a failed setup skip everything else, including the user code
	 and the regular teardown, and go straight to the exit system
	 call.  */
      if (setup_failed)
	goto out;
    }

  /* Initialize resolver state pointer.  */
  __resp = &pd->res;

  /* Initialize pointers to locale data.  */
  __ctype_init ();

  /* Register rseq TLS to the kernel.  A failed registration is fatal
     only if registration was requested through ATTR_FLAG_DO_RSEQ.  */
  {
    bool do_rseq = THREAD_GETMEM (pd, flags) & ATTR_FLAG_DO_RSEQ;
    if (!rseq_register_current_thread (pd, do_rseq) && do_rseq)
      __libc_fatal ("Fatal glibc error: rseq registration failed\n");
  }

  /* Unless the system call is assumed to exist, only register the
     robust list if it was found to be available.  */
#ifndef __ASSUME_SET_ROBUST_LIST
  if (__nptl_set_robust_list_avail)
#endif
    {
      /* This call should never fail because the initial call in init.c
	 succeeded.  */
      INTERNAL_SYSCALL_CALL (set_robust_list, &pd->robust_head,
			     sizeof (struct robust_list_head));
    }

  /* This is where the try/finally block should be created.  For
     compilers without that support we do use setjmp.  */
  struct pthread_unwind_buf unwind_buf;

  /* Zero when setjmp returns directly, nonzero when control comes back
     here through a jump to unwind_buf.  */
  int not_first_call;
  DIAG_PUSH_NEEDS_COMMENT;
#if __GNUC_PREREQ (7, 0)
  /* This call results in a -Wstringop-overflow warning because struct
     pthread_unwind_buf is smaller than jmp_buf.  setjmp and longjmp
     do not use anything beyond the common prefix (they never access
     the saved signal mask), so that is a false positive.  */
  DIAG_IGNORE_NEEDS_COMMENT (11, "-Wstringop-overflow=");
#endif
  not_first_call = setjmp ((struct __jmp_buf_tag *) unwind_buf.cancel_jmp_buf);
  DIAG_POP_NEEDS_COMMENT;

  /* No previous handlers.  NB: This must be done after setjmp since the
     private space in the unwind jump buffer may overlap space used by
     setjmp to store extra architecture-specific information which is
     never used by the cancellation-specific __libc_unwind_longjmp.

     The private space is allowed to overlap because the unwinder never
     has to return through any of the jumped-to call frames, and thus
     only a minimum amount of saved data need be stored, and for example,
     need not include the process signal mask information. This is all
     an optimization to reduce stack usage when pushing cancellation
     handlers.  */
  unwind_buf.priv.data.prev = NULL;
  unwind_buf.priv.data.cleanup = NULL;

  /* Allow setxid from now onwards.  The futex is reset to zero; if its
     previous value was -2 one waiter is woken up (presumably a setxid
     caller waiting for this thread to get this far -- confirm against
     the setxid implementation).  */
  if (__glibc_unlikely (atomic_exchange_acq (&pd->setxid_futex, 0) == -2))
    futex_wake (&pd->setxid_futex, 1, FUTEX_PRIVATE);

  /* Only run the user code on the direct return from setjmp.  When we
     get here a second time the start routine is skipped and only the
     teardown below is performed.  */
  if (__glibc_likely (! not_first_call))
    {
      /* Store the new cleanup handler info.  */
      THREAD_SETMEM (pd, cleanup_jmp_buf, &unwind_buf);

      /* Install the signal mask the creating thread stored in PD.  */
      internal_signal_restore_set (&pd->sigmask);

      LIBC_PROBE (pthread_start, 3, (pthread_t) pd, pd->start_routine, pd->arg);

      /* Run the code the user provided.  */
      void *ret;
      if (pd->c11)
	{
	  /* The function pointer of the C11 thread start is cast to an
	     incorrect type on the __pthread_create_2_1 call, however it is
	     cast back to the correct one here so the call behavior is
	     well-defined (it is assumed that pointers to void are able to
	     represent all values of int).  */
	  int (*start)(void*) = (int (*) (void*)) pd->start_routine;
	  ret = (void*) (uintptr_t) start (pd->arg);
	}
      else
	ret = pd->start_routine (pd->arg);
      THREAD_SETMEM (pd, result, ret);
    }

  /* Call destructors for the thread_local TLS variables.  In non-shared
     builds the call is guarded by an address test, since the function
     may not be present (presumably a weak reference).  */
#ifndef SHARED
  if (&__call_tls_dtors != NULL)
#endif
    __call_tls_dtors ();

  /* Run the destructor for the thread-local data.  */
  __nptl_deallocate_tsd ();

  /* Clean up any state libc stored in thread-local variables.  */
  __libc_thread_freeres ();

  /* Report the death of the thread if this is wanted.  */
  if (__glibc_unlikely (pd->report_events))
    {
      /* See whether TD_DEATH is in any of the masks (the global one or
	 the one of this thread).  */
      const int idx = __td_eventword (TD_DEATH);
      const uint32_t mask = __td_eventmask (TD_DEATH);

      if ((mask & (__nptl_threads_events.event_bits[idx]
		   | pd->eventbuf.eventmask.event_bits[idx])) != 0)
	{
	  /* Yep, we have to signal the death.  Add the descriptor to
	     the list but only if it is not already on it.  */
	  if (pd->nextevent == NULL)
	    {
	      pd->eventbuf.eventnum = TD_DEATH;
	      pd->eventbuf.eventdata = pd;

	      /* Push PD onto the front of the event list; retry until
		 the compare-and-exchange succeeds.  */
	      do
		pd->nextevent = __nptl_last_event;
	      while (atomic_compare_and_exchange_bool_acq (&__nptl_last_event,
							   pd, pd->nextevent));
	    }

	  /* Now call the function which signals the event.  See
	     CONCURRENCY NOTES for the nptl_db interface comments.  */
	  __nptl_death_event ();
	}
    }

  /* The thread is exiting now.  Don't set this bit until after we've hit
     the event-reporting breakpoint, so that td_thr_get_info on us while at
     the breakpoint reports TD_THR_RUN state rather than TD_THR_ZOMBIE.  */
  atomic_bit_set (&pd->cancelhandling, EXITING_BIT);

  if (__glibc_unlikely (atomic_decrement_and_test (&__nptl_nthreads)))
    /* This was the last thread.  Terminate the whole process.  */
    exit (0);

  /* This prevents sending a signal from this thread to itself during
     its final stages.  This must come after the exit call above
     because atexit handlers must not run with signals blocked.

     Do not block SIGSETXID.  The setxid handshake below expects the
     signal to be delivered.  (SIGSETXID cannot run application code,
     nor does it use pthread_kill.)  Reuse the pd->sigmask space for
     computing the signal mask, to save stack space.  */
  internal_sigfillset (&pd->sigmask);
  internal_sigdelset (&pd->sigmask, SIGSETXID);
  INTERNAL_SYSCALL_CALL (rt_sigprocmask, SIG_BLOCK, &pd->sigmask, NULL,
			 __NSIG_BYTES);

  /* Tell __pthread_kill_internal that this thread is about to exit.
     If there is a __pthread_kill_internal in progress, this delays
     the thread exit until the signal has been queued by the kernel
     (so that the TID used to send it remains valid).  */
  __libc_lock_lock (pd->exit_lock);
  pd->exiting = true;
  __libc_lock_unlock (pd->exit_lock);

#ifndef __ASSUME_SET_ROBUST_LIST
  /* If this thread has any robust mutexes locked, handle them now.  */
# if __PTHREAD_MUTEX_HAVE_PREV
  void *robust = pd->robust_head.list;
# else
  __pthread_slist_t *robust = pd->robust_list.__next;
# endif
  /* We let the kernel do the notification if it is able to do so.
     If we have to do it here, there are for sure no PI mutexes involved
     since the kernel support for them is even more recent.  */
  if (!__nptl_set_robust_list_avail
      && __builtin_expect (robust != (void *) &pd->robust_head, 0))
    {
      /* Walk the list until it wraps around to the list head.  Each
	 mutex is unlinked, marked with FUTEX_OWNER_DIED, and one waiter
	 is woken up.  */
      do
	{
	  struct __pthread_mutex_s *this = (struct __pthread_mutex_s *)
	    ((char *) robust - offsetof (struct __pthread_mutex_s,
					 __list.__next));
	  robust = *((void **) robust);

# if __PTHREAD_MUTEX_HAVE_PREV
	  this->__list.__prev = NULL;
# endif
	  this->__list.__next = NULL;

	  atomic_or (&this->__lock, FUTEX_OWNER_DIED);
	  futex_wake ((unsigned int *) &this->__lock, 1,
		      /* XYZ */ FUTEX_SHARED);
	}
      while (robust != (void *) &pd->robust_head);
    }
#endif

  /* For stacks not supplied by the user, pass the stack block to
     advise_stack_range (defined outside this block).  */
  if (!pd->user_stack)
    advise_stack_range (pd->stackblock, pd->stackblock_size, (uintptr_t) pd,
			pd->guardsize);

  if (__glibc_unlikely (pd->cancelhandling & SETXID_BITMASK))
    {
      /* Some other thread might call any of the setXid functions and expect
	 us to reply.  In this case wait until we did that.  */
      do
	/* XXX This differs from the typical futex_wait_simple pattern in that
	   the futex_wait condition (setxid_futex) is different from the
	   condition used in the surrounding loop (cancelhandling).  We need
	   to check and document why this is correct.  */
	futex_wait_simple (&pd->setxid_futex, 0, FUTEX_PRIVATE);
      while (pd->cancelhandling & SETXID_BITMASK);

      /* Reset the value so that the stack can be reused.  */
      pd->setxid_futex = 0;
    }

  /* If the thread is detached free the TCB.  */
  if (IS_DETACHED (pd))
    /* Free the TCB.  */
    __nptl_free_tcb (pd);

out:
  /* We cannot call '_exit' here.  '_exit' will terminate the process.

     The 'exit' implementation in the kernel will signal when the
     process is really dead since 'clone' got passed the CLONE_CHILD_CLEARTID
     flag.  The 'tid' field in the TCB will be set to zero.

     rseq TLS is still registered at this point.  Rely on implicit
     unregistration performed by the kernel on thread teardown.  This is not a
     problem because the rseq TLS lives on the stack, and the stack outlives
     the thread.  If TCB allocation is ever changed, additional steps may be
     required, such as performing explicit rseq unregistration before
     reclaiming the rseq TLS area memory.  It is NOT sufficient to block
     signals because the kernel may write to the rseq area even without
     signals.

     The exit code is zero since in case all threads exit by calling
     'pthread_exit' the exit status must be 0 (zero).

     The system call is issued in a loop so that control can never fall
     off the end of this _Noreturn function.  */
  while (1)
    INTERNAL_SYSCALL_CALL (exit, 0);

  /* NOTREACHED */
}
598 
599 
600 /* Return true iff obliged to report TD_CREATE events.  */
601 static bool
report_thread_creation(struct pthread * pd)602 report_thread_creation (struct pthread *pd)
603 {
604   if (__glibc_unlikely (THREAD_GETMEM (THREAD_SELF, report_events)))
605     {
606       /* The parent thread is supposed to report events.
607 	 Check whether the TD_CREATE event is needed, too.  */
608       const size_t idx = __td_eventword (TD_CREATE);
609       const uint32_t mask = __td_eventmask (TD_CREATE);
610 
611       return ((mask & (__nptl_threads_events.event_bits[idx]
612 		       | pd->eventbuf.eventmask.event_bits[idx])) != 0);
613     }
614   return false;
615 }
616 
617 
618 int
__pthread_create_2_1(pthread_t * newthread,const pthread_attr_t * attr,void * (* start_routine)(void *),void * arg)619 __pthread_create_2_1 (pthread_t *newthread, const pthread_attr_t *attr,
620 		      void *(*start_routine) (void *), void *arg)
621 {
622   void *stackaddr = NULL;
623   size_t stacksize = 0;
624 
625   /* Avoid a data race in the multi-threaded case, and call the
626      deferred initialization only once.  */
627   if (__libc_single_threaded_internal)
628     {
629       late_init ();
630       __libc_single_threaded_internal = 0;
631       /* __libc_single_threaded can be accessed through copy relocations, so
632 	 it requires to update the external copy.  */
633       __libc_single_threaded = 0;
634     }
635 
636   const struct pthread_attr *iattr = (struct pthread_attr *) attr;
637   union pthread_attr_transparent default_attr;
638   bool destroy_default_attr = false;
639   bool c11 = (attr == ATTR_C11_THREAD);
640   if (iattr == NULL || c11)
641     {
642       int ret = __pthread_getattr_default_np (&default_attr.external);
643       if (ret != 0)
644 	return ret;
645       destroy_default_attr = true;
646       iattr = &default_attr.internal;
647     }
648 
649   struct pthread *pd = NULL;
650   int err = allocate_stack (iattr, &pd, &stackaddr, &stacksize);
651   int retval = 0;
652 
653   if (__glibc_unlikely (err != 0))
654     /* Something went wrong.  Maybe a parameter of the attributes is
655        invalid or we could not allocate memory.  Note we have to
656        translate error codes.  */
657     {
658       retval = err == ENOMEM ? EAGAIN : err;
659       goto out;
660     }
661 
662 
663   /* Initialize the TCB.  All initializations with zero should be
664      performed in 'get_cached_stack'.  This way we avoid doing this if
665      the stack freshly allocated with 'mmap'.  */
666 
667 #if TLS_TCB_AT_TP
668   /* Reference to the TCB itself.  */
669   pd->header.self = pd;
670 
671   /* Self-reference for TLS.  */
672   pd->header.tcb = pd;
673 #endif
674 
675   /* Store the address of the start routine and the parameter.  Since
676      we do not start the function directly the stillborn thread will
677      get the information from its thread descriptor.  */
678   pd->start_routine = start_routine;
679   pd->arg = arg;
680   pd->c11 = c11;
681 
682   /* Copy the thread attribute flags.  */
683   struct pthread *self = THREAD_SELF;
684   pd->flags = ((iattr->flags & ~(ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET))
685 	       | (self->flags & (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET)));
686 
687   /* Inherit rseq registration state.  Without seccomp filters, rseq
688      registration will either always fail or always succeed.  */
689   if ((int) THREAD_GETMEM_VOLATILE (self, rseq_area.cpu_id) >= 0)
690     pd->flags |= ATTR_FLAG_DO_RSEQ;
691 
692   /* Initialize the field for the ID of the thread which is waiting
693      for us.  This is a self-reference in case the thread is created
694      detached.  */
695   pd->joinid = iattr->flags & ATTR_FLAG_DETACHSTATE ? pd : NULL;
696 
697   /* The debug events are inherited from the parent.  */
698   pd->eventbuf = self->eventbuf;
699 
700 
701   /* Copy the parent's scheduling parameters.  The flags will say what
702      is valid and what is not.  */
703   pd->schedpolicy = self->schedpolicy;
704   pd->schedparam = self->schedparam;
705 
706   /* Copy the stack guard canary.  */
707 #ifdef THREAD_COPY_STACK_GUARD
708   THREAD_COPY_STACK_GUARD (pd);
709 #endif
710 
711   /* Copy the pointer guard value.  */
712 #ifdef THREAD_COPY_POINTER_GUARD
713   THREAD_COPY_POINTER_GUARD (pd);
714 #endif
715 
716   /* Setup tcbhead.  */
717   tls_setup_tcbhead (pd);
718 
719   /* Verify the sysinfo bits were copied in allocate_stack if needed.  */
720 #ifdef NEED_DL_SYSINFO
721   CHECK_THREAD_SYSINFO (pd);
722 #endif
723 
724   /* Determine scheduling parameters for the thread.  */
725   if (__builtin_expect ((iattr->flags & ATTR_FLAG_NOTINHERITSCHED) != 0, 0)
726       && (iattr->flags & (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET)) != 0)
727     {
728       /* Use the scheduling parameters the user provided.  */
729       if (iattr->flags & ATTR_FLAG_POLICY_SET)
730         {
731           pd->schedpolicy = iattr->schedpolicy;
732           pd->flags |= ATTR_FLAG_POLICY_SET;
733         }
734       if (iattr->flags & ATTR_FLAG_SCHED_SET)
735         {
736           /* The values were validated in pthread_attr_setschedparam.  */
737           pd->schedparam = iattr->schedparam;
738           pd->flags |= ATTR_FLAG_SCHED_SET;
739         }
740 
741       if ((pd->flags & (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET))
742           != (ATTR_FLAG_SCHED_SET | ATTR_FLAG_POLICY_SET))
743         collect_default_sched (pd);
744     }
745 
746   if (__glibc_unlikely (__nptl_nthreads == 1))
747     _IO_enable_locks ();
748 
749   /* Pass the descriptor to the caller.  */
750   *newthread = (pthread_t) pd;
751 
752   LIBC_PROBE (pthread_create, 4, newthread, attr, start_routine, arg);
753 
754   /* One more thread.  We cannot have the thread do this itself, since it
755      might exist but not have been scheduled yet by the time we've returned
756      and need to check the value to behave correctly.  We must do it before
757      creating the thread, in case it does get scheduled first and then
758      might mistakenly think it was the only thread.  In the failure case,
759      we momentarily store a false value; this doesn't matter because there
760      is no kosher thing a signal handler interrupting us right here can do
761      that cares whether the thread count is correct.  */
762   atomic_increment (&__nptl_nthreads);
763 
764   /* Our local value of stopped_start and thread_ran can be accessed at
765      any time. The PD->stopped_start may only be accessed if we have
766      ownership of PD (see CONCURRENCY NOTES above).  */
767   bool stopped_start = false; bool thread_ran = false;
768 
769   /* Block all signals, so that the new thread starts out with
770      signals disabled.  This avoids race conditions in the thread
771      startup.  */
772   internal_sigset_t original_sigmask;
773   internal_signal_block_all (&original_sigmask);
774 
775   if (iattr->extension != NULL && iattr->extension->sigmask_set)
776     /* Use the signal mask in the attribute.  The internal signals
777        have already been filtered by the public
778        pthread_attr_setsigmask_np interface.  */
779     internal_sigset_from_sigset (&pd->sigmask, &iattr->extension->sigmask);
780   else
781     {
782       /* Conceptually, the new thread needs to inherit the signal mask
783 	 of this thread.  Therefore, it needs to restore the saved
784 	 signal mask of this thread, so save it in the startup
785 	 information.  */
786       pd->sigmask = original_sigmask;
787       /* Reset the cancellation signal mask in case this thread is
788 	 running cancellation.  */
789       internal_sigdelset (&pd->sigmask, SIGCANCEL);
790     }
791 
792   /* Start the thread.  */
793   if (__glibc_unlikely (report_thread_creation (pd)))
794     {
795       stopped_start = true;
796 
797       /* We always create the thread stopped at startup so we can
798 	 notify the debugger.  */
799       retval = create_thread (pd, iattr, &stopped_start, stackaddr,
800 			      stacksize, &thread_ran);
801       if (retval == 0)
802 	{
803 	  /* We retain ownership of PD until (a) (see CONCURRENCY NOTES
804 	     above).  */
805 
806 	  /* Assert stopped_start is true in both our local copy and the
807 	     PD copy.  */
808 	  assert (stopped_start);
809 	  assert (pd->stopped_start);
810 
811 	  /* Now fill in the information about the new thread in
812 	     the newly created thread's data structure.  We cannot let
813 	     the new thread do this since we don't know whether it was
814 	     already scheduled when we send the event.  */
815 	  pd->eventbuf.eventnum = TD_CREATE;
816 	  pd->eventbuf.eventdata = pd;
817 
818 	  /* Enqueue the descriptor.  */
819 	  do
820 	    pd->nextevent = __nptl_last_event;
821 	  while (atomic_compare_and_exchange_bool_acq (&__nptl_last_event,
822 						       pd, pd->nextevent)
823 		 != 0);
824 
825 	  /* Now call the function which signals the event.  See
826 	     CONCURRENCY NOTES for the nptl_db interface comments.  */
827 	  __nptl_create_event ();
828 	}
829     }
830   else
831     retval = create_thread (pd, iattr, &stopped_start, stackaddr,
832 			    stacksize, &thread_ran);
833 
834   /* Return to the previous signal mask, after creating the new
835      thread.  */
836   internal_signal_restore_set (&original_sigmask);
837 
838   if (__glibc_unlikely (retval != 0))
839     {
840       if (thread_ran)
	/* State (c) and we do not have PD ownership (see CONCURRENCY NOTES
842 	   above).  We can assert that STOPPED_START must have been true
843 	   because thread creation didn't fail, but thread attribute setting
844 	   did.  */
845         {
846 	  assert (stopped_start);
847 	  /* Signal the created thread to release PD ownership and early
848 	     exit so it could be joined.  */
849 	  pd->setup_failed = 1;
850 	  lll_unlock (pd->lock, LLL_PRIVATE);
851 
852 	  /* Similar to pthread_join, but since thread creation has failed at
853 	     startup there is no need to handle all the steps.  */
854 	  pid_t tid;
855 	  while ((tid = atomic_load_acquire (&pd->tid)) != 0)
856 	    __futex_abstimed_wait_cancelable64 ((unsigned int *) &pd->tid,
857 						tid, 0, NULL, LLL_SHARED);
858         }
859 
860       /* State (c) or (d) and we have ownership of PD (see CONCURRENCY
861 	 NOTES above).  */
862 
863       /* Oops, we lied for a second.  */
864       atomic_decrement (&__nptl_nthreads);
865 
866       /* Free the resources.  */
867       __nptl_deallocate_stack (pd);
868 
869       /* We have to translate error codes.  */
870       if (retval == ENOMEM)
871 	retval = EAGAIN;
872     }
873   else
874     {
875       /* We don't know if we have PD ownership.  Once we check the local
876          stopped_start we'll know if we're in state (a) or (b) (see
877 	 CONCURRENCY NOTES above).  */
878       if (stopped_start)
879 	/* State (a), we own PD. The thread blocked on this lock either
880 	   because we're doing TD_CREATE event reporting, or for some
881 	   other reason that create_thread chose.  Now let it run
882 	   free.  */
883 	lll_unlock (pd->lock, LLL_PRIVATE);
884 
885       /* We now have for sure more than one thread.  The main thread might
886 	 not yet have the flag set.  No need to set the global variable
887 	 again if this is what we use.  */
888       THREAD_SETMEM (THREAD_SELF, header.multiple_threads, 1);
889     }
890 
891  out:
892   if (destroy_default_attr)
893     __pthread_attr_destroy (&default_attr.external);
894 
895   return retval;
896 }
/* Symbol bookkeeping for __pthread_create_2_1.  The macros used here are
   defined outside this file (presumably via <shlib-compat.h> and the
   libc symbol headers); the notes below go by their names and arguments
   only -- confirm against the macro definitions.  */

/* Publish __pthread_create_2_1 as pthread_create in libc under the
   GLIBC_2_34 symbol version.  */
versioned_symbol (libc, __pthread_create_2_1, pthread_create, GLIBC_2_34);
/* Associate the internal name __pthread_create with the same function.  */
libc_hidden_ver (__pthread_create_2_1, __pthread_create)
#ifndef SHARED
/* When SHARED is not defined, __pthread_create is also set up through
   strong_alias.  */
strong_alias (__pthread_create_2_1, __pthread_create)
#endif

#if OTHER_SHLIB_COMPAT (libpthread, GLIBC_2_1, GLIBC_2_34)
/* Only when the libpthread GLIBC_2_1..GLIBC_2_34 compatibility range is
   enabled: keep the libpthread pthread_create symbol at version
   GLIBC_2_1 pointing at this implementation.  */
compat_symbol (libpthread, __pthread_create_2_1, pthread_create, GLIBC_2_1);
#endif
906 
907 #if OTHER_SHLIB_COMPAT (libpthread, GLIBC_2_0, GLIBC_2_1)
908 int
__pthread_create_2_0(pthread_t * newthread,const pthread_attr_t * attr,void * (* start_routine)(void *),void * arg)909 __pthread_create_2_0 (pthread_t *newthread, const pthread_attr_t *attr,
910 		      void *(*start_routine) (void *), void *arg)
911 {
912   /* The ATTR attribute is not really of type `pthread_attr_t *'.  It has
913      the old size and access to the new members might crash the program.
914      We convert the struct now.  */
915   struct pthread_attr new_attr;
916 
917   if (attr != NULL)
918     {
919       struct pthread_attr *iattr = (struct pthread_attr *) attr;
920       size_t ps = __getpagesize ();
921 
922       /* Copy values from the user-provided attributes.  */
923       new_attr.schedparam = iattr->schedparam;
924       new_attr.schedpolicy = iattr->schedpolicy;
925       new_attr.flags = iattr->flags;
926 
927       /* Fill in default values for the fields not present in the old
928 	 implementation.  */
929       new_attr.guardsize = ps;
930       new_attr.stackaddr = NULL;
931       new_attr.stacksize = 0;
932       new_attr.extension = NULL;
933 
934       /* We will pass this value on to the real implementation.  */
935       attr = (pthread_attr_t *) &new_attr;
936     }
937 
938   return __pthread_create_2_1 (newthread, attr, start_routine, arg);
939 }
940 compat_symbol (libpthread, __pthread_create_2_0, pthread_create,
941 	       GLIBC_2_0);
942 #endif
943 
/* Information for libthread_db.  The source file named below is included
   textually, so its contents are compiled as part of this translation
   unit.  */

#include "../nptl_db/db_info.c"

/* If pthread_create is present, libgcc_eh.a and libsupc++.a expect some
   other POSIX thread functions to be present as well.
   PTHREAD_STATIC_FN_REQUIRE is defined outside this file; judging by its
   name it makes sure the named function is available in static links --
   TODO confirm against its definition.  */
/* Mutex operations.  */
PTHREAD_STATIC_FN_REQUIRE (__pthread_mutex_lock)
PTHREAD_STATIC_FN_REQUIRE (__pthread_mutex_trylock)
PTHREAD_STATIC_FN_REQUIRE (__pthread_mutex_unlock)

/* One-time initialization and cancellation.  */
PTHREAD_STATIC_FN_REQUIRE (__pthread_once)
PTHREAD_STATIC_FN_REQUIRE (__pthread_cancel)

/* Thread-specific data keys and values.  */
PTHREAD_STATIC_FN_REQUIRE (__pthread_key_create)
PTHREAD_STATIC_FN_REQUIRE (__pthread_key_delete)
PTHREAD_STATIC_FN_REQUIRE (__pthread_setspecific)
PTHREAD_STATIC_FN_REQUIRE (__pthread_getspecific)
961