/* SPDX-License-Identifier: LGPL-2.1-or-later */
2 
3 #include <sys/ioctl.h>
4 #include <sys/reboot.h>
5 #include <sys/wait.h>
6 #include <sys/prctl.h>
7 #include <unistd.h>
8 
9 #include "def.h"
10 #include "exit-status.h"
11 #include "fd-util.h"
12 #include "log.h"
13 #include "nspawn-stub-pid1.h"
14 #include "process-util.h"
15 #include "signal-util.h"
16 #include "time-util.h"
17 
/* Repoints the kernel's record of this process' environment area at a caller-supplied buffer, using
 * prctl(PR_SET_MM).
 *
 * new_environment: first byte of the buffer that becomes the environment area.
 * length:          size of that buffer in bytes; the end address passed on is new_environment + length.
 *
 * Returns 0 on success, or a negative errno-style value if either prctl() call fails. If the second call
 * fails, the start address has already been changed and is not rolled back.
 *
 * Only addresses are handed over here, nothing is copied by this function, so the buffer presumably has
 * to stay valid for as long as the environment area may be inspected. */
static int reset_environ(const char *new_environment, size_t length) {
        unsigned long start, end;

        start = (unsigned long) new_environment;
        end = start + length;

        /* NOTE(review): the start is moved before the end. The kernel may validate start <= end on each
         * individual call, so keep this order unless that has been checked against prctl(2). */
        if (prctl(PR_SET_MM, PR_SET_MM_ENV_START, start, 0, 0) < 0)
                return -errno;

        if (prctl(PR_SET_MM, PR_SET_MM_ENV_END, end, 0, 0) < 0)
                return -errno;

        return 0;
}
32 
/* Forks off the payload process and turns the calling process into a minimal stub init.
 *
 * uuid: ID that is formatted into the "container_uuid=" entry of the stub's replacement environment.
 *
 * Return value / control flow:
 *   - If fork() fails, the error is logged and its return value (from log_error_errno()) is handed back to
 *     the caller.
 *   - In the forked child (the payload), the signal mask that was active on entry is restored and a new
 *     session is started; 0 is returned on success, or the logged error if setsid() fails.
 *   - The parent never returns. It reaps children and waits for shutdown signals in a loop and finally
 *     leaves through _exit(): with EXIT_FAILURE if an internal error occurred, with the payload's exit
 *     status if the payload exited normally, and with EXIT_EXCEPTION otherwise (payload killed by a
 *     signal, dumped core, or the shutdown timeout elapsed). */
int stub_pid1(sd_id128_t uuid) {
        enum {
                STATE_RUNNING,
                STATE_REBOOT,
                STATE_POWEROFF,
        } state = STATE_RUNNING;

        sigset_t fullmask, oldmask, waitmask;
        usec_t quit_usec = USEC_INFINITY;
        pid_t pid;
        int r;

        /* The new environment we set up, on the stack. Two NUL-separated entries; the run of 'X' is a
         * placeholder that is overwritten with the formatted UUID further down. */
        char new_environment[] =
                "container=systemd-nspawn\0"
                "container_uuid=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX";

        /* Implements a stub PID 1, that reaps all processes and processes a couple of standard signals. This is useful
         * for allowing arbitrary processes run in a container, and still have all zombies reaped. */

        /* Block everything before forking, so that no signal is lost between fork() and the wait loop
         * below; the previous mask is kept so that the child can restore it. */
        assert_se(sigfillset(&fullmask) >= 0);
        assert_se(sigprocmask(SIG_BLOCK, &fullmask, &oldmask) >= 0);

        pid = fork();
        if (pid < 0)
                return log_error_errno(errno, "Failed to fork child pid: %m");

        if (pid == 0) {
                /* Return in the child */
                assert_se(sigprocmask(SIG_SETMASK, &oldmask, NULL) >= 0);

                if (setsid() < 0)
                        return log_error_errno(errno, "Failed to become session leader in payload process: %m");

                return 0;
        }

        /* From here on we are the stub init and never return to the caller. */

        reset_all_signal_handlers();

        /* Logging is shut down around close_all_fds() and reopened afterwards. */
        log_close();
        (void) close_all_fds(NULL, 0);
        log_open();

        if (ioctl(STDIN_FILENO, TIOCNOTTY) < 0) {
                if (errno != ENOTTY)
                        log_warning_errno(errno, "Unexpected error from TIOCNOTTY ioctl in init stub process, ignoring: %m");
        } else
                log_warning("Expected TIOCNOTTY to fail, but it succeeded in init stub process, ignoring.");

        /* Flush out /proc/self/environ, so that we don't leak the environment from the host into the container. Also,
         * set $container= and $container_uuid= so that clients in the container that query it from /proc/1/environ
         * find them set. */

        /* The UUID string is written over the tail of the buffer, i.e. over the placeholder plus the
         * terminating NUL (assumes SD_ID128_STRING_MAX covers exactly the 32 placeholder characters and
         * the trailing NUL — TODO confirm against the sd-id128 header). */
        sd_id128_to_string(uuid, new_environment + sizeof(new_environment) - SD_ID128_STRING_MAX);

        /* Best effort, like the other (void) calls in this function: the stub keeps running even if the
         * environment area could not be replaced. NOTE(review): PR_SET_MM is a privileged operation per
         * prctl(2), so a failure here may be expected in some setups. */
        (void) reset_environ(new_environment, sizeof(new_environment));

        (void) rename_process("(sd-stubinit)");

        assert_se(sigemptyset(&waitmask) >= 0);
        assert_se(sigset_add_many(&waitmask,
                                  SIGCHLD,          /* posix: process died */
                                  SIGINT,           /* sysv: ctrl-alt-del */
                                  SIGRTMIN+3,       /* systemd: halt */
                                  SIGRTMIN+4,       /* systemd: poweroff */
                                  SIGRTMIN+5,       /* systemd: reboot */
                                  SIGRTMIN+6,       /* systemd: kexec */
                                  SIGRTMIN+13,      /* systemd: halt */
                                  SIGRTMIN+14,      /* systemd: poweroff */
                                  SIGRTMIN+15,      /* systemd: reboot */
                                  SIGRTMIN+16,      /* systemd: kexec */
                                  -1) >= 0);

        /* Note that we ignore SIGTERM (sysv's reexec), SIGHUP (reload), and all other signals here, since we don't
         * support reexec/reloading in this stub process. */

        for (;;) {
                siginfo_t si;
                usec_t current_usec;

                /* With WNOHANG, si_pid is pre-set to 0 so that "nothing to reap" can be told apart from a
                 * reaped child after the call. */
                si.si_pid = 0;
                r = waitid(P_ALL, 0, &si, WEXITED|WNOHANG);
                if (r < 0) {
                        r = log_error_errno(errno, "Failed to reap children: %m");
                        goto finish;
                }

                current_usec = now(CLOCK_MONOTONIC);

                if (si.si_pid == pid || current_usec >= quit_usec) {

                        /* The child we started ourselves died or we reached a timeout. */

                        if (state == STATE_REBOOT) { /* dispatch a queued reboot */
                                (void) reboot(RB_AUTOBOOT);
                                r = log_error_errno(errno, "Failed to reboot: %m");
                                goto finish;

                        } else if (state == STATE_POWEROFF)
                                (void) reboot(RB_POWER_OFF); /* if this fails, fall back to normal exit. */

                        /* si_code is only looked at if a child was actually reaped in this iteration. */
                        if (si.si_pid == pid && si.si_code == CLD_EXITED)
                                r = si.si_status; /* pass on exit code */
                        else
                                r = EXIT_EXCEPTION; /* signal, coredump, timeout, … */

                        goto finish;
                }
                if (si.si_pid != 0)
                        /* We reaped something. Retry until there's nothing more to reap. */
                        continue;

                /* Nothing left to reap: sleep until one of the signals in waitmask arrives, or, once a
                 * shutdown was requested, until the remaining time to the deadline has passed. The check
                 * above guarantees current_usec < quit_usec here, so the subtraction cannot wrap. */
                if (quit_usec == USEC_INFINITY)
                        r = sigwaitinfo(&waitmask, &si);
                else
                        r = sigtimedwait(&waitmask, &si, TIMESPEC_STORE(quit_usec - current_usec));
                if (r < 0) {
                        if (errno == EINTR) /* strace -p attach can result in EINTR, let's handle this nicely. */
                                continue;
                        if (errno == EAGAIN) /* timeout reached */
                                continue;

                        r = log_error_errno(errno, "Failed to wait for signal: %m");
                        goto finish;
                }

                if (si.si_signo == SIGCHLD)
                        continue; /* Let's reap this */

                /* Only the first shutdown request counts; later ones are dropped while it is pending. */
                if (state != STATE_RUNNING)
                        continue;

                /* Would love to use a switch() statement here, but SIGRTMIN is actually a function call, not a
                 * constant… */

                if (si.si_signo == SIGRTMIN+3 ||
                    si.si_signo == SIGRTMIN+4 ||
                    si.si_signo == SIGRTMIN+13 ||
                    si.si_signo == SIGRTMIN+14)

                        state = STATE_POWEROFF;

                else if (si.si_signo == SIGINT ||
                         si.si_signo == SIGRTMIN+5 ||
                         si.si_signo == SIGRTMIN+6 ||
                         si.si_signo == SIGRTMIN+15 ||
                         si.si_signo == SIGRTMIN+16)

                        state = STATE_REBOOT;
                else
                        assert_not_reached();

                r = kill_and_sigcont(pid, SIGTERM);

                /* Let's send a SIGHUP after the SIGTERM, as shells tend to ignore SIGTERM but do react to SIGHUP. We
                 * do it strictly in this order, so that the SIGTERM is dispatched first, and SIGHUP second for those
                 * processes which handle both. That's because services tend to bind configuration reload or something
                 * else to SIGHUP. */

                if (r != -ESRCH)
                        (void) kill(pid, SIGHUP);

                /* Arm the deadline after which we stop waiting for the payload to go away. */
                quit_usec = now(CLOCK_MONOTONIC) + DEFAULT_TIMEOUT_USEC;
        }

finish:
        /* Negative values are internal errors; everything else is used as the exit status as-is. */
        _exit(r < 0 ? EXIT_FAILURE : r);
}
199