1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2 
3 #if HAVE_BLKID
4 #endif
5 #include <errno.h>
6 #include <getopt.h>
7 #include <linux/fs.h>
8 #include <linux/loop.h>
9 #if HAVE_SELINUX
10 #include <selinux/selinux.h>
11 #endif
12 #include <stdlib.h>
13 #include <sys/file.h>
14 #include <sys/ioctl.h>
15 #include <sys/personality.h>
16 #include <sys/prctl.h>
17 #include <sys/types.h>
18 #include <sys/wait.h>
19 #include <termios.h>
20 #include <unistd.h>
21 
22 #include "sd-bus.h"
23 #include "sd-daemon.h"
24 #include "sd-id128.h"
25 
26 #include "alloc-util.h"
27 #include "barrier.h"
28 #include "base-filesystem.h"
29 #include "blkid-util.h"
30 #include "btrfs-util.h"
31 #include "bus-error.h"
32 #include "bus-util.h"
33 #include "cap-list.h"
34 #include "capability-util.h"
35 #include "cgroup-util.h"
36 #include "chase-symlinks.h"
37 #include "copy.h"
38 #include "cpu-set-util.h"
39 #include "creds-util.h"
40 #include "dev-setup.h"
41 #include "discover-image.h"
42 #include "dissect-image.h"
43 #include "env-util.h"
44 #include "escape.h"
45 #include "fd-util.h"
46 #include "fdset.h"
47 #include "fileio.h"
48 #include "format-util.h"
49 #include "fs-util.h"
50 #include "gpt.h"
51 #include "hexdecoct.h"
52 #include "hostname-setup.h"
53 #include "hostname-util.h"
54 #include "id128-util.h"
55 #include "io-util.h"
56 #include "log.h"
57 #include "loop-util.h"
58 #include "loopback-setup.h"
59 #include "macro.h"
60 #include "main-func.h"
61 #include "missing_sched.h"
62 #include "mkdir.h"
63 #include "mount-util.h"
64 #include "mountpoint-util.h"
65 #include "namespace-util.h"
66 #include "netlink-util.h"
67 #include "nspawn-bind-user.h"
68 #include "nspawn-cgroup.h"
69 #include "nspawn-creds.h"
70 #include "nspawn-def.h"
71 #include "nspawn-expose-ports.h"
72 #include "nspawn-mount.h"
73 #include "nspawn-network.h"
74 #include "nspawn-oci.h"
75 #include "nspawn-patch-uid.h"
76 #include "nspawn-register.h"
77 #include "nspawn-seccomp.h"
78 #include "nspawn-settings.h"
79 #include "nspawn-setuid.h"
80 #include "nspawn-stub-pid1.h"
81 #include "nspawn-util.h"
82 #include "nspawn.h"
83 #include "nulstr-util.h"
84 #include "os-util.h"
85 #include "pager.h"
86 #include "parse-argument.h"
87 #include "parse-util.h"
88 #include "pretty-print.h"
89 #include "process-util.h"
90 #include "ptyfwd.h"
91 #include "random-util.h"
92 #include "raw-clone.h"
93 #include "resolve-util.h"
94 #include "rlimit-util.h"
95 #include "rm-rf.h"
96 #if HAVE_SECCOMP
97 #include "seccomp-util.h"
98 #endif
99 #include "selinux-util.h"
100 #include "signal-util.h"
101 #include "socket-util.h"
102 #include "stat-util.h"
103 #include "stdio-util.h"
104 #include "string-table.h"
105 #include "string-util.h"
106 #include "strv.h"
107 #include "sysctl-util.h"
108 #include "terminal-util.h"
109 #include "tmpfile-util.h"
110 #include "umask-util.h"
111 #include "unit-name.h"
112 #include "user-util.h"
113 #include "util.h"
114 
/* Path of the notify socket inside the container, which the container can use to talk to nspawn via the sd_notify(3) protocol */
116 #define NSPAWN_NOTIFY_SOCKET_PATH "/run/host/notify"
117 
118 #define EXIT_FORCE_RESTART 133
119 
/* Status values for a container; presumably describes how a container run ended — confirm against the
 * code that consumes it. */
typedef enum ContainerStatus {
        CONTAINER_TERMINATED = 0,
        CONTAINER_REBOOTED   = 1,
} ContainerStatus;
124 
125 static char *arg_directory = NULL;
126 static char *arg_template = NULL;
127 static char *arg_chdir = NULL;
128 static char *arg_pivot_root_new = NULL;
129 static char *arg_pivot_root_old = NULL;
130 static char *arg_user = NULL;
131 static uid_t arg_uid = UID_INVALID;
132 static gid_t arg_gid = GID_INVALID;
133 static gid_t* arg_supplementary_gids = NULL;
134 static size_t arg_n_supplementary_gids = 0;
135 static sd_id128_t arg_uuid = {};
136 static char *arg_machine = NULL;     /* The name used by the host to refer to this */
137 static char *arg_hostname = NULL;    /* The name the payload sees by default */
138 static const char *arg_selinux_context = NULL;
139 static const char *arg_selinux_apifs_context = NULL;
140 static char *arg_slice = NULL;
141 static bool arg_private_network = false;
142 static bool arg_read_only = false;
143 static StartMode arg_start_mode = START_PID1;
144 static bool arg_ephemeral = false;
145 static LinkJournal arg_link_journal = LINK_AUTO;
146 static bool arg_link_journal_try = false;
147 static uint64_t arg_caps_retain =
148         (1ULL << CAP_AUDIT_CONTROL) |
149         (1ULL << CAP_AUDIT_WRITE) |
150         (1ULL << CAP_CHOWN) |
151         (1ULL << CAP_DAC_OVERRIDE) |
152         (1ULL << CAP_DAC_READ_SEARCH) |
153         (1ULL << CAP_FOWNER) |
154         (1ULL << CAP_FSETID) |
155         (1ULL << CAP_IPC_OWNER) |
156         (1ULL << CAP_KILL) |
157         (1ULL << CAP_LEASE) |
158         (1ULL << CAP_LINUX_IMMUTABLE) |
159         (1ULL << CAP_MKNOD) |
160         (1ULL << CAP_NET_BIND_SERVICE) |
161         (1ULL << CAP_NET_BROADCAST) |
162         (1ULL << CAP_NET_RAW) |
163         (1ULL << CAP_SETFCAP) |
164         (1ULL << CAP_SETGID) |
165         (1ULL << CAP_SETPCAP) |
166         (1ULL << CAP_SETUID) |
167         (1ULL << CAP_SYS_ADMIN) |
168         (1ULL << CAP_SYS_BOOT) |
169         (1ULL << CAP_SYS_CHROOT) |
170         (1ULL << CAP_SYS_NICE) |
171         (1ULL << CAP_SYS_PTRACE) |
172         (1ULL << CAP_SYS_RESOURCE) |
173         (1ULL << CAP_SYS_TTY_CONFIG);
174 static uint64_t arg_caps_ambient = 0;
175 static CapabilityQuintet arg_full_capabilities = CAPABILITY_QUINTET_NULL;
176 static CustomMount *arg_custom_mounts = NULL;
177 static size_t arg_n_custom_mounts = 0;
178 static char **arg_setenv = NULL;
179 static bool arg_quiet = false;
180 static bool arg_register = true;
181 static bool arg_keep_unit = false;
182 static char **arg_network_interfaces = NULL;
183 static char **arg_network_macvlan = NULL;
184 static char **arg_network_ipvlan = NULL;
185 static bool arg_network_veth = false;
186 static char **arg_network_veth_extra = NULL;
187 static char *arg_network_bridge = NULL;
188 static char *arg_network_zone = NULL;
189 static char *arg_network_namespace_path = NULL;
190 static PagerFlags arg_pager_flags = 0;
191 static unsigned long arg_personality = PERSONALITY_INVALID;
192 static char *arg_image = NULL;
193 static char *arg_oci_bundle = NULL;
194 static VolatileMode arg_volatile_mode = VOLATILE_NO;
195 static ExposePort *arg_expose_ports = NULL;
196 static char **arg_property = NULL;
197 static sd_bus_message *arg_property_message = NULL;
198 static UserNamespaceMode arg_userns_mode = USER_NAMESPACE_NO;
199 static uid_t arg_uid_shift = UID_INVALID, arg_uid_range = 0x10000U;
200 static UserNamespaceOwnership arg_userns_ownership = _USER_NAMESPACE_OWNERSHIP_INVALID;
201 static int arg_kill_signal = 0;
202 static CGroupUnified arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_UNKNOWN;
203 static SettingsMask arg_settings_mask = 0;
204 static int arg_settings_trusted = -1;
205 static char **arg_parameters = NULL;
206 static const char *arg_container_service_name = "systemd-nspawn";
207 static bool arg_notify_ready = false;
208 static bool arg_use_cgns = true;
209 static unsigned long arg_clone_ns_flags = CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS;
210 static MountSettingsMask arg_mount_settings = MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_TMPFS_TMP;
211 static VeritySettings arg_verity_settings = VERITY_SETTINGS_DEFAULT;
212 static char **arg_syscall_allow_list = NULL;
213 static char **arg_syscall_deny_list = NULL;
214 #if HAVE_SECCOMP
215 static scmp_filter_ctx arg_seccomp = NULL;
216 #endif
217 static struct rlimit *arg_rlimit[_RLIMIT_MAX] = {};
218 static bool arg_no_new_privileges = false;
219 static int arg_oom_score_adjust = 0;
220 static bool arg_oom_score_adjust_set = false;
221 static CPUSet arg_cpu_set = {};
222 static ResolvConfMode arg_resolv_conf = RESOLV_CONF_AUTO;
223 static TimezoneMode arg_timezone = TIMEZONE_AUTO;
224 static unsigned arg_console_width = UINT_MAX, arg_console_height = UINT_MAX;
225 static DeviceNode* arg_extra_nodes = NULL;
226 static size_t arg_n_extra_nodes = 0;
227 static char **arg_sysctl = NULL;
228 static ConsoleMode arg_console_mode = _CONSOLE_MODE_INVALID;
229 static Credential *arg_credentials = NULL;
230 static size_t arg_n_credentials = 0;
231 static char **arg_bind_user = NULL;
232 static bool arg_suppress_sync = false;
233 static char *arg_settings_filename = NULL;
234 
235 STATIC_DESTRUCTOR_REGISTER(arg_directory, freep);
236 STATIC_DESTRUCTOR_REGISTER(arg_template, freep);
237 STATIC_DESTRUCTOR_REGISTER(arg_chdir, freep);
238 STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_new, freep);
239 STATIC_DESTRUCTOR_REGISTER(arg_pivot_root_old, freep);
240 STATIC_DESTRUCTOR_REGISTER(arg_user, freep);
241 STATIC_DESTRUCTOR_REGISTER(arg_supplementary_gids, freep);
242 STATIC_DESTRUCTOR_REGISTER(arg_machine, freep);
243 STATIC_DESTRUCTOR_REGISTER(arg_hostname, freep);
244 STATIC_DESTRUCTOR_REGISTER(arg_slice, freep);
245 STATIC_DESTRUCTOR_REGISTER(arg_setenv, strv_freep);
246 STATIC_DESTRUCTOR_REGISTER(arg_network_interfaces, strv_freep);
247 STATIC_DESTRUCTOR_REGISTER(arg_network_macvlan, strv_freep);
248 STATIC_DESTRUCTOR_REGISTER(arg_network_ipvlan, strv_freep);
249 STATIC_DESTRUCTOR_REGISTER(arg_network_veth_extra, strv_freep);
250 STATIC_DESTRUCTOR_REGISTER(arg_network_bridge, freep);
251 STATIC_DESTRUCTOR_REGISTER(arg_network_zone, freep);
252 STATIC_DESTRUCTOR_REGISTER(arg_network_namespace_path, freep);
253 STATIC_DESTRUCTOR_REGISTER(arg_image, freep);
254 STATIC_DESTRUCTOR_REGISTER(arg_oci_bundle, freep);
255 STATIC_DESTRUCTOR_REGISTER(arg_property, strv_freep);
256 STATIC_DESTRUCTOR_REGISTER(arg_property_message, sd_bus_message_unrefp);
257 STATIC_DESTRUCTOR_REGISTER(arg_parameters, strv_freep);
258 STATIC_DESTRUCTOR_REGISTER(arg_verity_settings, verity_settings_done);
259 STATIC_DESTRUCTOR_REGISTER(arg_syscall_allow_list, strv_freep);
260 STATIC_DESTRUCTOR_REGISTER(arg_syscall_deny_list, strv_freep);
261 #if HAVE_SECCOMP
262 STATIC_DESTRUCTOR_REGISTER(arg_seccomp, seccomp_releasep);
263 #endif
264 STATIC_DESTRUCTOR_REGISTER(arg_cpu_set, cpu_set_reset);
265 STATIC_DESTRUCTOR_REGISTER(arg_sysctl, strv_freep);
266 STATIC_DESTRUCTOR_REGISTER(arg_bind_user, strv_freep);
267 STATIC_DESTRUCTOR_REGISTER(arg_settings_filename, freep);
268 
handle_arg_console(const char * arg)269 static int handle_arg_console(const char *arg) {
270         if (streq(arg, "help")) {
271                 puts("autopipe\n"
272                      "interactive\n"
273                      "passive\n"
274                      "pipe\n"
275                      "read-only");
276                 return 0;
277         }
278 
279         if (streq(arg, "interactive"))
280                 arg_console_mode = CONSOLE_INTERACTIVE;
281         else if (streq(arg, "read-only"))
282                 arg_console_mode = CONSOLE_READ_ONLY;
283         else if (streq(arg, "passive"))
284                 arg_console_mode = CONSOLE_PASSIVE;
285         else if (streq(arg, "pipe")) {
286                 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
287                         log_full(arg_quiet ? LOG_DEBUG : LOG_NOTICE,
288                                  "Console mode 'pipe' selected, but standard input/output are connected to an interactive TTY. "
289                                  "Most likely you want to use 'interactive' console mode for proper interactivity and shell job control. "
290                                  "Proceeding anyway.");
291 
292                 arg_console_mode = CONSOLE_PIPE;
293         } else if (streq(arg, "autopipe")) {
294                 if (isatty(STDIN_FILENO) > 0 && isatty(STDOUT_FILENO) > 0)
295                         arg_console_mode = CONSOLE_INTERACTIVE;
296                 else
297                         arg_console_mode = CONSOLE_PIPE;
298         } else
299                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Unknown console mode: %s", optarg);
300 
301         arg_settings_mask |= SETTING_CONSOLE_MODE;
302         return 1;
303 }
304 
help(void)305 static int help(void) {
306         _cleanup_free_ char *link = NULL;
307         int r;
308 
309         pager_open(arg_pager_flags);
310 
311         r = terminal_urlify_man("systemd-nspawn", "1", &link);
312         if (r < 0)
313                 return log_oom();
314 
315         printf("%1$s [OPTIONS...] [PATH] [ARGUMENTS...]\n\n"
316                "%5$sSpawn a command or OS in a light-weight container.%6$s\n\n"
317                "  -h --help                 Show this help\n"
318                "     --version              Print version string\n"
319                "  -q --quiet                Do not show status information\n"
320                "     --no-pager             Do not pipe output into a pager\n"
321                "     --settings=BOOLEAN     Load additional settings from .nspawn file\n\n"
322                "%3$sImage:%4$s\n"
323                "  -D --directory=PATH       Root directory for the container\n"
324                "     --template=PATH        Initialize root directory from template directory,\n"
325                "                            if missing\n"
326                "  -x --ephemeral            Run container with snapshot of root directory, and\n"
327                "                            remove it after exit\n"
328                "  -i --image=PATH           Root file system disk image (or device node) for\n"
329                "                            the container\n"
330                "     --oci-bundle=PATH      OCI bundle directory\n"
331                "     --read-only            Mount the root directory read-only\n"
332                "     --volatile[=MODE]      Run the system in volatile mode\n"
333                "     --root-hash=HASH       Specify verity root hash for root disk image\n"
334                "     --root-hash-sig=SIG    Specify pkcs7 signature of root hash for verity\n"
335                "                            as a DER encoded PKCS7, either as a path to a file\n"
336                "                            or as an ASCII base64 encoded string prefixed by\n"
337                "                            'base64:'\n"
338                "     --verity-data=PATH     Specify hash device for verity\n"
339                "     --pivot-root=PATH[:PATH]\n"
340                "                            Pivot root to given directory in the container\n\n"
341                "%3$sExecution:%4$s\n"
342                "  -a --as-pid2              Maintain a stub init as PID1, invoke binary as PID2\n"
343                "  -b --boot                 Boot up full system (i.e. invoke init)\n"
344                "     --chdir=PATH           Set working directory in the container\n"
345                "  -E --setenv=NAME[=VALUE]  Pass an environment variable to PID 1\n"
346                "  -u --user=USER            Run the command under specified user or UID\n"
347                "     --kill-signal=SIGNAL   Select signal to use for shutting down PID 1\n"
348                "     --notify-ready=BOOLEAN Receive notifications from the child init process\n"
349                "     --suppress-sync=BOOLEAN\n"
350                "                            Suppress any form of disk data synchronization\n\n"
351                "%3$sSystem Identity:%4$s\n"
352                "  -M --machine=NAME         Set the machine name for the container\n"
353                "     --hostname=NAME        Override the hostname for the container\n"
354                "     --uuid=UUID            Set a specific machine UUID for the container\n\n"
355                "%3$sProperties:%4$s\n"
356                "  -S --slice=SLICE          Place the container in the specified slice\n"
357                "     --property=NAME=VALUE  Set scope unit property\n"
358                "     --register=BOOLEAN     Register container as machine\n"
359                "     --keep-unit            Do not register a scope for the machine, reuse\n"
360                "                            the service unit nspawn is running in\n\n"
361                "%3$sUser Namespacing:%4$s\n"
362                "  -U --private-users=pick   Run within user namespace, autoselect UID/GID range\n"
363                "     --private-users[=UIDBASE[:NUIDS]]\n"
364                "                            Similar, but with user configured UID/GID range\n"
365                "     --private-users-ownership=MODE\n"
366                "                            Adjust ('chown') or map ('map') OS tree ownership\n"
367                "                            to private UID/GID range\n\n"
368                "%3$sNetworking:%4$s\n"
369                "     --private-network      Disable network in container\n"
370                "     --network-interface=INTERFACE\n"
371                "                            Assign an existing network interface to the\n"
372                "                            container\n"
373                "     --network-macvlan=INTERFACE\n"
374                "                            Create a macvlan network interface based on an\n"
375                "                            existing network interface to the container\n"
376                "     --network-ipvlan=INTERFACE\n"
377                "                            Create an ipvlan network interface based on an\n"
378                "                            existing network interface to the container\n"
379                "  -n --network-veth         Add a virtual Ethernet connection between host\n"
380                "                            and container\n"
381                "     --network-veth-extra=HOSTIF[:CONTAINERIF]\n"
382                "                            Add an additional virtual Ethernet link between\n"
383                "                            host and container\n"
384                "     --network-bridge=INTERFACE\n"
385                "                            Add a virtual Ethernet connection to the container\n"
386                "                            and attach it to an existing bridge on the host\n"
387                "     --network-zone=NAME    Similar, but attach the new interface to an\n"
388                "                            an automatically managed bridge interface\n"
389                "     --network-namespace-path=PATH\n"
390                "                            Set network namespace to the one represented by\n"
391                "                            the specified kernel namespace file node\n"
392                "  -p --port=[PROTOCOL:]HOSTPORT[:CONTAINERPORT]\n"
393                "                            Expose a container IP port on the host\n\n"
394                "%3$sSecurity:%4$s\n"
395                "     --capability=CAP       In addition to the default, retain specified\n"
396                "                            capability\n"
397                "     --drop-capability=CAP  Drop the specified capability from the default set\n"
398                "     --ambient-capability=CAP\n"
399                "                            Sets the specified capability for the started\n"
400                "                            process. Not useful if booting a machine.\n"
401                "     --no-new-privileges    Set PR_SET_NO_NEW_PRIVS flag for container payload\n"
402                "     --system-call-filter=LIST|~LIST\n"
403                "                            Permit/prohibit specific system calls\n"
404                "  -Z --selinux-context=SECLABEL\n"
405                "                            Set the SELinux security context to be used by\n"
406                "                            processes in the container\n"
407                "  -L --selinux-apifs-context=SECLABEL\n"
408                "                            Set the SELinux security context to be used by\n"
409                "                            API/tmpfs file systems in the container\n\n"
410                "%3$sResources:%4$s\n"
411                "     --rlimit=NAME=LIMIT    Set a resource limit for the payload\n"
412                "     --oom-score-adjust=VALUE\n"
413                "                            Adjust the OOM score value for the payload\n"
414                "     --cpu-affinity=CPUS    Adjust the CPU affinity of the container\n"
415                "     --personality=ARCH     Pick personality for this container\n\n"
416                "%3$sIntegration:%4$s\n"
417                "     --resolv-conf=MODE     Select mode of /etc/resolv.conf initialization\n"
418                "     --timezone=MODE        Select mode of /etc/localtime initialization\n"
419                "     --link-journal=MODE    Link up guest journal, one of no, auto, guest, \n"
420                "                            host, try-guest, try-host\n"
421                "  -j                        Equivalent to --link-journal=try-guest\n\n"
422                "%3$sMounts:%4$s\n"
423                "     --bind=PATH[:PATH[:OPTIONS]]\n"
424                "                            Bind mount a file or directory from the host into\n"
425                "                            the container\n"
426                "     --bind-ro=PATH[:PATH[:OPTIONS]\n"
427                "                            Similar, but creates a read-only bind mount\n"
428                "     --inaccessible=PATH    Over-mount file node with inaccessible node to mask\n"
429                "                            it\n"
430                "     --tmpfs=PATH:[OPTIONS] Mount an empty tmpfs to the specified directory\n"
431                "     --overlay=PATH[:PATH...]:PATH\n"
432                "                            Create an overlay mount from the host to \n"
433                "                            the container\n"
434                "     --overlay-ro=PATH[:PATH...]:PATH\n"
435                "                            Similar, but creates a read-only overlay mount\n"
436                "     --bind-user=NAME       Bind user from host to container\n\n"
437                "%3$sInput/Output:%4$s\n"
438                "     --console=MODE         Select how stdin/stdout/stderr and /dev/console are\n"
439                "                            set up for the container.\n"
440                "  -P --pipe                 Equivalent to --console=pipe\n\n"
441                "%3$sCredentials:%4$s\n"
442                "     --set-credential=ID:VALUE\n"
443                "                            Pass a credential with literal value to container.\n"
444                "     --load-credential=ID:PATH\n"
445                "                            Load credential to pass to container from file or\n"
446                "                            AF_UNIX stream socket.\n"
447                "\nSee the %2$s for details.\n",
448                program_invocation_short_name,
449                link,
450                ansi_underline(),
451                ansi_normal(),
452                ansi_highlight(),
453                ansi_normal());
454 
455         return 0;
456 }
457 
custom_mount_check_all(void)458 static int custom_mount_check_all(void) {
459         size_t i;
460 
461         for (i = 0; i < arg_n_custom_mounts; i++) {
462                 CustomMount *m = &arg_custom_mounts[i];
463 
464                 if (path_equal(m->destination, "/") && arg_userns_mode != USER_NAMESPACE_NO) {
465                         if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_OFF)
466                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
467                                                        "--private-users-ownership=own may not be combined with custom root mounts.");
468                         if (arg_uid_shift == UID_INVALID)
469                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
470                                                        "--private-users with automatic UID shift may not be combined with custom root mounts.");
471                 }
472         }
473 
474         return 0;
475 }
476 
detect_unified_cgroup_hierarchy_from_environment(void)477 static int detect_unified_cgroup_hierarchy_from_environment(void) {
478         const char *e, *var = "SYSTEMD_NSPAWN_UNIFIED_HIERARCHY";
479         int r;
480 
481         /* Allow the user to control whether the unified hierarchy is used */
482 
483         e = getenv(var);
484         if (!e) {
485                 /* $UNIFIED_CGROUP_HIERARCHY has been renamed to $SYSTEMD_NSPAWN_UNIFIED_HIERARCHY. */
486                 var = "UNIFIED_CGROUP_HIERARCHY";
487                 e = getenv(var);
488         }
489 
490         if (!isempty(e)) {
491                 r = parse_boolean(e);
492                 if (r < 0)
493                         return log_error_errno(r, "Failed to parse $%s: %m", var);
494                 if (r > 0)
495                         arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
496                 else
497                         arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
498         }
499 
500         return 0;
501 }
502 
detect_unified_cgroup_hierarchy_from_image(const char * directory)503 static int detect_unified_cgroup_hierarchy_from_image(const char *directory) {
504         int r;
505 
506         /* Let's inherit the mode to use from the host system, but let's take into consideration what systemd
507          * in the image actually supports. */
508         r = cg_all_unified();
509         if (r < 0)
510                 return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
511         if (r > 0) {
512                 /* Unified cgroup hierarchy support was added in 230. Unfortunately the detection
513                  * routine only detects 231, so we'll have a false negative here for 230. */
514                 r = systemd_installation_has_version(directory, "230");
515                 if (r < 0)
516                         return log_error_errno(r, "Failed to determine systemd version in container: %m");
517                 if (r > 0)
518                         arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
519                 else
520                         arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
521         } else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0) {
522                 /* Mixed cgroup hierarchy support was added in 233 */
523                 r = systemd_installation_has_version(directory, "233");
524                 if (r < 0)
525                         return log_error_errno(r, "Failed to determine systemd version in container: %m");
526                 if (r > 0)
527                         arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
528                 else
529                         arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
530         } else
531                 arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
532 
533         log_debug("Using %s hierarchy for container.",
534                   arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_NONE ? "legacy" :
535                   arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_SYSTEMD ? "hybrid" : "unified");
536 
537         return 0;
538 }
539 
parse_capability_spec(const char * spec,uint64_t * ret_mask)540 static int parse_capability_spec(const char *spec, uint64_t *ret_mask) {
541         uint64_t mask = 0;
542         int r;
543 
544         for (;;) {
545                 _cleanup_free_ char *t = NULL;
546 
547                 r = extract_first_word(&spec, &t, ",", 0);
548                 if (r < 0)
549                         return log_error_errno(r, "Failed to parse capability %s.", t);
550                 if (r == 0)
551                         break;
552 
553                 if (streq(t, "help")) {
554                         for (int i = 0; i < capability_list_length(); i++) {
555                                 const char *name;
556 
557                                 name = capability_to_name(i);
558                                 if (name)
559                                         puts(name);
560                         }
561 
562                         return 0; /* quit */
563                 }
564 
565                 if (streq(t, "all"))
566                         mask = UINT64_MAX;
567                 else {
568                         r = capability_from_name(t);
569                         if (r < 0)
570                                 return log_error_errno(r, "Failed to parse capability %s.", t);
571 
572                         mask |= 1ULL << r;
573                 }
574         }
575 
576         *ret_mask = mask;
577         return 1; /* continue */
578 }
579 
parse_share_ns_env(const char * name,unsigned long ns_flag)580 static int parse_share_ns_env(const char *name, unsigned long ns_flag) {
581         int r;
582 
583         r = getenv_bool(name);
584         if (r == -ENXIO)
585                 return 0;
586         if (r < 0)
587                 return log_error_errno(r, "Failed to parse $%s: %m", name);
588 
589         arg_clone_ns_flags = (arg_clone_ns_flags & ~ns_flag) | (r > 0 ? 0 : ns_flag);
590         arg_settings_mask |= SETTING_CLONE_NS_FLAGS;
591         return 0;
592 }
593 
parse_mount_settings_env(void)594 static int parse_mount_settings_env(void) {
595         const char *e;
596         int r;
597 
598         r = getenv_bool("SYSTEMD_NSPAWN_TMPFS_TMP");
599         if (r < 0 && r != -ENXIO)
600                 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_TMPFS_TMP: %m");
601         if (r >= 0)
602                 SET_FLAG(arg_mount_settings, MOUNT_APPLY_TMPFS_TMP, r > 0);
603 
604         e = getenv("SYSTEMD_NSPAWN_API_VFS_WRITABLE");
605         if (streq_ptr(e, "network"))
606                 arg_mount_settings |= MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS;
607 
608         else if (e) {
609                 r = parse_boolean(e);
610                 if (r < 0)
611                         return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_API_VFS_WRITABLE: %m");
612 
613                 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_RO, r == 0);
614                 SET_FLAG(arg_mount_settings, MOUNT_APPLY_APIVFS_NETNS, false);
615         }
616 
617         return 0;
618 }
619 
parse_environment(void)620 static int parse_environment(void) {
621         const char *e;
622         int r;
623 
624         r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_IPC", CLONE_NEWIPC);
625         if (r < 0)
626                 return r;
627         r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_PID", CLONE_NEWPID);
628         if (r < 0)
629                 return r;
630         r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_NS_UTS", CLONE_NEWUTS);
631         if (r < 0)
632                 return r;
633         r = parse_share_ns_env("SYSTEMD_NSPAWN_SHARE_SYSTEM", CLONE_NEWIPC|CLONE_NEWPID|CLONE_NEWUTS);
634         if (r < 0)
635                 return r;
636 
637         r = parse_mount_settings_env();
638         if (r < 0)
639                 return r;
640 
641         /* SYSTEMD_NSPAWN_USE_CGNS=0 can be used to disable CLONE_NEWCGROUP use,
642          * even if it is supported. If not supported, it has no effect. */
643         if (!cg_ns_supported())
644                 arg_use_cgns = false;
645         else {
646                 r = getenv_bool("SYSTEMD_NSPAWN_USE_CGNS");
647                 if (r < 0) {
648                         if (r != -ENXIO)
649                                 return log_error_errno(r, "Failed to parse $SYSTEMD_NSPAWN_USE_CGNS: %m");
650 
651                         arg_use_cgns = true;
652                 } else {
653                         arg_use_cgns = r > 0;
654                         arg_settings_mask |= SETTING_USE_CGNS;
655                 }
656         }
657 
658         e = getenv("SYSTEMD_NSPAWN_CONTAINER_SERVICE");
659         if (e)
660                 arg_container_service_name = e;
661 
662         r = getenv_bool("SYSTEMD_SUPPRESS_SYNC");
663         if (r >= 0)
664                 arg_suppress_sync = r;
665         else if (r != -ENXIO)
666                 log_debug_errno(r, "Failed to parse $SYSTEMD_SUPPRESS_SYNC, ignoring: %m");
667 
668         return detect_unified_cgroup_hierarchy_from_environment();
669 }
670 
parse_argv(int argc,char * argv[])671 static int parse_argv(int argc, char *argv[]) {
672         enum {
673                 ARG_VERSION = 0x100,
674                 ARG_PRIVATE_NETWORK,
675                 ARG_UUID,
676                 ARG_READ_ONLY,
677                 ARG_CAPABILITY,
678                 ARG_AMBIENT_CAPABILITY,
679                 ARG_DROP_CAPABILITY,
680                 ARG_LINK_JOURNAL,
681                 ARG_BIND,
682                 ARG_BIND_RO,
683                 ARG_TMPFS,
684                 ARG_OVERLAY,
685                 ARG_OVERLAY_RO,
686                 ARG_INACCESSIBLE,
687                 ARG_SHARE_SYSTEM,
688                 ARG_REGISTER,
689                 ARG_KEEP_UNIT,
690                 ARG_NETWORK_INTERFACE,
691                 ARG_NETWORK_MACVLAN,
692                 ARG_NETWORK_IPVLAN,
693                 ARG_NETWORK_BRIDGE,
694                 ARG_NETWORK_ZONE,
695                 ARG_NETWORK_VETH_EXTRA,
696                 ARG_NETWORK_NAMESPACE_PATH,
697                 ARG_PERSONALITY,
698                 ARG_VOLATILE,
699                 ARG_TEMPLATE,
700                 ARG_PROPERTY,
701                 ARG_PRIVATE_USERS,
702                 ARG_KILL_SIGNAL,
703                 ARG_SETTINGS,
704                 ARG_CHDIR,
705                 ARG_PIVOT_ROOT,
706                 ARG_PRIVATE_USERS_CHOWN,
707                 ARG_PRIVATE_USERS_OWNERSHIP,
708                 ARG_NOTIFY_READY,
709                 ARG_ROOT_HASH,
710                 ARG_ROOT_HASH_SIG,
711                 ARG_VERITY_DATA,
712                 ARG_SYSTEM_CALL_FILTER,
713                 ARG_RLIMIT,
714                 ARG_HOSTNAME,
715                 ARG_NO_NEW_PRIVILEGES,
716                 ARG_OOM_SCORE_ADJUST,
717                 ARG_CPU_AFFINITY,
718                 ARG_RESOLV_CONF,
719                 ARG_TIMEZONE,
720                 ARG_CONSOLE,
721                 ARG_PIPE,
722                 ARG_OCI_BUNDLE,
723                 ARG_NO_PAGER,
724                 ARG_SET_CREDENTIAL,
725                 ARG_LOAD_CREDENTIAL,
726                 ARG_BIND_USER,
727                 ARG_SUPPRESS_SYNC,
728         };
729 
730         static const struct option options[] = {
731                 { "help",                   no_argument,       NULL, 'h'                        },
732                 { "version",                no_argument,       NULL, ARG_VERSION                },
733                 { "directory",              required_argument, NULL, 'D'                        },
734                 { "template",               required_argument, NULL, ARG_TEMPLATE               },
735                 { "ephemeral",              no_argument,       NULL, 'x'                        },
736                 { "user",                   required_argument, NULL, 'u'                        },
737                 { "private-network",        no_argument,       NULL, ARG_PRIVATE_NETWORK        },
738                 { "as-pid2",                no_argument,       NULL, 'a'                        },
739                 { "boot",                   no_argument,       NULL, 'b'                        },
740                 { "uuid",                   required_argument, NULL, ARG_UUID                   },
741                 { "read-only",              no_argument,       NULL, ARG_READ_ONLY              },
742                 { "capability",             required_argument, NULL, ARG_CAPABILITY             },
743                 { "ambient-capability",     required_argument, NULL, ARG_AMBIENT_CAPABILITY     },
744                 { "drop-capability",        required_argument, NULL, ARG_DROP_CAPABILITY        },
745                 { "no-new-privileges",      required_argument, NULL, ARG_NO_NEW_PRIVILEGES      },
746                 { "link-journal",           required_argument, NULL, ARG_LINK_JOURNAL           },
747                 { "bind",                   required_argument, NULL, ARG_BIND                   },
748                 { "bind-ro",                required_argument, NULL, ARG_BIND_RO                },
749                 { "tmpfs",                  required_argument, NULL, ARG_TMPFS                  },
750                 { "overlay",                required_argument, NULL, ARG_OVERLAY                },
751                 { "overlay-ro",             required_argument, NULL, ARG_OVERLAY_RO             },
752                 { "inaccessible",           required_argument, NULL, ARG_INACCESSIBLE           },
753                 { "machine",                required_argument, NULL, 'M'                        },
754                 { "hostname",               required_argument, NULL, ARG_HOSTNAME               },
755                 { "slice",                  required_argument, NULL, 'S'                        },
756                 { "setenv",                 required_argument, NULL, 'E'                        },
757                 { "selinux-context",        required_argument, NULL, 'Z'                        },
758                 { "selinux-apifs-context",  required_argument, NULL, 'L'                        },
759                 { "quiet",                  no_argument,       NULL, 'q'                        },
760                 { "share-system",           no_argument,       NULL, ARG_SHARE_SYSTEM           }, /* not documented */
761                 { "register",               required_argument, NULL, ARG_REGISTER               },
762                 { "keep-unit",              no_argument,       NULL, ARG_KEEP_UNIT              },
763                 { "network-interface",      required_argument, NULL, ARG_NETWORK_INTERFACE      },
764                 { "network-macvlan",        required_argument, NULL, ARG_NETWORK_MACVLAN        },
765                 { "network-ipvlan",         required_argument, NULL, ARG_NETWORK_IPVLAN         },
766                 { "network-veth",           no_argument,       NULL, 'n'                        },
767                 { "network-veth-extra",     required_argument, NULL, ARG_NETWORK_VETH_EXTRA     },
768                 { "network-bridge",         required_argument, NULL, ARG_NETWORK_BRIDGE         },
769                 { "network-zone",           required_argument, NULL, ARG_NETWORK_ZONE           },
770                 { "network-namespace-path", required_argument, NULL, ARG_NETWORK_NAMESPACE_PATH },
771                 { "personality",            required_argument, NULL, ARG_PERSONALITY            },
772                 { "image",                  required_argument, NULL, 'i'                        },
773                 { "volatile",               optional_argument, NULL, ARG_VOLATILE               },
774                 { "port",                   required_argument, NULL, 'p'                        },
775                 { "property",               required_argument, NULL, ARG_PROPERTY               },
776                 { "private-users",          optional_argument, NULL, ARG_PRIVATE_USERS          },
777                 { "private-users-chown",    optional_argument, NULL, ARG_PRIVATE_USERS_CHOWN    }, /* obsolete */
778                 { "private-users-ownership",required_argument, NULL, ARG_PRIVATE_USERS_OWNERSHIP},
779                 { "kill-signal",            required_argument, NULL, ARG_KILL_SIGNAL            },
780                 { "settings",               required_argument, NULL, ARG_SETTINGS               },
781                 { "chdir",                  required_argument, NULL, ARG_CHDIR                  },
782                 { "pivot-root",             required_argument, NULL, ARG_PIVOT_ROOT             },
783                 { "notify-ready",           required_argument, NULL, ARG_NOTIFY_READY           },
784                 { "root-hash",              required_argument, NULL, ARG_ROOT_HASH              },
785                 { "root-hash-sig",          required_argument, NULL, ARG_ROOT_HASH_SIG          },
786                 { "verity-data",            required_argument, NULL, ARG_VERITY_DATA            },
787                 { "system-call-filter",     required_argument, NULL, ARG_SYSTEM_CALL_FILTER     },
788                 { "rlimit",                 required_argument, NULL, ARG_RLIMIT                 },
789                 { "oom-score-adjust",       required_argument, NULL, ARG_OOM_SCORE_ADJUST       },
790                 { "cpu-affinity",           required_argument, NULL, ARG_CPU_AFFINITY           },
791                 { "resolv-conf",            required_argument, NULL, ARG_RESOLV_CONF            },
792                 { "timezone",               required_argument, NULL, ARG_TIMEZONE               },
793                 { "console",                required_argument, NULL, ARG_CONSOLE                },
794                 { "pipe",                   no_argument,       NULL, ARG_PIPE                   },
795                 { "oci-bundle",             required_argument, NULL, ARG_OCI_BUNDLE             },
796                 { "no-pager",               no_argument,       NULL, ARG_NO_PAGER               },
797                 { "set-credential",         required_argument, NULL, ARG_SET_CREDENTIAL         },
798                 { "load-credential",        required_argument, NULL, ARG_LOAD_CREDENTIAL        },
799                 { "bind-user",              required_argument, NULL, ARG_BIND_USER              },
800                 { "suppress-sync",          required_argument, NULL, ARG_SUPPRESS_SYNC          },
801                 {}
802         };
803 
804         int c, r;
805         uint64_t plus = 0, minus = 0;
806         bool mask_all_settings = false, mask_no_settings = false;
807 
808         assert(argc >= 0);
809         assert(argv);
810 
811         while ((c = getopt_long(argc, argv, "+hD:u:abL:M:jS:Z:qi:xp:nUE:P", options, NULL)) >= 0)
812                 switch (c) {
813 
814                 case 'h':
815                         return help();
816 
817                 case ARG_VERSION:
818                         return version();
819 
820                 case 'D':
821                         r = parse_path_argument(optarg, false, &arg_directory);
822                         if (r < 0)
823                                 return r;
824 
825                         arg_settings_mask |= SETTING_DIRECTORY;
826                         break;
827 
828                 case ARG_TEMPLATE:
829                         r = parse_path_argument(optarg, false, &arg_template);
830                         if (r < 0)
831                                 return r;
832 
833                         arg_settings_mask |= SETTING_DIRECTORY;
834                         break;
835 
836                 case 'i':
837                         r = parse_path_argument(optarg, false, &arg_image);
838                         if (r < 0)
839                                 return r;
840 
841                         arg_settings_mask |= SETTING_DIRECTORY;
842                         break;
843 
844                 case ARG_OCI_BUNDLE:
845                         r = parse_path_argument(optarg, false, &arg_oci_bundle);
846                         if (r < 0)
847                                 return r;
848 
849                         break;
850 
851                 case 'x':
852                         arg_ephemeral = true;
853                         arg_settings_mask |= SETTING_EPHEMERAL;
854                         break;
855 
856                 case 'u':
857                         r = free_and_strdup(&arg_user, optarg);
858                         if (r < 0)
859                                 return log_oom();
860 
861                         arg_settings_mask |= SETTING_USER;
862                         break;
863 
864                 case ARG_NETWORK_ZONE: {
865                         char *j;
866 
867                         j = strjoin("vz-", optarg);
868                         if (!j)
869                                 return log_oom();
870 
871                         if (!ifname_valid(j)) {
872                                 log_error("Network zone name not valid: %s", j);
873                                 free(j);
874                                 return -EINVAL;
875                         }
876 
877                         free_and_replace(arg_network_zone, j);
878 
879                         arg_network_veth = true;
880                         arg_private_network = true;
881                         arg_settings_mask |= SETTING_NETWORK;
882                         break;
883                 }
884 
885                 case ARG_NETWORK_BRIDGE:
886 
887                         if (!ifname_valid(optarg))
888                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
889                                                        "Bridge interface name not valid: %s", optarg);
890 
891                         r = free_and_strdup(&arg_network_bridge, optarg);
892                         if (r < 0)
893                                 return log_oom();
894 
895                         _fallthrough_;
896                 case 'n':
897                         arg_network_veth = true;
898                         arg_private_network = true;
899                         arg_settings_mask |= SETTING_NETWORK;
900                         break;
901 
902                 case ARG_NETWORK_VETH_EXTRA:
903                         r = veth_extra_parse(&arg_network_veth_extra, optarg);
904                         if (r < 0)
905                                 return log_error_errno(r, "Failed to parse --network-veth-extra= parameter: %s", optarg);
906 
907                         arg_private_network = true;
908                         arg_settings_mask |= SETTING_NETWORK;
909                         break;
910 
911                 case ARG_NETWORK_INTERFACE:
912                         if (!ifname_valid(optarg))
913                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
914                                                        "Network interface name not valid: %s", optarg);
915 
916                         r = test_network_interface_initialized(optarg);
917                         if (r < 0)
918                                 return r;
919 
920                         if (strv_extend(&arg_network_interfaces, optarg) < 0)
921                                 return log_oom();
922 
923                         arg_private_network = true;
924                         arg_settings_mask |= SETTING_NETWORK;
925                         break;
926 
927                 case ARG_NETWORK_MACVLAN:
928 
929                         if (!ifname_valid(optarg))
930                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
931                                                        "MACVLAN network interface name not valid: %s", optarg);
932 
933                         r = test_network_interface_initialized(optarg);
934                         if (r < 0)
935                                 return r;
936 
937                         if (strv_extend(&arg_network_macvlan, optarg) < 0)
938                                 return log_oom();
939 
940                         arg_private_network = true;
941                         arg_settings_mask |= SETTING_NETWORK;
942                         break;
943 
944                 case ARG_NETWORK_IPVLAN:
945 
946                         if (!ifname_valid(optarg))
947                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
948                                                        "IPVLAN network interface name not valid: %s", optarg);
949 
950                         r = test_network_interface_initialized(optarg);
951                         if (r < 0)
952                                 return r;
953 
954                         if (strv_extend(&arg_network_ipvlan, optarg) < 0)
955                                 return log_oom();
956 
957                         _fallthrough_;
958                 case ARG_PRIVATE_NETWORK:
959                         arg_private_network = true;
960                         arg_settings_mask |= SETTING_NETWORK;
961                         break;
962 
963                 case ARG_NETWORK_NAMESPACE_PATH:
964                         r = parse_path_argument(optarg, false, &arg_network_namespace_path);
965                         if (r < 0)
966                                 return r;
967 
968                         arg_settings_mask |= SETTING_NETWORK;
969                         break;
970 
971                 case 'b':
972                         if (arg_start_mode == START_PID2)
973                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
974                                                        "--boot and --as-pid2 may not be combined.");
975 
976                         arg_start_mode = START_BOOT;
977                         arg_settings_mask |= SETTING_START_MODE;
978                         break;
979 
980                 case 'a':
981                         if (arg_start_mode == START_BOOT)
982                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
983                                                        "--boot and --as-pid2 may not be combined.");
984 
985                         arg_start_mode = START_PID2;
986                         arg_settings_mask |= SETTING_START_MODE;
987                         break;
988 
989                 case ARG_UUID:
990                         r = sd_id128_from_string(optarg, &arg_uuid);
991                         if (r < 0)
992                                 return log_error_errno(r, "Invalid UUID: %s", optarg);
993 
994                         if (sd_id128_is_null(arg_uuid))
995                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
996                                                        "Machine UUID may not be all zeroes.");
997 
998                         arg_settings_mask |= SETTING_MACHINE_ID;
999                         break;
1000 
1001                 case 'S': {
1002                         _cleanup_free_ char *mangled = NULL;
1003 
1004                         r = unit_name_mangle_with_suffix(optarg, NULL, UNIT_NAME_MANGLE_WARN, ".slice", &mangled);
1005                         if (r < 0)
1006                                 return log_oom();
1007 
1008                         free_and_replace(arg_slice, mangled);
1009                         arg_settings_mask |= SETTING_SLICE;
1010                         break;
1011                 }
1012 
1013                 case 'M':
1014                         if (isempty(optarg))
1015                                 arg_machine = mfree(arg_machine);
1016                         else {
1017                                 if (!hostname_is_valid(optarg, 0))
1018                                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1019                                                                "Invalid machine name: %s", optarg);
1020 
1021                                 r = free_and_strdup(&arg_machine, optarg);
1022                                 if (r < 0)
1023                                         return log_oom();
1024                         }
1025                         break;
1026 
1027                 case ARG_HOSTNAME:
1028                         if (isempty(optarg))
1029                                 arg_hostname = mfree(arg_hostname);
1030                         else {
1031                                 if (!hostname_is_valid(optarg, 0))
1032                                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1033                                                                "Invalid hostname: %s", optarg);
1034 
1035                                 r = free_and_strdup(&arg_hostname, optarg);
1036                                 if (r < 0)
1037                                         return log_oom();
1038                         }
1039 
1040                         arg_settings_mask |= SETTING_HOSTNAME;
1041                         break;
1042 
1043                 case 'Z':
1044                         arg_selinux_context = optarg;
1045                         break;
1046 
1047                 case 'L':
1048                         arg_selinux_apifs_context = optarg;
1049                         break;
1050 
1051                 case ARG_READ_ONLY:
1052                         arg_read_only = true;
1053                         arg_settings_mask |= SETTING_READ_ONLY;
1054                         break;
1055 
1056                 case ARG_AMBIENT_CAPABILITY: {
1057                         uint64_t m;
1058                         r = parse_capability_spec(optarg, &m);
1059                         if (r <= 0)
1060                                 return r;
1061                         arg_caps_ambient |= m;
1062                         arg_settings_mask |= SETTING_CAPABILITY;
1063                         break;
1064                 }
1065                 case ARG_CAPABILITY:
1066                 case ARG_DROP_CAPABILITY: {
1067                         uint64_t m;
1068                         r = parse_capability_spec(optarg, &m);
1069                         if (r <= 0)
1070                                 return r;
1071 
1072                         if (c == ARG_CAPABILITY)
1073                                 plus |= m;
1074                         else
1075                                 minus |= m;
1076                         arg_settings_mask |= SETTING_CAPABILITY;
1077                         break;
1078                 }
1079                 case ARG_NO_NEW_PRIVILEGES:
1080                         r = parse_boolean(optarg);
1081                         if (r < 0)
1082                                 return log_error_errno(r, "Failed to parse --no-new-privileges= argument: %s", optarg);
1083 
1084                         arg_no_new_privileges = r;
1085                         arg_settings_mask |= SETTING_NO_NEW_PRIVILEGES;
1086                         break;
1087 
1088                 case 'j':
1089                         arg_link_journal = LINK_GUEST;
1090                         arg_link_journal_try = true;
1091                         arg_settings_mask |= SETTING_LINK_JOURNAL;
1092                         break;
1093 
1094                 case ARG_LINK_JOURNAL:
1095                         r = parse_link_journal(optarg, &arg_link_journal, &arg_link_journal_try);
1096                         if (r < 0)
1097                                 return log_error_errno(r, "Failed to parse link journal mode %s", optarg);
1098 
1099                         arg_settings_mask |= SETTING_LINK_JOURNAL;
1100                         break;
1101 
1102                 case ARG_BIND:
1103                 case ARG_BIND_RO:
1104                         r = bind_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_BIND_RO);
1105                         if (r < 0)
1106                                 return log_error_errno(r, "Failed to parse --bind(-ro)= argument %s: %m", optarg);
1107 
1108                         arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1109                         break;
1110 
1111                 case ARG_TMPFS:
1112                         r = tmpfs_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1113                         if (r < 0)
1114                                 return log_error_errno(r, "Failed to parse --tmpfs= argument %s: %m", optarg);
1115 
1116                         arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1117                         break;
1118 
1119                 case ARG_OVERLAY:
1120                 case ARG_OVERLAY_RO:
1121                         r = overlay_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg, c == ARG_OVERLAY_RO);
1122                         if (r == -EADDRNOTAVAIL)
1123                                 return log_error_errno(r, "--overlay(-ro)= needs at least two colon-separated directories specified.");
1124                         if (r < 0)
1125                                 return log_error_errno(r, "Failed to parse --overlay(-ro)= argument %s: %m", optarg);
1126 
1127                         arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1128                         break;
1129 
1130                 case ARG_INACCESSIBLE:
1131                         r = inaccessible_mount_parse(&arg_custom_mounts, &arg_n_custom_mounts, optarg);
1132                         if (r < 0)
1133                                 return log_error_errno(r, "Failed to parse --inaccessible= argument %s: %m", optarg);
1134 
1135                         arg_settings_mask |= SETTING_CUSTOM_MOUNTS;
1136                         break;
1137 
1138                 case 'E':
1139                         r = strv_env_replace_strdup_passthrough(&arg_setenv, optarg);
1140                         if (r < 0)
1141                                 return log_error_errno(r, "Cannot assign environment variable %s: %m", optarg);
1142 
1143                         arg_settings_mask |= SETTING_ENVIRONMENT;
1144                         break;
1145 
1146                 case 'q':
1147                         arg_quiet = true;
1148                         break;
1149 
1150                 case ARG_SHARE_SYSTEM:
1151                         /* We don't officially support this anymore, except for compat reasons. People should use the
1152                          * $SYSTEMD_NSPAWN_SHARE_* environment variables instead. */
1153                         log_warning("Please do not use --share-system anymore, use $SYSTEMD_NSPAWN_SHARE_* instead.");
1154                         arg_clone_ns_flags = 0;
1155                         break;
1156 
1157                 case ARG_REGISTER:
1158                         r = parse_boolean(optarg);
1159                         if (r < 0) {
1160                                 log_error("Failed to parse --register= argument: %s", optarg);
1161                                 return r;
1162                         }
1163 
1164                         arg_register = r;
1165                         break;
1166 
1167                 case ARG_KEEP_UNIT:
1168                         arg_keep_unit = true;
1169                         break;
1170 
1171                 case ARG_PERSONALITY:
1172 
1173                         arg_personality = personality_from_string(optarg);
1174                         if (arg_personality == PERSONALITY_INVALID)
1175                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1176                                                        "Unknown or unsupported personality '%s'.", optarg);
1177 
1178                         arg_settings_mask |= SETTING_PERSONALITY;
1179                         break;
1180 
1181                 case ARG_VOLATILE:
1182 
1183                         if (!optarg)
1184                                 arg_volatile_mode = VOLATILE_YES;
1185                         else if (streq(optarg, "help")) {
1186                                 DUMP_STRING_TABLE(volatile_mode, VolatileMode, _VOLATILE_MODE_MAX);
1187                                 return 0;
1188                         } else {
1189                                 VolatileMode m;
1190 
1191                                 m = volatile_mode_from_string(optarg);
1192                                 if (m < 0)
1193                                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1194                                                                "Failed to parse --volatile= argument: %s", optarg);
1195                                 else
1196                                         arg_volatile_mode = m;
1197                         }
1198 
1199                         arg_settings_mask |= SETTING_VOLATILE_MODE;
1200                         break;
1201 
1202                 case 'p':
1203                         r = expose_port_parse(&arg_expose_ports, optarg);
1204                         if (r == -EEXIST)
1205                                 return log_error_errno(r, "Duplicate port specification: %s", optarg);
1206                         if (r < 0)
1207                                 return log_error_errno(r, "Failed to parse host port %s: %m", optarg);
1208 
1209                         arg_settings_mask |= SETTING_EXPOSE_PORTS;
1210                         break;
1211 
1212                 case ARG_PROPERTY:
1213                         if (strv_extend(&arg_property, optarg) < 0)
1214                                 return log_oom();
1215 
1216                         break;
1217 
1218                 case ARG_PRIVATE_USERS: {
1219                         int boolean;
1220 
1221                         if (!optarg)
1222                                 boolean = true;
1223                         else if (!in_charset(optarg, DIGITS))
1224                                 /* do *not* parse numbers as booleans */
1225                                 boolean = parse_boolean(optarg);
1226                         else
1227                                 boolean = -1;
1228 
1229                         if (boolean == 0) {
1230                                 /* no: User namespacing off */
1231                                 arg_userns_mode = USER_NAMESPACE_NO;
1232                                 arg_uid_shift = UID_INVALID;
1233                                 arg_uid_range = UINT32_C(0x10000);
1234                         } else if (boolean > 0) {
1235                                 /* yes: User namespacing on, UID range is read from root dir */
1236                                 arg_userns_mode = USER_NAMESPACE_FIXED;
1237                                 arg_uid_shift = UID_INVALID;
1238                                 arg_uid_range = UINT32_C(0x10000);
1239                         } else if (streq(optarg, "pick")) {
1240                                 /* pick: User namespacing on, UID range is picked randomly */
1241                                 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1242                                                                         * implied by USER_NAMESPACE_PICK
1243                                                                         * further down. */
1244                                 arg_uid_shift = UID_INVALID;
1245                                 arg_uid_range = UINT32_C(0x10000);
1246 
1247                         } else if (streq(optarg, "identity")) {
1248                                 /* identitiy: User namespaces on, UID range is map the 0…0xFFFF range to
1249                                  * itself, i.e. we don't actually map anything, but do take benefit of
1250                                  * isolation of capability sets. */
1251                                 arg_userns_mode = USER_NAMESPACE_FIXED;
1252                                 arg_uid_shift = 0;
1253                                 arg_uid_range = UINT32_C(0x10000);
1254                         } else {
1255                                 _cleanup_free_ char *buffer = NULL;
1256                                 const char *range, *shift;
1257 
1258                                 /* anything else: User namespacing on, UID range is explicitly configured */
1259 
1260                                 range = strchr(optarg, ':');
1261                                 if (range) {
1262                                         buffer = strndup(optarg, range - optarg);
1263                                         if (!buffer)
1264                                                 return log_oom();
1265                                         shift = buffer;
1266 
1267                                         range++;
1268                                         r = safe_atou32(range, &arg_uid_range);
1269                                         if (r < 0)
1270                                                 return log_error_errno(r, "Failed to parse UID range \"%s\": %m", range);
1271                                 } else
1272                                         shift = optarg;
1273 
1274                                 r = parse_uid(shift, &arg_uid_shift);
1275                                 if (r < 0)
1276                                         return log_error_errno(r, "Failed to parse UID \"%s\": %m", optarg);
1277 
1278                                 arg_userns_mode = USER_NAMESPACE_FIXED;
1279 
1280                                 if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
1281                                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID range cannot be empty or go beyond " UID_FMT ".", UID_INVALID);
1282                         }
1283 
1284                         arg_settings_mask |= SETTING_USERNS;
1285                         break;
1286                 }
1287 
1288                 case 'U':
1289                         if (userns_supported()) {
1290                                 arg_userns_mode = USER_NAMESPACE_PICK; /* Note that arg_userns_ownership is
1291                                                                         * implied by USER_NAMESPACE_PICK
1292                                                                         * further down. */
1293                                 arg_uid_shift = UID_INVALID;
1294                                 arg_uid_range = UINT32_C(0x10000);
1295 
1296                                 arg_settings_mask |= SETTING_USERNS;
1297                         }
1298 
1299                         break;
1300 
1301                 case ARG_PRIVATE_USERS_CHOWN:
1302                         arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
1303 
1304                         arg_settings_mask |= SETTING_USERNS;
1305                         break;
1306 
1307                 case ARG_PRIVATE_USERS_OWNERSHIP:
1308                         if (streq(optarg, "help")) {
1309                                 DUMP_STRING_TABLE(user_namespace_ownership, UserNamespaceOwnership, _USER_NAMESPACE_OWNERSHIP_MAX);
1310                                 return 0;
1311                         }
1312 
1313                         arg_userns_ownership = user_namespace_ownership_from_string(optarg);
1314                         if (arg_userns_ownership < 0)
1315                                 return log_error_errno(arg_userns_ownership, "Cannot parse --user-namespace-ownership= value: %s", optarg);
1316 
1317                         arg_settings_mask |= SETTING_USERNS;
1318                         break;
1319 
1320                 case ARG_KILL_SIGNAL:
1321                         if (streq(optarg, "help")) {
1322                                 DUMP_STRING_TABLE(signal, int, _NSIG);
1323                                 return 0;
1324                         }
1325 
1326                         arg_kill_signal = signal_from_string(optarg);
1327                         if (arg_kill_signal < 0)
1328                                 return log_error_errno(arg_kill_signal, "Cannot parse signal: %s", optarg);
1329 
1330                         arg_settings_mask |= SETTING_KILL_SIGNAL;
1331                         break;
1332 
1333                 case ARG_SETTINGS:
1334 
1335                         /* no               → do not read files
1336                          * yes              → read files, do not override cmdline, trust only subset
1337                          * override         → read files, override cmdline, trust only subset
1338                          * trusted          → read files, do not override cmdline, trust all
1339                          */
1340 
1341                         r = parse_boolean(optarg);
1342                         if (r < 0) {
1343                                 if (streq(optarg, "trusted")) {
1344                                         mask_all_settings = false;
1345                                         mask_no_settings = false;
1346                                         arg_settings_trusted = true;
1347 
1348                                 } else if (streq(optarg, "override")) {
1349                                         mask_all_settings = false;
1350                                         mask_no_settings = true;
1351                                         arg_settings_trusted = -1;
1352                                 } else
1353                                         return log_error_errno(r, "Failed to parse --settings= argument: %s", optarg);
1354                         } else if (r > 0) {
1355                                 /* yes */
1356                                 mask_all_settings = false;
1357                                 mask_no_settings = false;
1358                                 arg_settings_trusted = -1;
1359                         } else {
1360                                 /* no */
1361                                 mask_all_settings = true;
1362                                 mask_no_settings = false;
1363                                 arg_settings_trusted = false;
1364                         }
1365 
1366                         break;
1367 
1368                 case ARG_CHDIR:
1369                         if (!path_is_absolute(optarg))
1370                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1371                                                        "Working directory %s is not an absolute path.", optarg);
1372 
1373                         r = free_and_strdup(&arg_chdir, optarg);
1374                         if (r < 0)
1375                                 return log_oom();
1376 
1377                         arg_settings_mask |= SETTING_WORKING_DIRECTORY;
1378                         break;
1379 
1380                 case ARG_PIVOT_ROOT:
1381                         r = pivot_root_parse(&arg_pivot_root_new, &arg_pivot_root_old, optarg);
1382                         if (r < 0)
1383                                 return log_error_errno(r, "Failed to parse --pivot-root= argument %s: %m", optarg);
1384 
1385                         arg_settings_mask |= SETTING_PIVOT_ROOT;
1386                         break;
1387 
1388                 case ARG_NOTIFY_READY:
1389                         r = parse_boolean(optarg);
1390                         if (r < 0)
1391                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1392                                                        "%s is not a valid notify mode. Valid modes are: yes, no, and ready.", optarg);
1393                         arg_notify_ready = r;
1394                         arg_settings_mask |= SETTING_NOTIFY_READY;
1395                         break;
1396 
1397                 case ARG_ROOT_HASH: {
1398                         _cleanup_free_ void *k = NULL;
1399                         size_t l;
1400 
1401                         r = unhexmem(optarg, strlen(optarg), &k, &l);
1402                         if (r < 0)
1403                                 return log_error_errno(r, "Failed to parse root hash: %s", optarg);
1404                         if (l < sizeof(sd_id128_t))
1405                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Root hash must be at least 128bit long: %s", optarg);
1406 
1407                         free_and_replace(arg_verity_settings.root_hash, k);
1408                         arg_verity_settings.root_hash_size = l;
1409                         break;
1410                 }
1411 
1412                 case ARG_ROOT_HASH_SIG: {
1413                         char *value;
1414                         size_t l;
1415                         void *p;
1416 
1417                         if ((value = startswith(optarg, "base64:"))) {
1418                                 r = unbase64mem(value, strlen(value), &p, &l);
1419                                 if (r < 0)
1420                                         return log_error_errno(r, "Failed to parse root hash signature '%s': %m", optarg);
1421 
1422                         } else {
1423                                 r = read_full_file(optarg, (char**) &p, &l);
1424                                 if (r < 0)
1425                                         return log_error_errno(r, "Failed parse root hash signature file '%s': %m", optarg);
1426                         }
1427 
1428                         free_and_replace(arg_verity_settings.root_hash_sig, p);
1429                         arg_verity_settings.root_hash_sig_size = l;
1430                         break;
1431                 }
1432 
1433                 case ARG_VERITY_DATA:
1434                         r = parse_path_argument(optarg, false, &arg_verity_settings.data_path);
1435                         if (r < 0)
1436                                 return r;
1437                         break;
1438 
1439                 case ARG_SYSTEM_CALL_FILTER: {
1440                         bool negative;
1441                         const char *items;
1442 
1443                         negative = optarg[0] == '~';
1444                         items = negative ? optarg + 1 : optarg;
1445 
1446                         for (;;) {
1447                                 _cleanup_free_ char *word = NULL;
1448 
1449                                 r = extract_first_word(&items, &word, NULL, 0);
1450                                 if (r == 0)
1451                                         break;
1452                                 if (r == -ENOMEM)
1453                                         return log_oom();
1454                                 if (r < 0)
1455                                         return log_error_errno(r, "Failed to parse system call filter: %m");
1456 
1457                                 if (negative)
1458                                         r = strv_extend(&arg_syscall_deny_list, word);
1459                                 else
1460                                         r = strv_extend(&arg_syscall_allow_list, word);
1461                                 if (r < 0)
1462                                         return log_oom();
1463                         }
1464 
1465                         arg_settings_mask |= SETTING_SYSCALL_FILTER;
1466                         break;
1467                 }
1468 
1469                 case ARG_RLIMIT: {
1470                         const char *eq;
1471                         _cleanup_free_ char *name = NULL;
1472                         int rl;
1473 
1474                         if (streq(optarg, "help")) {
1475                                 DUMP_STRING_TABLE(rlimit, int, _RLIMIT_MAX);
1476                                 return 0;
1477                         }
1478 
1479                         eq = strchr(optarg, '=');
1480                         if (!eq)
1481                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1482                                                        "--rlimit= expects an '=' assignment.");
1483 
1484                         name = strndup(optarg, eq - optarg);
1485                         if (!name)
1486                                 return log_oom();
1487 
1488                         rl = rlimit_from_string_harder(name);
1489                         if (rl < 0)
1490                                 return log_error_errno(rl, "Unknown resource limit: %s", name);
1491 
1492                         if (!arg_rlimit[rl]) {
1493                                 arg_rlimit[rl] = new0(struct rlimit, 1);
1494                                 if (!arg_rlimit[rl])
1495                                         return log_oom();
1496                         }
1497 
1498                         r = rlimit_parse(rl, eq + 1, arg_rlimit[rl]);
1499                         if (r < 0)
1500                                 return log_error_errno(r, "Failed to parse resource limit: %s", eq + 1);
1501 
1502                         arg_settings_mask |= SETTING_RLIMIT_FIRST << rl;
1503                         break;
1504                 }
1505 
1506                 case ARG_OOM_SCORE_ADJUST:
1507                         r = parse_oom_score_adjust(optarg, &arg_oom_score_adjust);
1508                         if (r < 0)
1509                                 return log_error_errno(r, "Failed to parse --oom-score-adjust= parameter: %s", optarg);
1510 
1511                         arg_oom_score_adjust_set = true;
1512                         arg_settings_mask |= SETTING_OOM_SCORE_ADJUST;
1513                         break;
1514 
1515                 case ARG_CPU_AFFINITY: {
1516                         CPUSet cpuset;
1517 
1518                         r = parse_cpu_set(optarg, &cpuset);
1519                         if (r < 0)
1520                                 return log_error_errno(r, "Failed to parse CPU affinity mask %s: %m", optarg);
1521 
1522                         cpu_set_reset(&arg_cpu_set);
1523                         arg_cpu_set = cpuset;
1524                         arg_settings_mask |= SETTING_CPU_AFFINITY;
1525                         break;
1526                 }
1527 
1528                 case ARG_RESOLV_CONF:
1529                         if (streq(optarg, "help")) {
1530                                 DUMP_STRING_TABLE(resolv_conf_mode, ResolvConfMode, _RESOLV_CONF_MODE_MAX);
1531                                 return 0;
1532                         }
1533 
1534                         arg_resolv_conf = resolv_conf_mode_from_string(optarg);
1535                         if (arg_resolv_conf < 0)
1536                                 return log_error_errno(arg_resolv_conf,
1537                                                        "Failed to parse /etc/resolv.conf mode: %s", optarg);
1538 
1539                         arg_settings_mask |= SETTING_RESOLV_CONF;
1540                         break;
1541 
1542                 case ARG_TIMEZONE:
1543                         if (streq(optarg, "help")) {
1544                                 DUMP_STRING_TABLE(timezone_mode, TimezoneMode, _TIMEZONE_MODE_MAX);
1545                                 return 0;
1546                         }
1547 
1548                         arg_timezone = timezone_mode_from_string(optarg);
1549                         if (arg_timezone < 0)
1550                                 return log_error_errno(arg_timezone,
1551                                                        "Failed to parse /etc/localtime mode: %s", optarg);
1552 
1553                         arg_settings_mask |= SETTING_TIMEZONE;
1554                         break;
1555 
1556                 case ARG_CONSOLE:
1557                         r = handle_arg_console(optarg);
1558                         if (r <= 0)
1559                                 return r;
1560                         break;
1561 
1562                 case 'P':
1563                 case ARG_PIPE:
1564                         r = handle_arg_console("pipe");
1565                         if (r <= 0)
1566                                 return r;
1567                         break;
1568 
1569                 case ARG_NO_PAGER:
1570                         arg_pager_flags |= PAGER_DISABLE;
1571                         break;
1572 
1573                 case ARG_SET_CREDENTIAL: {
1574                         _cleanup_free_ char *word = NULL, *data = NULL;
1575                         const char *p = optarg;
1576                         Credential *a;
1577                         ssize_t l;
1578 
1579                         r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1580                         if (r == -ENOMEM)
1581                                 return log_oom();
1582                         if (r < 0)
1583                                 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1584                         if (r == 0 || !p)
1585                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1586 
1587                         if (!credential_name_valid(word))
1588                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1589 
1590                         for (size_t i = 0; i < arg_n_credentials; i++)
1591                                 if (streq(arg_credentials[i].id, word))
1592                                         return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1593 
1594                         l = cunescape(p, UNESCAPE_ACCEPT_NUL, &data);
1595                         if (l < 0)
1596                                 return log_error_errno(l, "Failed to unescape credential data: %s", p);
1597 
1598                         a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1599                         if (!a)
1600                                 return log_oom();
1601 
1602                         a[arg_n_credentials++] = (Credential) {
1603                                 .id = TAKE_PTR(word),
1604                                 .data = TAKE_PTR(data),
1605                                 .size = l,
1606                         };
1607 
1608                         arg_credentials = a;
1609 
1610                         arg_settings_mask |= SETTING_CREDENTIALS;
1611                         break;
1612                 }
1613 
1614                 case ARG_LOAD_CREDENTIAL: {
1615                         ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
1616                         _cleanup_(erase_and_freep) char *data = NULL;
1617                         _cleanup_free_ char *word = NULL, *j = NULL;
1618                         const char *p = optarg;
1619                         Credential *a;
1620                         size_t size, i;
1621 
1622                         r = extract_first_word(&p, &word, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1623                         if (r == -ENOMEM)
1624                                 return log_oom();
1625                         if (r < 0)
1626                                 return log_error_errno(r, "Failed to parse --set-credential= parameter: %m");
1627                         if (r == 0 || !p)
1628                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Missing value for --set-credential=: %s", optarg);
1629 
1630                         if (!credential_name_valid(word))
1631                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Credential name is not valid: %s", word);
1632 
1633                         for (i = 0; i < arg_n_credentials; i++)
1634                                 if (streq(arg_credentials[i].id, word))
1635                                         return log_error_errno(SYNTHETIC_ERRNO(EEXIST), "Duplicate credential '%s', refusing.", word);
1636 
1637                         if (path_is_absolute(p))
1638                                 flags |= READ_FULL_FILE_CONNECT_SOCKET;
1639                         else {
1640                                 const char *e;
1641 
1642                                 r = get_credentials_dir(&e);
1643                                 if (r < 0)
1644                                         return log_error_errno(r, "Credential not available (no credentials passed at all): %s", word);
1645 
1646                                 j = path_join(e, p);
1647                                 if (!j)
1648                                         return log_oom();
1649                         }
1650 
1651                         r = read_full_file_full(AT_FDCWD, j ?: p, UINT64_MAX, SIZE_MAX,
1652                                                 flags,
1653                                                 NULL,
1654                                                 &data, &size);
1655                         if (r < 0)
1656                                 return log_error_errno(r, "Failed to read credential '%s': %m", j ?: p);
1657 
1658                         a = reallocarray(arg_credentials, arg_n_credentials + 1, sizeof(Credential));
1659                         if (!a)
1660                                 return log_oom();
1661 
1662                         a[arg_n_credentials++] = (Credential) {
1663                                 .id = TAKE_PTR(word),
1664                                 .data = TAKE_PTR(data),
1665                                 .size = size,
1666                         };
1667 
1668                         arg_credentials = a;
1669 
1670                         arg_settings_mask |= SETTING_CREDENTIALS;
1671                         break;
1672                 }
1673 
1674                 case ARG_BIND_USER:
1675                         if (!valid_user_group_name(optarg, 0))
1676                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid user name to bind: %s", optarg);
1677 
1678                         if (strv_extend(&arg_bind_user, optarg) < 0)
1679                                 return log_oom();
1680 
1681                         arg_settings_mask |= SETTING_BIND_USER;
1682                         break;
1683 
1684                 case ARG_SUPPRESS_SYNC:
1685                         r = parse_boolean_argument("--suppress-sync=", optarg, &arg_suppress_sync);
1686                         if (r < 0)
1687                                 return r;
1688 
1689                         arg_settings_mask |= SETTING_SUPPRESS_SYNC;
1690                         break;
1691 
1692                 case '?':
1693                         return -EINVAL;
1694 
1695                 default:
1696                         assert_not_reached();
1697                 }
1698 
1699         if (argc > optind) {
1700                 strv_free(arg_parameters);
1701                 arg_parameters = strv_copy(argv + optind);
1702                 if (!arg_parameters)
1703                         return log_oom();
1704 
1705                 arg_settings_mask |= SETTING_START_MODE;
1706         }
1707 
1708         if (arg_ephemeral && arg_template && !arg_directory)
1709                 /* User asked for ephemeral execution but specified --template= instead of --directory=. Semantically
1710                  * such an invocation makes some sense, see https://github.com/systemd/systemd/issues/3667. Let's
1711                  * accept this here, and silently make "--ephemeral --template=" equivalent to "--ephemeral
1712                  * --directory=". */
1713                 arg_directory = TAKE_PTR(arg_template);
1714 
1715         arg_caps_retain = (arg_caps_retain | plus | (arg_private_network ? UINT64_C(1) << CAP_NET_ADMIN : 0)) & ~minus;
1716 
1717         /* Make sure to parse environment before we reset the settings mask below */
1718         r = parse_environment();
1719         if (r < 0)
1720                 return r;
1721 
1722         /* Load all settings from .nspawn files */
1723         if (mask_no_settings)
1724                 arg_settings_mask = 0;
1725 
1726         /* Don't load any settings from .nspawn files */
1727         if (mask_all_settings)
1728                 arg_settings_mask = _SETTINGS_MASK_ALL;
1729 
1730         return 1;
1731 }
1732 
verify_arguments(void)1733 static int verify_arguments(void) {
1734         int r;
1735 
1736         if (arg_start_mode == START_PID2 && arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
1737                 /* If we are running the stub init in the container, we don't need to look at what the init
1738                  * in the container supports, because we are not using it. Let's immediately pick the right
1739                  * setting based on the host system configuration.
1740                  *
1741                  * We only do this, if the user didn't use an environment variable to override the detection.
1742                  */
1743 
1744                 r = cg_all_unified();
1745                 if (r < 0)
1746                         return log_error_errno(r, "Failed to determine whether we are in all unified mode.");
1747                 if (r > 0)
1748                         arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_ALL;
1749                 else if (cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) > 0)
1750                         arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_SYSTEMD;
1751                 else
1752                         arg_unified_cgroup_hierarchy = CGROUP_UNIFIED_NONE;
1753         }
1754 
1755         if (arg_userns_mode != USER_NAMESPACE_NO)
1756                 arg_mount_settings |= MOUNT_USE_USERNS;
1757 
1758         if (arg_private_network)
1759                 arg_mount_settings |= MOUNT_APPLY_APIVFS_NETNS;
1760 
1761         if (!(arg_clone_ns_flags & CLONE_NEWPID) ||
1762             !(arg_clone_ns_flags & CLONE_NEWUTS)) {
1763                 arg_register = false;
1764                 if (arg_start_mode != START_PID1)
1765                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--boot cannot be used without namespacing.");
1766         }
1767 
1768         if (arg_userns_ownership < 0)
1769                 arg_userns_ownership =
1770                         arg_userns_mode == USER_NAMESPACE_PICK ? USER_NAMESPACE_OWNERSHIP_AUTO :
1771                                                                  USER_NAMESPACE_OWNERSHIP_OFF;
1772 
1773         if (arg_start_mode == START_BOOT && arg_kill_signal <= 0)
1774                 arg_kill_signal = SIGRTMIN+3;
1775 
1776         if (arg_volatile_mode != VOLATILE_NO) /* Make sure all file systems contained in the image are mounted read-only if we are in volatile mode */
1777                 arg_read_only = true;
1778 
1779         if (has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts))
1780                 arg_read_only = true;
1781 
1782         if (arg_keep_unit && arg_register && cg_pid_get_owner_uid(0, NULL) >= 0)
1783                 /* Save the user from accidentally registering either user-$SESSION.scope or user@.service.
1784                  * The latter is not technically a user session, but we don't need to labour the point. */
1785                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--keep-unit --register=yes may not be used when invoked from a user session.");
1786 
1787         if (arg_directory && arg_image)
1788                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--directory= and --image= may not be combined.");
1789 
1790         if (arg_template && arg_image)
1791                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= and --image= may not be combined.");
1792 
1793         if (arg_template && !(arg_directory || arg_machine))
1794                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--template= needs --directory= or --machine=.");
1795 
1796         if (arg_ephemeral && arg_template)
1797                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --template= may not be combined.");
1798 
1799         if (arg_ephemeral && !IN_SET(arg_link_journal, LINK_NO, LINK_AUTO))
1800                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--ephemeral and --link-journal= may not be combined.");
1801 
1802         if (arg_userns_mode != USER_NAMESPACE_NO && !userns_supported())
1803                 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "--private-users= is not supported, kernel compiled without user namespace support.");
1804 
1805         if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_read_only)
1806                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1807                                        "--read-only and --private-users-ownership=chown may not be combined.");
1808 
1809         /* We don't support --private-users-ownership=chown together with any of the volatile modes since we
1810          * couldn't change the read-only part of the tree (i.e. /usr) anyway, or because it would trigger a
1811          * massive copy-up (in case of overlay) making the entire exercise pointless. */
1812         if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_CHOWN && arg_volatile_mode != VOLATILE_NO)
1813                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--volatile= and --private-users-ownership=chown may not be combined.");
1814 
1815         /* If --network-namespace-path is given with any other network-related option (except --private-network),
1816          * we need to error out, to avoid conflicts between different network options. */
1817         if (arg_network_namespace_path &&
1818                 (arg_network_interfaces || arg_network_macvlan ||
1819                  arg_network_ipvlan || arg_network_veth_extra ||
1820                  arg_network_bridge || arg_network_zone ||
1821                  arg_network_veth))
1822                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--network-namespace-path= cannot be combined with other network options.");
1823 
1824         if (arg_network_bridge && arg_network_zone)
1825                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1826                                        "--network-bridge= and --network-zone= may not be combined.");
1827 
1828         if (arg_userns_mode != USER_NAMESPACE_NO && (arg_mount_settings & MOUNT_APPLY_APIVFS_NETNS) && !arg_private_network)
1829                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid namespacing settings. Mounting sysfs with --private-users requires --private-network.");
1830 
1831         if (arg_userns_mode != USER_NAMESPACE_NO && !(arg_mount_settings & MOUNT_APPLY_APIVFS_RO))
1832                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot combine --private-users with read-write mounts.");
1833 
1834         if (arg_expose_ports && !arg_private_network)
1835                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Cannot use --port= without private networking.");
1836 
1837         if (arg_caps_ambient) {
1838                 if (arg_caps_ambient == UINT64_MAX)
1839                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= does not support the value all.");
1840 
1841                 if ((arg_caps_ambient & arg_caps_retain) != arg_caps_ambient)
1842                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not fully covered by Capability= setting.");
1843 
1844                 if (arg_start_mode == START_BOOT)
1845                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "AmbientCapability= setting is not useful for boot mode.");
1846         }
1847 
1848         if (arg_userns_mode == USER_NAMESPACE_NO && !strv_isempty(arg_bind_user))
1849                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "--bind-user= requires --private-users");
1850 
1851         /* Drop duplicate --bind-user= entries */
1852         strv_uniq(arg_bind_user);
1853 
1854         r = custom_mount_check_all();
1855         if (r < 0)
1856                 return r;
1857 
1858         return 0;
1859 }
1860 
userns_lchown(const char * p,uid_t uid,gid_t gid)1861 int userns_lchown(const char *p, uid_t uid, gid_t gid) {
1862         assert(p);
1863 
1864         if (arg_userns_mode == USER_NAMESPACE_NO)
1865                 return 0;
1866 
1867         if (uid == UID_INVALID && gid == GID_INVALID)
1868                 return 0;
1869 
1870         if (uid != UID_INVALID) {
1871                 uid += arg_uid_shift;
1872 
1873                 if (uid < arg_uid_shift || uid >= arg_uid_shift + arg_uid_range)
1874                         return -EOVERFLOW;
1875         }
1876 
1877         if (gid != GID_INVALID) {
1878                 gid += (gid_t) arg_uid_shift;
1879 
1880                 if (gid < (gid_t) arg_uid_shift || gid >= (gid_t) (arg_uid_shift + arg_uid_range))
1881                         return -EOVERFLOW;
1882         }
1883 
1884         return RET_NERRNO(lchown(p, uid, gid));
1885 }
1886 
/* Creates the directory 'path' below 'root' with the given mode and, if it was newly created, hands
 * it to the (shifted) uid/gid via userns_lchown().
 *
 * A directory that already exists is left untouched (neither mode nor ownership is adjusted) and
 * counts as success. Returns 0 on success, a negative errno-style value otherwise. */
int userns_mkdir(const char *root, const char *path, mode_t mode, uid_t uid, gid_t gid) {
        const char *full;
        int r;

        full = prefix_roota(root, path);

        r = RET_NERRNO(mkdir(full, mode));
        if (r >= 0)
                /* Freshly created, so fix up ownership */
                return userns_lchown(full, uid, gid);

        /* Pre-existing directories are fine, everything else is an error */
        return r == -EEXIST ? 0 : r;
}
1900 
/* Matches 'path' against the two spellings of the zoneinfo directory prefix a /etc/localtime symlink
 * is expected to use. The result is whatever PATH_STARTSWITH_SET() yields; callers in this file treat
 * NULL as "no match" and a non-NULL result as the zone name following the prefix. */
static const char *timezone_from_path(const char *path) {
        const char *zone;

        zone = PATH_STARTSWITH_SET(path, "../usr/share/zoneinfo/", "/usr/share/zoneinfo/");

        return zone;
}
1907 
etc_writable(void)1908 static bool etc_writable(void) {
1909         return !arg_read_only || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_OVERLAY);
1910 }
1911 
/* Propagates the host's /etc/localtime into the container tree rooted at 'dest', according to
 * arg_timezone.
 *
 * For TIMEZONE_AUTO the effective mode is picked from what the host's /etc/localtime is (missing,
 * regular file, or symlink) and from whether the container's /etc is writable. The modes are tried
 * in a cascade: if the symlink cannot be used we fall through to a bind mount, and if the bind mount
 * fails we fall through to a plain copy.
 *
 * This is best effort: almost all failures are logged and swallowed (returning 0). The only value
 * propagated to the caller is the result of the read-only remount after a successful bind mount. */
static int setup_timezone(const char *dest) {
        _cleanup_free_ char *p = NULL, *etc = NULL;
        const char *where, *check;
        TimezoneMode m;
        int r;

        assert(dest);

        /* Both the "auto" and the "symlink" modes need to know where the host's symlink points to */
        if (IN_SET(arg_timezone, TIMEZONE_AUTO, TIMEZONE_SYMLINK)) {
                r = readlink_malloc("/etc/localtime", &p);
                if (r == -ENOENT && arg_timezone == TIMEZONE_AUTO)
                        /* Host has no timezone configured: mirror that if we can write, otherwise leave things alone */
                        m = etc_writable() ? TIMEZONE_DELETE : TIMEZONE_OFF;
                else if (r == -EINVAL && arg_timezone == TIMEZONE_AUTO) /* regular file? */
                        m = etc_writable() ? TIMEZONE_COPY : TIMEZONE_BIND;
                else if (r < 0) {
                        log_warning_errno(r, "Failed to read host's /etc/localtime symlink, not updating container timezone: %m");
                        /* To handle warning, delete /etc/localtime and replace it with a symbolic link to a time zone data
                         * file.
                         *
                         * Example:
                         * ln -s /usr/share/zoneinfo/UTC /etc/localtime
                         */
                        return 0;
                } else if (arg_timezone == TIMEZONE_AUTO)
                        /* Host uses a symlink: replicate it if we can write, otherwise bind mount */
                        m = etc_writable() ? TIMEZONE_SYMLINK : TIMEZONE_BIND;
                else
                        /* Explicit TIMEZONE_SYMLINK and the host's symlink was read fine, hence 'p' is set */
                        m = arg_timezone;
        } else
                m = arg_timezone;

        if (m == TIMEZONE_OFF)
                return 0;

        /* Resolve the container's /etc relative to 'dest' */
        r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
        if (r < 0) {
                log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
                return 0;
        }

        where = strjoina(etc, "/localtime");

        switch (m) {

        case TIMEZONE_DELETE:
                /* A file that is already missing is only worth a debug message */
                if (unlink(where) < 0)
                        log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);

                return 0;

        case TIMEZONE_SYMLINK: {
                _cleanup_free_ char *q = NULL;
                const char *z, *what;

                /* Extract the zone name from the host's symlink target */
                z = timezone_from_path(p);
                if (!z) {
                        log_warning("/etc/localtime does not point into /usr/share/zoneinfo/, not updating container timezone.");
                        return 0;
                }

                r = readlink_malloc(where, &q);
                if (r >= 0 && streq_ptr(timezone_from_path(q), z))
                        return 0; /* Already pointing to the right place? Then do nothing .. */

                /* Only create the symlink if the zone file can actually be resolved inside the container */
                check = strjoina(dest, "/usr/share/zoneinfo/", z);
                r = chase_symlinks(check, dest, 0, NULL, NULL);
                if (r < 0)
                        log_debug_errno(r, "Timezone %s does not exist (or is not accessible) in container, not creating symlink: %m", z);
                else {
                        if (unlink(where) < 0 && errno != ENOENT) {
                                log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING, /* Don't complain on read-only images */
                                               errno, "Failed to remove existing timezone info %s in container, ignoring: %m", where);
                                return 0;
                        }

                        what = strjoina("../usr/share/zoneinfo/", z);
                        if (symlink(what, where) < 0) {
                                log_full_errno(IN_SET(errno, EROFS, EACCES, EPERM) ? LOG_DEBUG : LOG_WARNING,
                                               errno, "Failed to correct timezone of container, ignoring: %m");
                                return 0;
                        }

                        /* Symlink created, continue with the ownership fix-up after the switch */
                        break;
                }

                /* Zone not available in the container: try a bind mount of the host's file instead */
                _fallthrough_;
        }

        case TIMEZONE_BIND: {
                _cleanup_free_ char *resolved = NULL;
                int found;

                /* Resolve the mount target; CHASE_NONEXISTENT lets us learn where it would be if missing */
                found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
                if (found < 0) {
                        log_warning_errno(found, "Failed to resolve /etc/localtime path in container, ignoring: %m");
                        return 0;
                }

                if (found == 0) /* missing? */
                        (void) touch(resolved);

                /* Bind mount first, then remount read-only; the result of the remount is what we return */
                r = mount_nofollow_verbose(LOG_WARNING, "/etc/localtime", resolved, NULL, MS_BIND, NULL);
                if (r >= 0)
                        return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);

                _fallthrough_;
        }

        case TIMEZONE_COPY:
                /* If mounting failed, try to copy */
                r = copy_file_atomic("/etc/localtime", where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
                if (r < 0) {
                        /* Read-only or inaccessible images are expected, hence only debug-level for those */
                        log_full_errno(IN_SET(r, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to copy /etc/localtime to %s, ignoring: %m", where);
                        return 0;
                }

                break;

        default:
                assert_not_reached();
        }

        /* Fix permissions of the symlink or file copy we just created */
        r = userns_lchown(where, 0, 0);
        if (r < 0)
                log_warning_errno(r, "Failed to chown /etc/localtime, ignoring: %m");

        return 0;
}
2041 
/* Checks whether 'path' exists. Returns 1 if it does, 0 if it is missing (ENOENT), and a negative
 * errno-style value (logged at debug level) if existence could not be determined. */
static int have_resolv_conf(const char *path) {
        assert(path);

        if (access(path, F_OK) >= 0)
                return 1;

        if (errno == ENOENT)
                return 0;

        return log_debug_errno(errno, "Failed to determine whether '%s' is available: %m", path);
}
2054 
resolved_listening(void)2055 static int resolved_listening(void) {
2056         _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2057         _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
2058         _cleanup_free_ char *dns_stub_listener_mode = NULL;
2059         int r;
2060 
2061         /* Check if resolved is listening */
2062 
2063         r = sd_bus_open_system(&bus);
2064         if (r < 0)
2065                 return log_debug_errno(r, "Failed to open system bus: %m");
2066 
2067         r = bus_name_has_owner(bus, "org.freedesktop.resolve1", NULL);
2068         if (r < 0)
2069                 return log_debug_errno(r, "Failed to check whether the 'org.freedesktop.resolve1' bus name is taken: %m");
2070         if (r == 0)
2071                 return 0;
2072 
2073         r = sd_bus_get_property_string(bus,
2074                                        "org.freedesktop.resolve1",
2075                                        "/org/freedesktop/resolve1",
2076                                        "org.freedesktop.resolve1.Manager",
2077                                        "DNSStubListener",
2078                                        &error,
2079                                        &dns_stub_listener_mode);
2080         if (r < 0)
2081                 return log_debug_errno(r, "Failed to query DNSStubListener property: %s", bus_error_message(&error, r));
2082 
2083         return STR_IN_SET(dns_stub_listener_mode, "udp", "yes");
2084 }
2085 
setup_resolv_conf(const char * dest)2086 static int setup_resolv_conf(const char *dest) {
2087         _cleanup_free_ char *etc = NULL;
2088         const char *where, *what;
2089         ResolvConfMode m;
2090         int r;
2091 
2092         assert(dest);
2093 
2094         if (arg_resolv_conf == RESOLV_CONF_AUTO) {
2095                 if (arg_private_network)
2096                         m = RESOLV_CONF_OFF;
2097                 else if (have_resolv_conf(PRIVATE_STUB_RESOLV_CONF) > 0 && resolved_listening() > 0)
2098                         m = etc_writable() ? RESOLV_CONF_COPY_STUB : RESOLV_CONF_BIND_STUB;
2099                 else if (have_resolv_conf("/etc/resolv.conf") > 0)
2100                         m = etc_writable() ? RESOLV_CONF_COPY_HOST : RESOLV_CONF_BIND_HOST;
2101                 else
2102                         m = etc_writable() ? RESOLV_CONF_DELETE : RESOLV_CONF_OFF;
2103 
2104         } else
2105                 m = arg_resolv_conf;
2106 
2107         if (m == RESOLV_CONF_OFF)
2108                 return 0;
2109 
2110         r = chase_symlinks("/etc", dest, CHASE_PREFIX_ROOT, &etc, NULL);
2111         if (r < 0) {
2112                 log_warning_errno(r, "Failed to resolve /etc path in container, ignoring: %m");
2113                 return 0;
2114         }
2115 
2116         where = strjoina(etc, "/resolv.conf");
2117 
2118         if (m == RESOLV_CONF_DELETE) {
2119                 if (unlink(where) < 0)
2120                         log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno, "Failed to remove '%s', ignoring: %m", where);
2121 
2122                 return 0;
2123         }
2124 
2125         if (IN_SET(m, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_COPY_STATIC))
2126                 what = PRIVATE_STATIC_RESOLV_CONF;
2127         else if (IN_SET(m, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_COPY_UPLINK))
2128                 what = PRIVATE_UPLINK_RESOLV_CONF;
2129         else if (IN_SET(m, RESOLV_CONF_BIND_STUB, RESOLV_CONF_REPLACE_STUB, RESOLV_CONF_COPY_STUB))
2130                 what = PRIVATE_STUB_RESOLV_CONF;
2131         else
2132                 what = "/etc/resolv.conf";
2133 
2134         if (IN_SET(m, RESOLV_CONF_BIND_HOST, RESOLV_CONF_BIND_STATIC, RESOLV_CONF_BIND_UPLINK, RESOLV_CONF_BIND_STUB)) {
2135                 _cleanup_free_ char *resolved = NULL;
2136                 int found;
2137 
2138                 found = chase_symlinks(where, dest, CHASE_NONEXISTENT, &resolved, NULL);
2139                 if (found < 0) {
2140                         log_warning_errno(found, "Failed to resolve /etc/resolv.conf path in container, ignoring: %m");
2141                         return 0;
2142                 }
2143 
2144                 if (found == 0) /* missing? */
2145                         (void) touch(resolved);
2146 
2147                 r = mount_nofollow_verbose(LOG_WARNING, what, resolved, NULL, MS_BIND, NULL);
2148                 if (r >= 0)
2149                         return mount_nofollow_verbose(LOG_ERR, NULL, resolved, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NODEV, NULL);
2150 
2151                 /* If that didn't work, let's copy the file */
2152         }
2153 
2154         if (IN_SET(m, RESOLV_CONF_REPLACE_HOST, RESOLV_CONF_REPLACE_STATIC, RESOLV_CONF_REPLACE_UPLINK, RESOLV_CONF_REPLACE_STUB))
2155                 r = copy_file_atomic(what, where, 0644, 0, 0, COPY_REFLINK|COPY_REPLACE);
2156         else
2157                 r = copy_file(what, where, O_TRUNC|O_NOFOLLOW, 0644, 0, 0, COPY_REFLINK);
2158         if (r < 0) {
2159                 /* If the file already exists as symlink, let's suppress the warning, under the assumption that
2160                  * resolved or something similar runs inside and the symlink points there.
2161                  *
2162                  * If the disk image is read-only, there's also no point in complaining.
2163                  */
2164                 log_full_errno(!IN_SET(RESOLV_CONF_COPY_HOST, RESOLV_CONF_COPY_STATIC, RESOLV_CONF_COPY_UPLINK, RESOLV_CONF_COPY_STUB) &&
2165                                IN_SET(r, -ELOOP, -EROFS, -EACCES, -EPERM) ? LOG_DEBUG : LOG_WARNING, r,
2166                                "Failed to copy /etc/resolv.conf to %s, ignoring: %m", where);
2167                 return 0;
2168         }
2169 
2170         r = userns_lchown(where, 0, 0);
2171         if (r < 0)
2172                 log_warning_errno(r, "Failed to chown /etc/resolv.conf, ignoring: %m");
2173 
2174         return 0;
2175 }
2176 
setup_boot_id(void)2177 static int setup_boot_id(void) {
2178         _cleanup_(unlink_and_freep) char *from = NULL;
2179         _cleanup_free_ char *path = NULL;
2180         sd_id128_t rnd = SD_ID128_NULL;
2181         const char *to;
2182         int r;
2183 
2184         /* Generate a new randomized boot ID, so that each boot-up of the container gets a new one */
2185 
2186         r = tempfn_random_child("/run", "proc-sys-kernel-random-boot-id", &path);
2187         if (r < 0)
2188                 return log_error_errno(r, "Failed to generate random boot ID path: %m");
2189 
2190         r = sd_id128_randomize(&rnd);
2191         if (r < 0)
2192                 return log_error_errno(r, "Failed to generate random boot id: %m");
2193 
2194         r = id128_write(path, ID128_UUID, rnd, false);
2195         if (r < 0)
2196                 return log_error_errno(r, "Failed to write boot id: %m");
2197 
2198         from = TAKE_PTR(path);
2199         to = "/proc/sys/kernel/random/boot_id";
2200 
2201         r = mount_nofollow_verbose(LOG_ERR, from, to, NULL, MS_BIND, NULL);
2202         if (r < 0)
2203                 return r;
2204 
2205         return mount_nofollow_verbose(LOG_ERR, NULL, to, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2206 }
2207 
copy_devnodes(const char * dest)2208 static int copy_devnodes(const char *dest) {
2209         static const char devnodes[] =
2210                 "null\0"
2211                 "zero\0"
2212                 "full\0"
2213                 "random\0"
2214                 "urandom\0"
2215                 "tty\0"
2216                 "net/tun\0";
2217 
2218         const char *d;
2219         int r = 0;
2220 
2221         assert(dest);
2222 
2223         BLOCK_WITH_UMASK(0000);
2224 
2225         /* Create /dev/net, so that we can create /dev/net/tun in it */
2226         if (userns_mkdir(dest, "/dev/net", 0755, 0, 0) < 0)
2227                 return log_error_errno(r, "Failed to create /dev/net directory: %m");
2228 
2229         NULSTR_FOREACH(d, devnodes) {
2230                 _cleanup_free_ char *from = NULL, *to = NULL;
2231                 struct stat st;
2232 
2233                 from = path_join("/dev/", d);
2234                 if (!from)
2235                         return log_oom();
2236 
2237                 to = path_join(dest, from);
2238                 if (!to)
2239                         return log_oom();
2240 
2241                 if (stat(from, &st) < 0) {
2242 
2243                         if (errno != ENOENT)
2244                                 return log_error_errno(errno, "Failed to stat %s: %m", from);
2245 
2246                 } else if (!S_ISCHR(st.st_mode) && !S_ISBLK(st.st_mode))
2247                         return log_error_errno(SYNTHETIC_ERRNO(EIO),
2248                                                "%s is not a char or block device, cannot copy.", from);
2249                 else {
2250                         _cleanup_free_ char *sl = NULL, *prefixed = NULL, *dn = NULL, *t = NULL;
2251 
2252                         if (mknod(to, st.st_mode, st.st_rdev) < 0) {
2253                                 /* Explicitly warn the user when /dev is already populated. */
2254                                 if (errno == EEXIST)
2255                                         log_notice("%s/dev is pre-mounted and pre-populated. If a pre-mounted /dev is provided it needs to be an unpopulated file system.", dest);
2256                                 if (errno != EPERM)
2257                                         return log_error_errno(errno, "mknod(%s) failed: %m", to);
2258 
2259                                 /* Some systems abusively restrict mknod but allow bind mounts. */
2260                                 r = touch(to);
2261                                 if (r < 0)
2262                                         return log_error_errno(r, "touch (%s) failed: %m", to);
2263                                 r = mount_nofollow_verbose(LOG_DEBUG, from, to, NULL, MS_BIND, NULL);
2264                                 if (r < 0)
2265                                         return log_error_errno(r, "Both mknod and bind mount (%s) failed: %m", to);
2266                         }
2267 
2268                         r = userns_lchown(to, 0, 0);
2269                         if (r < 0)
2270                                 return log_error_errno(r, "chown() of device node %s failed: %m", to);
2271 
2272                         dn = path_join("/dev", S_ISCHR(st.st_mode) ? "char" : "block");
2273                         if (!dn)
2274                                 return log_oom();
2275 
2276                         r = userns_mkdir(dest, dn, 0755, 0, 0);
2277                         if (r < 0)
2278                                 return log_error_errno(r, "Failed to create '%s': %m", dn);
2279 
2280                         if (asprintf(&sl, "%s/%u:%u", dn, major(st.st_rdev), minor(st.st_rdev)) < 0)
2281                                 return log_oom();
2282 
2283                         prefixed = path_join(dest, sl);
2284                         if (!prefixed)
2285                                 return log_oom();
2286 
2287                         t = path_join("..", d);
2288                         if (!t)
2289                                 return log_oom();
2290 
2291                         if (symlink(t, prefixed) < 0)
2292                                 log_debug_errno(errno, "Failed to symlink '%s' to '%s': %m", t, prefixed);
2293                 }
2294         }
2295 
2296         return r;
2297 }
2298 
make_extra_nodes(const char * dest)2299 static int make_extra_nodes(const char *dest) {
2300         size_t i;
2301         int r;
2302 
2303         BLOCK_WITH_UMASK(0000);
2304 
2305         for (i = 0; i < arg_n_extra_nodes; i++) {
2306                 _cleanup_free_ char *path = NULL;
2307                 DeviceNode *n = arg_extra_nodes + i;
2308 
2309                 path = path_join(dest, n->path);
2310                 if (!path)
2311                         return log_oom();
2312 
2313                 if (mknod(path, n->mode, S_ISCHR(n->mode) || S_ISBLK(n->mode) ? makedev(n->major, n->minor) : 0) < 0)
2314                         return log_error_errno(errno, "Failed to create device node '%s': %m", path);
2315 
2316                 r = chmod_and_chown(path, n->mode, n->uid, n->gid);
2317                 if (r < 0)
2318                         return log_error_errno(r, "Failed to adjust device node ownership of '%s': %m", path);
2319         }
2320 
2321         return 0;
2322 }
2323 
setup_pts(const char * dest)2324 static int setup_pts(const char *dest) {
2325         _cleanup_free_ char *options = NULL;
2326         const char *p;
2327         int r;
2328 
2329 #if HAVE_SELINUX
2330         if (arg_selinux_apifs_context)
2331                 (void) asprintf(&options,
2332                                 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT ",context=\"%s\"",
2333                                 arg_uid_shift + TTY_GID,
2334                                 arg_selinux_apifs_context);
2335         else
2336 #endif
2337                 (void) asprintf(&options,
2338                                 "newinstance,ptmxmode=0666,mode=620,gid=" GID_FMT,
2339                                 arg_uid_shift + TTY_GID);
2340 
2341         if (!options)
2342                 return log_oom();
2343 
2344         /* Mount /dev/pts itself */
2345         p = prefix_roota(dest, "/dev/pts");
2346         r = RET_NERRNO(mkdir(p, 0755));
2347         if (r < 0)
2348                 return log_error_errno(r, "Failed to create /dev/pts: %m");
2349 
2350         r = mount_nofollow_verbose(LOG_ERR, "devpts", p, "devpts", MS_NOSUID|MS_NOEXEC, options);
2351         if (r < 0)
2352                 return r;
2353         r = userns_lchown(p, 0, 0);
2354         if (r < 0)
2355                 return log_error_errno(r, "Failed to chown /dev/pts: %m");
2356 
2357         /* Create /dev/ptmx symlink */
2358         p = prefix_roota(dest, "/dev/ptmx");
2359         if (symlink("pts/ptmx", p) < 0)
2360                 return log_error_errno(errno, "Failed to create /dev/ptmx symlink: %m");
2361         r = userns_lchown(p, 0, 0);
2362         if (r < 0)
2363                 return log_error_errno(r, "Failed to chown /dev/ptmx: %m");
2364 
2365         /* And fix /dev/pts/ptmx ownership */
2366         p = prefix_roota(dest, "/dev/pts/ptmx");
2367         r = userns_lchown(p, 0, 0);
2368         if (r < 0)
2369                 return log_error_errno(r, "Failed to chown /dev/pts/ptmx: %m");
2370 
2371         return 0;
2372 }
2373 
setup_stdio_as_dev_console(void)2374 static int setup_stdio_as_dev_console(void) {
2375         _cleanup_close_ int terminal = -1;
2376         int r;
2377 
2378         /* We open the TTY in O_NOCTTY mode, so that we do not become controller yet. We'll do that later
2379          * explicitly, if we are configured to. */
2380         terminal = open_terminal("/dev/console", O_RDWR|O_NOCTTY);
2381         if (terminal < 0)
2382                 return log_error_errno(terminal, "Failed to open console: %m");
2383 
2384         /* Make sure we can continue logging to the original stderr, even if
2385          * stderr points elsewhere now */
2386         r = log_dup_console();
2387         if (r < 0)
2388                 return log_error_errno(r, "Failed to duplicate stderr: %m");
2389 
2390         /* invalidates 'terminal' on success and failure */
2391         r = rearrange_stdio(terminal, terminal, terminal);
2392         TAKE_FD(terminal);
2393         if (r < 0)
2394                 return log_error_errno(r, "Failed to move console to stdin/stdout/stderr: %m");
2395 
2396         return 0;
2397 }
2398 
setup_dev_console(const char * console)2399 static int setup_dev_console(const char *console) {
2400         _cleanup_free_ char *p = NULL;
2401         int r;
2402 
2403         /* Create /dev/console symlink */
2404         r = path_make_relative("/dev", console, &p);
2405         if (r < 0)
2406                 return log_error_errno(r, "Failed to create relative path: %m");
2407 
2408         if (symlink(p, "/dev/console") < 0)
2409                 return log_error_errno(errno, "Failed to create /dev/console symlink: %m");
2410 
2411         return 0;
2412 }
2413 
setup_keyring(void)2414 static int setup_keyring(void) {
2415         key_serial_t keyring;
2416 
2417         /* Allocate a new session keyring for the container. This makes sure the keyring of the session
2418          * systemd-nspawn was invoked from doesn't leak into the container. Note that by default we block
2419          * keyctl() and request_key() anyway via seccomp so doing this operation isn't strictly necessary,
2420          * but in case people explicitly allow-list these system calls let's make sure we don't leak anything
2421          * into the container. */
2422 
2423         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2424         if (keyring == -1) {
2425                 if (errno == ENOSYS)
2426                         log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2427                 else if (ERRNO_IS_PRIVILEGE(errno))
2428                         log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2429                 else
2430                         return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2431         }
2432 
2433         return 0;
2434 }
2435 
setup_credentials(const char * root)2436 static int setup_credentials(const char *root) {
2437         const char *q;
2438         int r;
2439 
2440         if (arg_n_credentials <= 0)
2441                 return 0;
2442 
2443         r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2444         if (r < 0)
2445                 return log_error_errno(r, "Failed to create /run/host: %m");
2446 
2447         r = userns_mkdir(root, "/run/host/credentials", 0700, 0, 0);
2448         if (r < 0)
2449                 return log_error_errno(r, "Failed to create /run/host/credentials: %m");
2450 
2451         q = prefix_roota(root, "/run/host/credentials");
2452         r = mount_nofollow_verbose(LOG_ERR, NULL, q, "ramfs", MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0700");
2453         if (r < 0)
2454                 return r;
2455 
2456         for (size_t i = 0; i < arg_n_credentials; i++) {
2457                 _cleanup_free_ char *j = NULL;
2458                 _cleanup_close_ int fd = -1;
2459 
2460                 j = path_join(q, arg_credentials[i].id);
2461                 if (!j)
2462                         return log_oom();
2463 
2464                 fd = open(j, O_CREAT|O_EXCL|O_WRONLY|O_CLOEXEC|O_NOFOLLOW, 0600);
2465                 if (fd < 0)
2466                         return log_error_errno(errno, "Failed to create credential file %s: %m", j);
2467 
2468                 r = loop_write(fd, arg_credentials[i].data, arg_credentials[i].size, /* do_poll= */ false);
2469                 if (r < 0)
2470                         return log_error_errno(r, "Failed to write credential to file %s: %m", j);
2471 
2472                 if (fchmod(fd, 0400) < 0)
2473                         return log_error_errno(errno, "Failed to adjust access mode of %s: %m", j);
2474 
2475                 if (arg_userns_mode != USER_NAMESPACE_NO) {
2476                         if (fchown(fd, arg_uid_shift, arg_uid_shift) < 0)
2477                                 return log_error_errno(errno, "Failed to adjust ownership of %s: %m", j);
2478                 }
2479         }
2480 
2481         if (chmod(q, 0500) < 0)
2482                 return log_error_errno(errno, "Failed to adjust access mode of %s: %m", q);
2483 
2484         r = userns_lchown(q, 0, 0);
2485         if (r < 0)
2486                 return r;
2487 
2488         /* Make both mount and superblock read-only now */
2489         r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2490         if (r < 0)
2491                 return r;
2492 
2493         return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_REMOUNT|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV, "mode=0500");
2494 }
2495 
setup_kmsg(int kmsg_socket)2496 static int setup_kmsg(int kmsg_socket) {
2497         _cleanup_(unlink_and_freep) char *from = NULL;
2498         _cleanup_free_ char *fifo = NULL;
2499         _cleanup_close_ int fd = -1;
2500         int r;
2501 
2502         assert(kmsg_socket >= 0);
2503 
2504         BLOCK_WITH_UMASK(0000);
2505 
2506         /* We create the kmsg FIFO as as temporary file in /run, but immediately delete it after bind mounting it to
2507          * /proc/kmsg. While FIFOs on the reading side behave very similar to /proc/kmsg, their writing side behaves
2508          * differently from /dev/kmsg in that writing blocks when nothing is reading. In order to avoid any problems
2509          * with containers deadlocking due to this we simply make /dev/kmsg unavailable to the container. */
2510 
2511         r = tempfn_random_child("/run", "proc-kmsg", &fifo);
2512         if (r < 0)
2513                 return log_error_errno(r, "Failed to generate kmsg path: %m");
2514 
2515         if (mkfifo(fifo, 0600) < 0)
2516                 return log_error_errno(errno, "mkfifo() for /run/kmsg failed: %m");
2517 
2518         from = TAKE_PTR(fifo);
2519 
2520         r = mount_nofollow_verbose(LOG_ERR, from, "/proc/kmsg", NULL, MS_BIND, NULL);
2521         if (r < 0)
2522                 return r;
2523 
2524         fd = open(from, O_RDWR|O_NONBLOCK|O_CLOEXEC);
2525         if (fd < 0)
2526                 return log_error_errno(errno, "Failed to open fifo: %m");
2527 
2528         /* Store away the fd in the socket, so that it stays open as long as we run the child */
2529         r = send_one_fd(kmsg_socket, fd, 0);
2530         if (r < 0)
2531                 return log_error_errno(r, "Failed to send FIFO fd: %m");
2532 
2533         return 0;
2534 }
2535 
/* State handed as userdata to on_address_change(), which passes these fields on to
 * expose_port_execute(). */
struct ExposeArgs {
        /* Address passed along with AF_INET */
        union in_addr_union address4;
        /* Address passed along with AF_INET6 */
        union in_addr_union address6;
        /* Firewall context; passed by reference (&fw_ctx), so the callee may replace the pointer */
        struct FirewallContext *fw_ctx;
};
2541 
/* Netlink message callback: re-runs expose_port_execute() for both address families, using the
 * struct ExposeArgs passed in as 'userdata'. Failures of either call are deliberately ignored; the
 * callback always returns 0. */
static int on_address_change(sd_netlink *rtnl, sd_netlink_message *m, void *userdata) {
        struct ExposeArgs *expose = userdata;

        assert(rtnl);
        assert(m);
        assert(expose);

        /* IPv4 first ... */
        (void) expose_port_execute(rtnl, &expose->fw_ctx, arg_expose_ports, AF_INET, &expose->address4);

        /* ... then IPv6 */
        (void) expose_port_execute(rtnl, &expose->fw_ctx, arg_expose_ports, AF_INET6, &expose->address6);

        return 0;
}
2553 
setup_hostname(void)2554 static int setup_hostname(void) {
2555         int r;
2556 
2557         if ((arg_clone_ns_flags & CLONE_NEWUTS) == 0)
2558                 return 0;
2559 
2560         r = sethostname_idempotent(arg_hostname ?: arg_machine);
2561         if (r < 0)
2562                 return log_error_errno(r, "Failed to set hostname: %m");
2563 
2564         return 0;
2565 }
2566 
setup_journal(const char * directory)2567 static int setup_journal(const char *directory) {
2568         _cleanup_free_ char *d = NULL;
2569         const char *p, *q;
2570         sd_id128_t this_id;
2571         bool try;
2572         int r;
2573 
2574         /* Don't link journals in ephemeral mode */
2575         if (arg_ephemeral)
2576                 return 0;
2577 
2578         if (arg_link_journal == LINK_NO)
2579                 return 0;
2580 
2581         try = arg_link_journal_try || arg_link_journal == LINK_AUTO;
2582 
2583         r = sd_id128_get_machine(&this_id);
2584         if (r < 0)
2585                 return log_error_errno(r, "Failed to retrieve machine ID: %m");
2586 
2587         if (sd_id128_equal(arg_uuid, this_id)) {
2588                 log_full(try ? LOG_WARNING : LOG_ERR,
2589                          "Host and machine ids are equal (%s): refusing to link journals", SD_ID128_TO_STRING(arg_uuid));
2590                 if (try)
2591                         return 0;
2592                 return -EEXIST;
2593         }
2594 
2595         FOREACH_STRING(dirname, "/var", "/var/log", "/var/log/journal") {
2596                 r = userns_mkdir(directory, dirname, 0755, 0, 0);
2597                 if (r < 0) {
2598                         bool ignore = r == -EROFS && try;
2599                         log_full_errno(ignore ? LOG_DEBUG : LOG_ERR, r,
2600                                        "Failed to create %s%s: %m", dirname, ignore ? ", ignoring" : "");
2601                         return ignore ? 0 : r;
2602                 }
2603         }
2604 
2605         p = strjoina("/var/log/journal/", SD_ID128_TO_STRING(arg_uuid));
2606         q = prefix_roota(directory, p);
2607 
2608         if (path_is_mount_point(p, NULL, 0) > 0) {
2609                 if (try)
2610                         return 0;
2611 
2612                 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2613                                        "%s: already a mount point, refusing to use for journal", p);
2614         }
2615 
2616         if (path_is_mount_point(q, NULL, 0) > 0) {
2617                 if (try)
2618                         return 0;
2619 
2620                 return log_error_errno(SYNTHETIC_ERRNO(EEXIST),
2621                                        "%s: already a mount point, refusing to use for journal", q);
2622         }
2623 
2624         r = readlink_and_make_absolute(p, &d);
2625         if (r >= 0) {
2626                 if (IN_SET(arg_link_journal, LINK_GUEST, LINK_AUTO) &&
2627                     path_equal(d, q)) {
2628 
2629                         r = userns_mkdir(directory, p, 0755, 0, 0);
2630                         if (r < 0)
2631                                 log_warning_errno(r, "Failed to create directory %s: %m", q);
2632                         return 0;
2633                 }
2634 
2635                 if (unlink(p) < 0)
2636                         return log_error_errno(errno, "Failed to remove symlink %s: %m", p);
2637         } else if (r == -EINVAL) {
2638 
2639                 if (arg_link_journal == LINK_GUEST &&
2640                     rmdir(p) < 0) {
2641 
2642                         if (errno == ENOTDIR) {
2643                                 log_error("%s already exists and is neither a symlink nor a directory", p);
2644                                 return r;
2645                         } else
2646                                 return log_error_errno(errno, "Failed to remove %s: %m", p);
2647                 }
2648         } else if (r != -ENOENT)
2649                 return log_error_errno(r, "readlink(%s) failed: %m", p);
2650 
2651         if (arg_link_journal == LINK_GUEST) {
2652 
2653                 if (symlink(q, p) < 0) {
2654                         if (try) {
2655                                 log_debug_errno(errno, "Failed to symlink %s to %s, skipping journal setup: %m", q, p);
2656                                 return 0;
2657                         } else
2658                                 return log_error_errno(errno, "Failed to symlink %s to %s: %m", q, p);
2659                 }
2660 
2661                 r = userns_mkdir(directory, p, 0755, 0, 0);
2662                 if (r < 0)
2663                         log_warning_errno(r, "Failed to create directory %s: %m", q);
2664                 return 0;
2665         }
2666 
2667         if (arg_link_journal == LINK_HOST) {
2668                 /* don't create parents here — if the host doesn't have
2669                  * permanent journal set up, don't force it here */
2670 
2671                 r = RET_NERRNO(mkdir(p, 0755));
2672                 if (r < 0 && r != -EEXIST) {
2673                         if (try) {
2674                                 log_debug_errno(r, "Failed to create %s, skipping journal setup: %m", p);
2675                                 return 0;
2676                         } else
2677                                 return log_error_errno(r, "Failed to create %s: %m", p);
2678                 }
2679 
2680         } else if (access(p, F_OK) < 0)
2681                 return 0;
2682 
2683         if (dir_is_empty(q, /* ignore_hidden_or_backup= */ false) == 0)
2684                 log_warning("%s is not empty, proceeding anyway.", q);
2685 
2686         r = userns_mkdir(directory, p, 0755, 0, 0);
2687         if (r < 0)
2688                 return log_error_errno(r, "Failed to create %s: %m", q);
2689 
2690         r = mount_nofollow_verbose(LOG_DEBUG, p, q, NULL, MS_BIND, NULL);
2691         if (r < 0)
2692                 return log_error_errno(errno, "Failed to bind mount journal from host into guest: %m");
2693 
2694         return 0;
2695 }
2696 
drop_capabilities(uid_t uid)2697 static int drop_capabilities(uid_t uid) {
2698         CapabilityQuintet q;
2699 
2700         /* Let's initialize all five capability sets to something valid. If the quintet was configured via
2701          * OCI use that, but fill in missing bits. If it wasn't then derive the quintet in full from
2702          * arg_caps_retain. */
2703 
2704         if (capability_quintet_is_set(&arg_full_capabilities)) {
2705                 q = arg_full_capabilities;
2706 
2707                 if (q.bounding == UINT64_MAX)
2708                         q.bounding = uid == 0 ? arg_caps_retain : 0;
2709 
2710                 if (q.effective == UINT64_MAX)
2711                         q.effective = uid == 0 ? q.bounding : 0;
2712 
2713                 if (q.inheritable == UINT64_MAX)
2714                         q.inheritable = uid == 0 ? q.bounding : arg_caps_ambient;
2715 
2716                 if (q.permitted == UINT64_MAX)
2717                         q.permitted = uid == 0 ? q.bounding : arg_caps_ambient;
2718 
2719                 if (q.ambient == UINT64_MAX && ambient_capabilities_supported())
2720                         q.ambient = arg_caps_ambient;
2721 
2722                 if (capability_quintet_mangle(&q))
2723                         return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Cannot set capabilities that are not in the current bounding set.");
2724 
2725         } else {
2726                 q = (CapabilityQuintet) {
2727                         .bounding = arg_caps_retain,
2728                         .effective = uid == 0 ? arg_caps_retain : 0,
2729                         .inheritable = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2730                         .permitted = uid == 0 ? arg_caps_retain : arg_caps_ambient,
2731                         .ambient = ambient_capabilities_supported() ? arg_caps_ambient : UINT64_MAX,
2732                 };
2733 
2734                 /* If we're not using OCI, proceed with mangled capabilities (so we don't error out)
2735                  * in order to maintain the same behavior as systemd < 242. */
2736                 if (capability_quintet_mangle(&q))
2737                         log_full(arg_quiet ? LOG_DEBUG : LOG_WARNING,
2738                                  "Some capabilities will not be set because they are not in the current bounding set.");
2739 
2740         }
2741 
2742         return capability_quintet_enforce(&q);
2743 }
2744 
reset_audit_loginuid(void)2745 static int reset_audit_loginuid(void) {
2746         _cleanup_free_ char *p = NULL;
2747         int r;
2748 
2749         if ((arg_clone_ns_flags & CLONE_NEWPID) == 0)
2750                 return 0;
2751 
2752         r = read_one_line_file("/proc/self/loginuid", &p);
2753         if (r == -ENOENT)
2754                 return 0;
2755         if (r < 0)
2756                 return log_error_errno(r, "Failed to read /proc/self/loginuid: %m");
2757 
2758         /* Already reset? */
2759         if (streq(p, "4294967295"))
2760                 return 0;
2761 
2762         r = write_string_file("/proc/self/loginuid", "4294967295", WRITE_STRING_FILE_DISABLE_BUFFER);
2763         if (r < 0) {
2764                 log_error_errno(r,
2765                                 "Failed to reset audit login UID. This probably means that your kernel is too\n"
2766                                 "old and you have audit enabled. Note that the auditing subsystem is known to\n"
2767                                 "be incompatible with containers on old kernels. Please make sure to upgrade\n"
2768                                 "your kernel or to off auditing with 'audit=0' on the kernel command line before\n"
2769                                 "using systemd-nspawn. Sleeping for 5s... (%m)");
2770 
2771                 sleep(5);
2772         }
2773 
2774         return 0;
2775 }
2776 
setup_propagate(const char * root)2777 static int setup_propagate(const char *root) {
2778         const char *p, *q;
2779         int r;
2780 
2781         (void) mkdir_p("/run/systemd/nspawn/", 0755);
2782         (void) mkdir_p("/run/systemd/nspawn/propagate", 0600);
2783         p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
2784         (void) mkdir_p(p, 0600);
2785 
2786         r = userns_mkdir(root, "/run/host", 0755, 0, 0);
2787         if (r < 0)
2788                 return log_error_errno(r, "Failed to create /run/host: %m");
2789 
2790         r = userns_mkdir(root, "/run/host/incoming", 0600, 0, 0);
2791         if (r < 0)
2792                 return log_error_errno(r, "Failed to create /run/host/incoming: %m");
2793 
2794         q = prefix_roota(root, "/run/host/incoming");
2795         r = mount_nofollow_verbose(LOG_ERR, p, q, NULL, MS_BIND, NULL);
2796         if (r < 0)
2797                 return r;
2798 
2799         r = mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
2800         if (r < 0)
2801                 return r;
2802 
2803         /* machined will MS_MOVE into that directory, and that's only supported for non-shared mounts. */
2804         return mount_nofollow_verbose(LOG_ERR, NULL, q, NULL, MS_SLAVE, NULL);
2805 }
2806 
setup_machine_id(const char * directory)2807 static int setup_machine_id(const char *directory) {
2808         const char *etc_machine_id;
2809         sd_id128_t id;
2810         int r;
2811 
2812         /* If the UUID in the container is already set, then that's what counts, and we use. If it isn't set, and the
2813          * caller passed --uuid=, then we'll pass it in the $container_uuid env var to PID 1 of the container. The
2814          * assumption is that PID 1 will then write it to /etc/machine-id to make it persistent. If --uuid= is not
2815          * passed we generate a random UUID, and pass it via $container_uuid. In effect this means that /etc/machine-id
2816          * in the container and our idea of the container UUID will always be in sync (at least if PID 1 in the
2817          * container behaves nicely). */
2818 
2819         etc_machine_id = prefix_roota(directory, "/etc/machine-id");
2820 
2821         r = id128_read(etc_machine_id, ID128_PLAIN_OR_UNINIT, &id);
2822         if (r < 0) {
2823                 if (!IN_SET(r, -ENOENT, -ENOMEDIUM)) /* If the file is missing or empty, we don't mind */
2824                         return log_error_errno(r, "Failed to read machine ID from container image: %m");
2825 
2826                 if (sd_id128_is_null(arg_uuid)) {
2827                         r = sd_id128_randomize(&arg_uuid);
2828                         if (r < 0)
2829                                 return log_error_errno(r, "Failed to acquire randomized machine UUID: %m");
2830                 }
2831         } else {
2832                 if (sd_id128_is_null(id))
2833                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2834                                                "Machine ID in container image is zero, refusing.");
2835 
2836                 arg_uuid = id;
2837         }
2838 
2839         return 0;
2840 }
2841 
recursive_chown(const char * directory,uid_t shift,uid_t range)2842 static int recursive_chown(const char *directory, uid_t shift, uid_t range) {
2843         int r;
2844 
2845         assert(directory);
2846 
2847         if (arg_userns_mode == USER_NAMESPACE_NO || arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_CHOWN)
2848                 return 0;
2849 
2850         r = path_patch_uid(directory, arg_uid_shift, arg_uid_range);
2851         if (r == -EOPNOTSUPP)
2852                 return log_error_errno(r, "Automatic UID/GID adjusting is only supported for UID/GID ranges starting at multiples of 2^16 with a range of 2^16.");
2853         if (r == -EBADE)
2854                 return log_error_errno(r, "Upper 16 bits of root directory UID and GID do not match.");
2855         if (r < 0)
2856                 return log_error_errno(r, "Failed to adjust UID/GID shift of OS tree: %m");
2857         if (r == 0)
2858                 log_debug("Root directory of image is already owned by the right UID/GID range, skipping recursive chown operation.");
2859         else
2860                 log_debug("Patched directory tree to match UID/GID range.");
2861 
2862         return r;
2863 }
2864 
2865 /*
2866  * Return values:
2867  * < 0 : wait_for_terminate() failed to get the state of the
2868  *       container, the container was terminated by a signal, or
2869  *       failed for an unknown reason.  No change is made to the
2870  *       container argument.
2871  * > 0 : The program executed in the container terminated with an
2872  *       error.  The exit code of the program executed in the
2873  *       container is returned.  The container argument has been set
2874  *       to CONTAINER_TERMINATED.
2875  *   0 : The container is being rebooted, has been shut down or exited
2876  *       successfully.  The container argument has been set to either
2877  *       CONTAINER_TERMINATED or CONTAINER_REBOOTED.
2878  *
2879  * That is, success is indicated by a return value of zero, and an
2880  * error is indicated by a non-zero value.
2881  */
static int wait_for_container(pid_t pid, ContainerStatus *container) {
        siginfo_t info;
        int level, r;

        r = wait_for_terminate(pid, &info);
        if (r < 0)
                return log_warning_errno(r, "Failed to wait for container: %m");

        /* Informational messages are demoted to debug level in quiet mode. */
        level = arg_quiet ? LOG_DEBUG : LOG_INFO;

        /* Regular exit: pass the exit code through to the caller. */
        if (info.si_code == CLD_EXITED) {
                if (info.si_status != 0)
                        log_full(level, "Container %s failed with error code %i.", arg_machine, info.si_status);
                else
                        log_full(level, "Container %s exited successfully.", arg_machine);

                *container = CONTAINER_TERMINATED;
                return info.si_status;
        }

        /* Killed by SIGINT: treated as an orderly shutdown. */
        if (info.si_code == CLD_KILLED && info.si_status == SIGINT) {
                log_full(level, "Container %s has been shut down.", arg_machine);
                *container = CONTAINER_TERMINATED;
                return 0;
        }

        /* Killed by SIGHUP: treated as a reboot request. */
        if (info.si_code == CLD_KILLED && info.si_status == SIGHUP) {
                log_full(level, "Container %s is being rebooted.", arg_machine);
                *container = CONTAINER_REBOOTED;
                return 0;
        }

        /* Any other signal, with or without core dump, is a failure. */
        if (info.si_code == CLD_KILLED || info.si_code == CLD_DUMPED)
                return log_error_errno(SYNTHETIC_ERRNO(EIO),
                                       "Container %s terminated by signal %s.", arg_machine, signal_to_string(info.si_status));

        return log_error_errno(SYNTHETIC_ERRNO(EIO),
                               "Container %s failed due to unknown reason.", arg_machine);
}
2923 
/* Signal event handler for shutdown requests. The PID to signal is carried in 'userdata'.
 *
 * If a PID is set and sending arg_kill_signal to it succeeds, the userdata of the event source is
 * cleared, so that the next invocation takes the other path. In every other case the event loop
 * is asked to exit. Always returns 0. */
static int on_orderly_shutdown(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        pid_t pid = PTR_TO_PID(userdata);

        if (pid > 0 && kill(pid, arg_kill_signal) >= 0) {
                log_info("Trying to halt container. Send SIGTERM again to trigger immediate termination.");
                sd_event_source_set_userdata(s, NULL);
                return 0;
        }

        /* No PID left to signal, or signalling failed: leave the event loop. */
        sd_event_exit(sd_event_source_get_event(s), 0);
        return 0;
}
2939 
/* SIGCHLD event handler. The PID of the main process we care about is carried in 'userdata'.
 *
 * Reaps every exited child except that main process. Once the main process is found to have
 * exited, the event loop is asked to exit and its zombie is left in place.
 *
 * Returns 0, or a negative errno-style value if waitid() fails. */
static int on_sigchld(sd_event_source *s, const struct signalfd_siginfo *ssi, void *userdata) {
        pid_t main_pid;

        assert(s);
        assert(ssi);

        main_pid = PTR_TO_PID(userdata);

        for (;;) {
                siginfo_t child = {};

                /* Peek at the next exited child without reaping it (WNOWAIT). */
                if (waitid(P_ALL, 0, &child, WNOHANG|WNOWAIT|WEXITED) < 0)
                        return log_error_errno(errno, "Failed to waitid(): %m");

                /* No pending children. */
                if (child.si_pid == 0)
                        return 0;

                /* The main process we care for has exited. Return from the handler but leave the
                 * zombie. */
                if (child.si_pid == main_pid) {
                        sd_event_exit(sd_event_source_get_event(s), 0);
                        return 0;
                }

                /* Reap all other children. */
                (void) waitid(P_PID, child.si_pid, &child, WNOHANG|WEXITED);
        }
}
2968 
on_request_stop(sd_bus_message * m,void * userdata,sd_bus_error * error)2969 static int on_request_stop(sd_bus_message *m, void *userdata, sd_bus_error *error) {
2970         pid_t pid;
2971 
2972         assert(m);
2973 
2974         pid = PTR_TO_PID(userdata);
2975 
2976         if (arg_kill_signal > 0) {
2977                 log_info("Container termination requested. Attempting to halt container.");
2978                 (void) kill(pid, arg_kill_signal);
2979         } else {
2980                 log_info("Container termination requested. Exiting.");
2981                 sd_event_exit(sd_bus_get_event(sd_bus_message_get_bus(m)), 0);
2982         }
2983 
2984         return 0;
2985 }
2986 
determine_names(void)2987 static int determine_names(void) {
2988         int r;
2989 
2990         if (arg_template && !arg_directory && arg_machine) {
2991 
2992                 /* If --template= was specified then we should not
2993                  * search for a machine, but instead create a new one
2994                  * in /var/lib/machine. */
2995 
2996                 arg_directory = path_join("/var/lib/machines", arg_machine);
2997                 if (!arg_directory)
2998                         return log_oom();
2999         }
3000 
3001         if (!arg_image && !arg_directory) {
3002                 if (arg_machine) {
3003                         _cleanup_(image_unrefp) Image *i = NULL;
3004 
3005                         r = image_find(IMAGE_MACHINE, arg_machine, NULL, &i);
3006                         if (r == -ENOENT)
3007                                 return log_error_errno(r, "No image for machine '%s'.", arg_machine);
3008                         if (r < 0)
3009                                 return log_error_errno(r, "Failed to find image for machine '%s': %m", arg_machine);
3010 
3011                         if (IN_SET(i->type, IMAGE_RAW, IMAGE_BLOCK))
3012                                 r = free_and_strdup(&arg_image, i->path);
3013                         else
3014                                 r = free_and_strdup(&arg_directory, i->path);
3015                         if (r < 0)
3016                                 return log_oom();
3017 
3018                         if (!arg_ephemeral)
3019                                 arg_read_only = arg_read_only || i->read_only;
3020                 } else {
3021                         r = safe_getcwd(&arg_directory);
3022                         if (r < 0)
3023                                 return log_error_errno(r, "Failed to determine current directory: %m");
3024                 }
3025 
3026                 if (!arg_directory && !arg_image)
3027                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine path, please use -D or -i.");
3028         }
3029 
3030         if (!arg_machine) {
3031                 if (arg_directory && path_equal(arg_directory, "/"))
3032                         arg_machine = gethostname_malloc();
3033                 else if (arg_image) {
3034                         char *e;
3035 
3036                         arg_machine = strdup(basename(arg_image));
3037 
3038                         /* Truncate suffix if there is one */
3039                         e = endswith(arg_machine, ".raw");
3040                         if (e)
3041                                 *e = 0;
3042                 } else
3043                         arg_machine = strdup(basename(arg_directory));
3044                 if (!arg_machine)
3045                         return log_oom();
3046 
3047                 hostname_cleanup(arg_machine);
3048                 if (!hostname_is_valid(arg_machine, 0))
3049                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to determine machine name automatically, please use -M.");
3050 
3051                 /* Copy the machine name before the random suffix is added below, otherwise we won't be able
3052                  * to match fixed config file names. */
3053                 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3054                 if (!arg_settings_filename)
3055                         return log_oom();
3056 
3057                 /* Add a random suffix when this is an ephemeral machine, so that we can run many
3058                  * instances at once without manually having to specify -M each time. */
3059                 if (arg_ephemeral)
3060                         if (strextendf(&arg_machine, "-%016" PRIx64, random_u64()) < 0)
3061                                 return log_oom();
3062         } else {
3063                 arg_settings_filename = strjoin(arg_machine, ".nspawn");
3064                 if (!arg_settings_filename)
3065                         return log_oom();
3066         }
3067 
3068         return 0;
3069 }
3070 
/* Resolves the path stored in *p via chase_symlinks() (with the given flags and no root
 * directory) and replaces *p with the result, freeing the old string. A NULL *p is left alone.
 *
 * Returns 0 if *p is NULL, the value of free_and_replace() on success, or a negative errno-style
 * value (already logged) if resolution fails, in which case *p is unchanged. */
static int chase_symlinks_and_update(char **p, unsigned flags) {
        char *resolved;
        int r;

        assert(p);

        /* Nothing configured, nothing to resolve. */
        if (*p == NULL)
                return 0;

        r = chase_symlinks(*p, NULL, flags, &resolved, NULL);
        if (r >= 0)
                return free_and_replace(*p, resolved);

        return log_error_errno(r, "Failed to resolve path %s: %m", *p);
}
3086 
determine_uid_shift(const char * directory)3087 static int determine_uid_shift(const char *directory) {
3088 
3089         if (arg_userns_mode == USER_NAMESPACE_NO) {
3090                 arg_uid_shift = 0;
3091                 return 0;
3092         }
3093 
3094         if (arg_uid_shift == UID_INVALID) {
3095                 struct stat st;
3096 
3097                 /* Read the UID shift off the image. Maybe we can reuse this to avoid chowning. */
3098 
3099                 if (stat(directory, &st) < 0)
3100                         return log_error_errno(errno, "Failed to determine UID base of %s: %m", directory);
3101 
3102                 arg_uid_shift = st.st_uid & UINT32_C(0xffff0000);
3103 
3104                 if (arg_uid_shift != (st.st_gid & UINT32_C(0xffff0000)))
3105                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3106                                                "UID and GID base of %s don't match.", directory);
3107 
3108                 arg_uid_range = UINT32_C(0x10000);
3109 
3110                 if (arg_uid_shift != 0) {
3111                         /* If the image is shifted already, then we'll fall back to classic chowning, for
3112                          * compatibility (and simplicity), or refuse if mapping is explicitly requested.  */
3113 
3114                         if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_AUTO) {
3115                                 log_debug("UID base of %s is non-zero, not using UID mapping.", directory);
3116                                 arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3117                         } else if (arg_userns_ownership == USER_NAMESPACE_OWNERSHIP_MAP)
3118                                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
3119                                                        "UID base of %s is not zero, UID mapping not supported.", directory);
3120                 }
3121         }
3122 
3123         if (!userns_shift_range_valid(arg_uid_shift, arg_uid_range))
3124                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "UID base too high for UID range.");
3125 
3126         return 0;
3127 }
3128 
effective_clone_ns_flags(void)3129 static unsigned long effective_clone_ns_flags(void) {
3130         unsigned long flags = arg_clone_ns_flags;
3131 
3132         if (arg_private_network)
3133                 flags |= CLONE_NEWNET;
3134         if (arg_use_cgns)
3135                 flags |= CLONE_NEWCGROUP;
3136         if (arg_userns_mode != USER_NAMESPACE_NO)
3137                 flags |= CLONE_NEWUSER;
3138 
3139         return flags;
3140 }
3141 
patch_sysctl(void)3142 static int patch_sysctl(void) {
3143 
3144         /* This table is inspired by runc's sysctl() function */
3145         static const struct {
3146                 const char *key;
3147                 bool prefix;
3148                 unsigned long clone_flags;
3149         } safe_sysctl[] = {
3150                 { "kernel.hostname",   false, CLONE_NEWUTS },
3151                 { "kernel.domainname", false, CLONE_NEWUTS },
3152                 { "kernel.msgmax",     false, CLONE_NEWIPC },
3153                 { "kernel.msgmnb",     false, CLONE_NEWIPC },
3154                 { "kernel.msgmni",     false, CLONE_NEWIPC },
3155                 { "kernel.sem",        false, CLONE_NEWIPC },
3156                 { "kernel.shmall",     false, CLONE_NEWIPC },
3157                 { "kernel.shmmax",     false, CLONE_NEWIPC },
3158                 { "kernel.shmmni",     false, CLONE_NEWIPC },
3159                 { "fs.mqueue.",        true,  CLONE_NEWIPC },
3160                 { "net.",              true,  CLONE_NEWNET },
3161         };
3162 
3163         unsigned long flags;
3164         int r;
3165 
3166         flags = effective_clone_ns_flags();
3167 
3168         STRV_FOREACH_PAIR(k, v, arg_sysctl) {
3169                 bool good = false;
3170                 size_t i;
3171 
3172                 for (i = 0; i < ELEMENTSOF(safe_sysctl); i++) {
3173 
3174                         if (!FLAGS_SET(flags, safe_sysctl[i].clone_flags))
3175                                 continue;
3176 
3177                         if (safe_sysctl[i].prefix)
3178                                 good = startswith(*k, safe_sysctl[i].key);
3179                         else
3180                                 good = streq(*k, safe_sysctl[i].key);
3181 
3182                         if (good)
3183                                 break;
3184                 }
3185 
3186                 if (!good)
3187                         return log_error_errno(SYNTHETIC_ERRNO(EPERM), "Refusing to write to sysctl '%s', as it is not safe in the selected namespaces.", *k);
3188 
3189                 r = sysctl_write(*k, *v);
3190                 if (r < 0)
3191                         return log_error_errno(r, "Failed to write sysctl '%s': %m", *k);
3192         }
3193 
3194         return 0;
3195 }
3196 
inner_child(Barrier * barrier,const char * directory,bool secondary,int kmsg_socket,int rtnl_socket,int master_pty_socket,FDSet * fds,char ** os_release_pairs)3197 static int inner_child(
3198                 Barrier *barrier,
3199                 const char *directory,
3200                 bool secondary,
3201                 int kmsg_socket,
3202                 int rtnl_socket,
3203                 int master_pty_socket,
3204                 FDSet *fds,
3205                 char **os_release_pairs) {
3206 
3207         _cleanup_free_ char *home = NULL;
3208         size_t n_env = 1;
3209         char *envp[] = {
3210                 (char*) "PATH=" DEFAULT_PATH_COMPAT,
3211                 NULL, /* container */
3212                 NULL, /* TERM */
3213                 NULL, /* HOME */
3214                 NULL, /* USER */
3215                 NULL, /* LOGNAME */
3216                 NULL, /* container_uuid */
3217                 NULL, /* LISTEN_FDS */
3218                 NULL, /* LISTEN_PID */
3219                 NULL, /* NOTIFY_SOCKET */
3220                 NULL, /* CREDENTIALS_DIRECTORY */
3221                 NULL, /* LANG */
3222                 NULL
3223         };
3224         const char *exec_target;
3225         _cleanup_strv_free_ char **env_use = NULL;
3226         int r, which_failed;
3227 
3228         /* This is the "inner" child process, i.e. the one forked off by the "outer" child process, which is the one
3229          * the container manager itself forked off. At the time of clone() it gained its own CLONE_NEWNS, CLONE_NEWPID,
3230          * CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER namespaces. Note that it has its own CLONE_NEWNS namespace,
3231          * separate from the CLONE_NEWNS created for the "outer" child, and also separate from the host's CLONE_NEWNS
3232          * namespace. The reason for having two levels of CLONE_NEWNS namespaces is that the "inner" one is owned by
3233          * the CLONE_NEWUSER namespace of the container, while the "outer" one is owned by the host's CLONE_NEWUSER
3234          * namespace.
3235          *
3236          * Note at this point we have no CLONE_NEWNET namespace yet. We'll acquire that one later through
3237          * unshare(). See below. */
3238 
3239         assert(barrier);
3240         assert(directory);
3241         assert(kmsg_socket >= 0);
3242 
3243         log_debug("Inner child is initializing.");
3244 
3245         if (arg_userns_mode != USER_NAMESPACE_NO) {
3246                 /* Tell the parent, that it now can write the UID map. */
3247                 (void) barrier_place(barrier); /* #1 */
3248 
3249                 /* Wait until the parent wrote the UID map */
3250                 if (!barrier_place_and_sync(barrier)) /* #2 */
3251                         return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
3252 
3253                 /* Become the new root user inside our namespace */
3254                 r = reset_uid_gid();
3255                 if (r < 0)
3256                         return log_error_errno(r, "Couldn't become new root: %m");
3257 
3258                 /* Creating a new user namespace means all MS_SHARED mounts become MS_SLAVE. Let's put them
3259                  * back to MS_SHARED here, since that's what we want as defaults. (This will not reconnect
3260                  * propagation, but simply create new peer groups for all our mounts). */
3261                 r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SHARED|MS_REC, NULL);
3262                 if (r < 0)
3263                         return r;
3264         }
3265 
3266         r = mount_all(NULL,
3267                       arg_mount_settings | MOUNT_IN_USERNS,
3268                       arg_uid_shift,
3269                       arg_selinux_apifs_context);
3270         if (r < 0)
3271                 return r;
3272 
3273         if (!arg_network_namespace_path && arg_private_network) {
3274                 r = unshare(CLONE_NEWNET);
3275                 if (r < 0)
3276                         return log_error_errno(errno, "Failed to unshare network namespace: %m");
3277 
3278                 /* Tell the parent that it can setup network interfaces. */
3279                 (void) barrier_place(barrier); /* #3 */
3280         }
3281 
3282         r = mount_sysfs(NULL, arg_mount_settings);
3283         if (r < 0)
3284                 return r;
3285 
3286         /* Wait until we are cgroup-ified, so that we
3287          * can mount the right cgroup path writable */
3288         if (!barrier_place_and_sync(barrier)) /* #4 */
3289                 return log_error_errno(SYNTHETIC_ERRNO(ESRCH),
3290                                        "Parent died too early");
3291 
3292         if (arg_use_cgns) {
3293                 r = unshare(CLONE_NEWCGROUP);
3294                 if (r < 0)
3295                         return log_error_errno(errno, "Failed to unshare cgroup namespace: %m");
3296                 r = mount_cgroups(
3297                                 "",
3298                                 arg_unified_cgroup_hierarchy,
3299                                 arg_userns_mode != USER_NAMESPACE_NO,
3300                                 arg_uid_shift,
3301                                 arg_uid_range,
3302                                 arg_selinux_apifs_context,
3303                                 true);
3304         } else
3305                 r = mount_systemd_cgroup_writable("", arg_unified_cgroup_hierarchy);
3306         if (r < 0)
3307                 return r;
3308 
3309         r = setup_boot_id();
3310         if (r < 0)
3311                 return r;
3312 
3313         r = setup_kmsg(kmsg_socket);
3314         if (r < 0)
3315                 return r;
3316         kmsg_socket = safe_close(kmsg_socket);
3317 
3318         r = mount_custom(
3319                         "/",
3320                         arg_custom_mounts,
3321                         arg_n_custom_mounts,
3322                         0,
3323                         0,
3324                         arg_selinux_apifs_context,
3325                         MOUNT_NON_ROOT_ONLY | MOUNT_IN_USERNS);
3326         if (r < 0)
3327                 return r;
3328 
3329         if (setsid() < 0)
3330                 return log_error_errno(errno, "setsid() failed: %m");
3331 
3332         if (arg_private_network)
3333                 (void) loopback_setup();
3334 
3335         if (arg_expose_ports) {
3336                 r = expose_port_send_rtnl(rtnl_socket);
3337                 if (r < 0)
3338                         return r;
3339                 rtnl_socket = safe_close(rtnl_socket);
3340         }
3341 
3342         if (arg_console_mode != CONSOLE_PIPE) {
3343                 _cleanup_close_ int master = -1;
3344                 _cleanup_free_ char *console = NULL;
3345 
3346                 /* Allocate a pty and make it available as /dev/console. */
3347                 master = openpt_allocate(O_RDWR|O_NONBLOCK, &console);
3348                 if (master < 0)
3349                         return log_error_errno(master, "Failed to allocate a pty: %m");
3350 
3351                 r = setup_dev_console(console);
3352                 if (r < 0)
3353                         return log_error_errno(r, "Failed to set up /dev/console: %m");
3354 
3355                 r = send_one_fd(master_pty_socket, master, 0);
3356                 if (r < 0)
3357                         return log_error_errno(r, "Failed to send master fd: %m");
3358                 master_pty_socket = safe_close(master_pty_socket);
3359 
3360                 r = setup_stdio_as_dev_console();
3361                 if (r < 0)
3362                         return r;
3363         }
3364 
3365         r = patch_sysctl();
3366         if (r < 0)
3367                 return r;
3368 
3369         if (arg_oom_score_adjust_set) {
3370                 r = set_oom_score_adjust(arg_oom_score_adjust);
3371                 if (r < 0)
3372                         return log_error_errno(r, "Failed to adjust OOM score: %m");
3373         }
3374 
3375         if (arg_cpu_set.set)
3376                 if (sched_setaffinity(0, arg_cpu_set.allocated, arg_cpu_set.set) < 0)
3377                         return log_error_errno(errno, "Failed to set CPU affinity: %m");
3378 
3379         (void) setup_hostname();
3380 
3381         if (arg_personality != PERSONALITY_INVALID) {
3382                 r = safe_personality(arg_personality);
3383                 if (r < 0)
3384                         return log_error_errno(r, "personality() failed: %m");
3385         } else if (secondary) {
3386                 r = safe_personality(PER_LINUX32);
3387                 if (r < 0)
3388                         return log_error_errno(r, "personality() failed: %m");
3389         }
3390 
3391         r = setrlimit_closest_all((const struct rlimit *const*) arg_rlimit, &which_failed);
3392         if (r < 0)
3393                 return log_error_errno(r, "Failed to apply resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3394 
3395 #if HAVE_SECCOMP
3396         if (arg_seccomp) {
3397 
3398                 if (is_seccomp_available()) {
3399 
3400                         r = seccomp_load(arg_seccomp);
3401                         if (ERRNO_IS_SECCOMP_FATAL(r))
3402                                 return log_error_errno(r, "Failed to install seccomp filter: %m");
3403                         if (r < 0)
3404                                 log_debug_errno(r, "Failed to install seccomp filter: %m");
3405                 }
3406         } else
3407 #endif
3408         {
3409                 r = setup_seccomp(arg_caps_retain, arg_syscall_allow_list, arg_syscall_deny_list);
3410                 if (r < 0)
3411                         return r;
3412         }
3413 
3414         if (arg_suppress_sync) {
3415 #if HAVE_SECCOMP
3416                 r = seccomp_suppress_sync();
3417                 if (r < 0)
3418                         log_debug_errno(r, "Failed to install sync() suppression seccomp filter, ignoring: %m");
3419 #else
3420                 log_debug("systemd is built without SECCOMP support. Ignoring --suppress-sync= command line option and SuppressSync= setting.");
3421 #endif
3422         }
3423 
3424 #if HAVE_SELINUX
3425         if (arg_selinux_context)
3426                 if (setexeccon(arg_selinux_context) < 0)
3427                         return log_error_errno(errno, "setexeccon(\"%s\") failed: %m", arg_selinux_context);
3428 #endif
3429 
3430         /* Make sure we keep the caps across the uid/gid dropping, so that we can retain some selected caps
3431          * if we need to later on. */
3432         if (prctl(PR_SET_KEEPCAPS, 1) < 0)
3433                 return log_error_errno(errno, "Failed to set PR_SET_KEEPCAPS: %m");
3434 
3435         if (uid_is_valid(arg_uid) || gid_is_valid(arg_gid))
3436                 r = change_uid_gid_raw(arg_uid, arg_gid, arg_supplementary_gids, arg_n_supplementary_gids, arg_console_mode != CONSOLE_PIPE);
3437         else
3438                 r = change_uid_gid(arg_user, arg_console_mode != CONSOLE_PIPE, &home);
3439         if (r < 0)
3440                 return r;
3441 
3442         r = drop_capabilities(getuid());
3443         if (r < 0)
3444                 return log_error_errno(r, "Dropping capabilities failed: %m");
3445 
3446         if (arg_no_new_privileges)
3447                 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0)
3448                         return log_error_errno(errno, "Failed to disable new privileges: %m");
3449 
3450         /* LXC sets container=lxc, so follow the scheme here */
3451         envp[n_env++] = strjoina("container=", arg_container_service_name);
3452 
3453         envp[n_env] = strv_find_prefix(environ, "TERM=");
3454         if (envp[n_env])
3455                 n_env++;
3456 
3457         if (home || !uid_is_valid(arg_uid) || arg_uid == 0)
3458                 if (asprintf(envp + n_env++, "HOME=%s", home ?: "/root") < 0)
3459                         return log_oom();
3460 
3461         if (arg_user || !uid_is_valid(arg_uid) || arg_uid == 0)
3462                 if (asprintf(envp + n_env++, "USER=%s", arg_user ?: "root") < 0 ||
3463                     asprintf(envp + n_env++, "LOGNAME=%s", arg_user ? arg_user : "root") < 0)
3464                         return log_oom();
3465 
3466         assert(!sd_id128_is_null(arg_uuid));
3467 
3468         if (asprintf(envp + n_env++, "container_uuid=%s", SD_ID128_TO_UUID_STRING(arg_uuid)) < 0)
3469                 return log_oom();
3470 
3471         if (fdset_size(fds) > 0) {
3472                 r = fdset_cloexec(fds, false);
3473                 if (r < 0)
3474                         return log_error_errno(r, "Failed to unset O_CLOEXEC for file descriptors.");
3475 
3476                 if ((asprintf(envp + n_env++, "LISTEN_FDS=%u", fdset_size(fds)) < 0) ||
3477                     (asprintf(envp + n_env++, "LISTEN_PID=1") < 0))
3478                         return log_oom();
3479         }
3480         if (asprintf(envp + n_env++, "NOTIFY_SOCKET=%s", NSPAWN_NOTIFY_SOCKET_PATH) < 0)
3481                 return log_oom();
3482 
3483         if (arg_n_credentials > 0) {
3484                 envp[n_env] = strdup("CREDENTIALS_DIRECTORY=/run/host/credentials");
3485                 if (!envp[n_env])
3486                         return log_oom();
3487                 n_env++;
3488         }
3489 
3490         if (arg_start_mode != START_BOOT) {
3491                 /* If we're running a command in the container, let's default to the C.UTF-8 locale as it's
3492                  * part of glibc these days and was backported to most distros a long time before it got
3493                  * added to upstream glibc. */
3494                 envp[n_env] = strdup("LANG=C.UTF-8");
3495                 if (!envp[n_env])
3496                         return log_oom();
3497                 n_env++;
3498         }
3499 
3500         env_use = strv_env_merge(envp, os_release_pairs, arg_setenv);
3501         if (!env_use)
3502                 return log_oom();
3503 
3504         /* Let the parent know that we are ready and
3505          * wait until the parent is ready with the
3506          * setup, too... */
3507         if (!barrier_place_and_sync(barrier)) /* #5 */
3508                 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Parent died too early");
3509 
3510         if (arg_chdir)
3511                 if (chdir(arg_chdir) < 0)
3512                         return log_error_errno(errno, "Failed to change to specified working directory %s: %m", arg_chdir);
3513 
3514         if (arg_start_mode == START_PID2) {
3515                 r = stub_pid1(arg_uuid);
3516                 if (r < 0)
3517                         return r;
3518         }
3519 
3520         if (arg_console_mode != CONSOLE_PIPE) {
3521                 /* So far our pty wasn't controlled by any process. Finally, it's time to change that, if we
3522                  * are configured for that. Acquire it as controlling tty. */
3523                 if (ioctl(STDIN_FILENO, TIOCSCTTY) < 0)
3524                         return log_error_errno(errno, "Failed to acquire controlling TTY: %m");
3525         }
3526 
3527         log_debug("Inner child completed, invoking payload.");
3528 
3529         /* Now, explicitly close the log, so that we then can close all remaining fds. Closing the log explicitly first
3530          * has the benefit that the logging subsystem knows about it, and is thus ready to be reopened should we need
3531          * it again. Note that the other fds closed here are at least the locking and barrier fds. */
3532         log_close();
3533         log_set_open_when_needed(true);
3534 
3535         (void) fdset_close_others(fds);
3536 
3537         if (arg_start_mode == START_BOOT) {
3538                 char **a;
3539                 size_t m;
3540 
3541                 /* Automatically search for the init system */
3542 
3543                 m = strv_length(arg_parameters);
3544                 a = newa(char*, m + 2);
3545                 memcpy_safe(a + 1, arg_parameters, m * sizeof(char*));
3546                 a[1 + m] = NULL;
3547 
3548                 FOREACH_STRING(init,
3549                                "/usr/lib/systemd/systemd",
3550                                "/lib/systemd/systemd",
3551                                "/sbin/init") {
3552                         a[0] = (char*) init;
3553                         execve(a[0], a, env_use);
3554                 }
3555 
3556                 exec_target = "/usr/lib/systemd/systemd, /lib/systemd/systemd, /sbin/init";
3557         } else if (!strv_isempty(arg_parameters)) {
3558                 const char *dollar_path;
3559 
3560                 exec_target = arg_parameters[0];
3561 
3562                 /* Use the user supplied search $PATH if there is one, or DEFAULT_PATH_COMPAT if not to search the
3563                  * binary. */
3564                 dollar_path = strv_env_get(env_use, "PATH");
3565                 if (dollar_path) {
3566                         if (setenv("PATH", dollar_path, 1) < 0)
3567                                 return log_error_errno(errno, "Failed to update $PATH: %m");
3568                 }
3569 
3570                 execvpe(arg_parameters[0], arg_parameters, env_use);
3571         } else {
3572                 if (!arg_chdir)
3573                         /* If we cannot change the directory, we'll end up in /, that is expected. */
3574                         (void) chdir(home ?: "/root");
3575 
3576                 execle(DEFAULT_USER_SHELL, "-" DEFAULT_USER_SHELL_NAME, NULL, env_use);
3577                 if (!streq(DEFAULT_USER_SHELL, "/bin/bash"))
3578                         execle("/bin/bash", "-bash", NULL, env_use);
3579                 if (!streq(DEFAULT_USER_SHELL, "/bin/sh"))
3580                         execle("/bin/sh", "-sh", NULL, env_use);
3581 
3582                 exec_target = DEFAULT_USER_SHELL ", /bin/bash, /bin/sh";
3583         }
3584 
3585         return log_error_errno(errno, "execv(%s) failed: %m", exec_target);
3586 }
3587 
setup_notify_child(void)3588 static int setup_notify_child(void) {
3589         _cleanup_close_ int fd = -1;
3590         static const union sockaddr_union sa = {
3591                 .un.sun_family = AF_UNIX,
3592                 .un.sun_path = NSPAWN_NOTIFY_SOCKET_PATH,
3593         };
3594         int r;
3595 
3596         fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
3597         if (fd < 0)
3598                 return log_error_errno(errno, "Failed to allocate notification socket: %m");
3599 
3600         (void) mkdir_parents(NSPAWN_NOTIFY_SOCKET_PATH, 0755);
3601         (void) sockaddr_un_unlink(&sa.un);
3602 
3603         r = bind(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un));
3604         if (r < 0)
3605                 return log_error_errno(errno, "bind(" NSPAWN_NOTIFY_SOCKET_PATH ") failed: %m");
3606 
3607         r = userns_lchown(NSPAWN_NOTIFY_SOCKET_PATH, 0, 0);
3608         if (r < 0)
3609                 return log_error_errno(r, "Failed to chown " NSPAWN_NOTIFY_SOCKET_PATH ": %m");
3610 
3611         r = setsockopt_int(fd, SOL_SOCKET, SO_PASSCRED, true);
3612         if (r < 0)
3613                 return log_error_errno(r, "SO_PASSCRED failed: %m");
3614 
3615         return TAKE_FD(fd);
3616 }
3617 
outer_child(Barrier * barrier,const char * directory,DissectedImage * dissected_image,bool secondary,int pid_socket,int uuid_socket,int notify_socket,int kmsg_socket,int rtnl_socket,int uid_shift_socket,int master_pty_socket,int unified_cgroup_hierarchy_socket,FDSet * fds,int netns_fd)3618 static int outer_child(
3619                 Barrier *barrier,
3620                 const char *directory,
3621                 DissectedImage *dissected_image,
3622                 bool secondary,
3623                 int pid_socket,
3624                 int uuid_socket,
3625                 int notify_socket,
3626                 int kmsg_socket,
3627                 int rtnl_socket,
3628                 int uid_shift_socket,
3629                 int master_pty_socket,
3630                 int unified_cgroup_hierarchy_socket,
3631                 FDSet *fds,
3632                 int netns_fd) {
3633 
3634         _cleanup_(bind_user_context_freep) BindUserContext *bind_user_context = NULL;
3635         _cleanup_strv_free_ char **os_release_pairs = NULL;
3636         _cleanup_close_ int fd = -1;
3637         bool idmap = false;
3638         const char *p;
3639         pid_t pid;
3640         ssize_t l;
3641         int r;
3642 
3643         /* This is the "outer" child process, i.e the one forked off by the container manager itself. It
3644          * already has its own CLONE_NEWNS namespace (which was created by the clone()). It still lives in
3645          * the host's CLONE_NEWPID, CLONE_NEWUTS, CLONE_NEWIPC, CLONE_NEWUSER and CLONE_NEWNET
3646          * namespaces. After it completed a number of initializations a second child (the "inner" one) is
3647          * forked off it, and it exits. */
3648 
3649         assert(barrier);
3650         assert(directory);
3651         assert(pid_socket >= 0);
3652         assert(uuid_socket >= 0);
3653         assert(notify_socket >= 0);
3654         assert(master_pty_socket >= 0);
3655         assert(kmsg_socket >= 0);
3656 
3657         log_debug("Outer child is initializing.");
3658 
3659         r = load_os_release_pairs_with_prefix("/", "container_host_", &os_release_pairs);
3660         if (r < 0)
3661                 log_debug_errno(r, "Failed to read os-release from host for container, ignoring: %m");
3662 
3663         if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
3664                 return log_error_errno(errno, "PR_SET_PDEATHSIG failed: %m");
3665 
3666         r = reset_audit_loginuid();
3667         if (r < 0)
3668                 return r;
3669 
3670         /* Mark everything as slave, so that we still receive mounts from the real root, but don't propagate
3671          * mounts to the real root. */
3672         r = mount_follow_verbose(LOG_ERR, NULL, "/", NULL, MS_SLAVE|MS_REC, NULL);
3673         if (r < 0)
3674                 return r;
3675 
3676         if (dissected_image) {
3677                 /* If we are operating on a disk image, then mount its root directory now, but leave out the
3678                  * rest. We can read the UID shift from it if we need to. Further down we'll mount the rest,
3679                  * but then with the uid shift known. That way we can mount VFAT file systems shifted to the
3680                  * right place right away. This makes sure ESP partitions and userns are compatible. */
3681 
3682                 r = dissected_image_mount_and_warn(
3683                                 dissected_image,
3684                                 directory,
3685                                 arg_uid_shift,
3686                                 arg_uid_range,
3687                                 DISSECT_IMAGE_MOUNT_ROOT_ONLY|
3688                                 DISSECT_IMAGE_DISCARD_ON_LOOP|
3689                                 DISSECT_IMAGE_USR_NO_ROOT|
3690                                 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3691                                 (arg_start_mode == START_BOOT ? DISSECT_IMAGE_VALIDATE_OS : 0));
3692                 if (r < 0)
3693                         return r;
3694         }
3695 
3696         r = determine_uid_shift(directory);
3697         if (r < 0)
3698                 return r;
3699 
3700         if (arg_userns_mode != USER_NAMESPACE_NO) {
3701                 /* Let the parent know which UID shift we read from the image */
3702                 l = send(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), MSG_NOSIGNAL);
3703                 if (l < 0)
3704                         return log_error_errno(errno, "Failed to send UID shift: %m");
3705                 if (l != sizeof(arg_uid_shift))
3706                         return log_error_errno(SYNTHETIC_ERRNO(EIO),
3707                                                "Short write while sending UID shift.");
3708 
3709                 if (arg_userns_mode == USER_NAMESPACE_PICK) {
3710                         /* When we are supposed to pick the UID shift, the parent will check now whether the
3711                          * UID shift we just read from the image is available. If yes, it will send the UID
3712                          * shift back to us, if not it will pick a different one, and send it back to us. */
3713 
3714                         l = recv(uid_shift_socket, &arg_uid_shift, sizeof(arg_uid_shift), 0);
3715                         if (l < 0)
3716                                 return log_error_errno(errno, "Failed to recv UID shift: %m");
3717                         if (l != sizeof(arg_uid_shift))
3718                                 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3719                                                        "Short read while receiving UID shift.");
3720                 }
3721 
3722                 log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
3723                          "Selected user namespace base " UID_FMT " and range " UID_FMT ".", arg_uid_shift, arg_uid_range);
3724         }
3725 
3726         if (path_equal(directory, "/")) {
3727                 /* If the directory we shall boot is the host, let's operate on a bind mount at a different
3728                  * place, so that we can make changes to its mount structure (for example, to implement
3729                  * --volatile=) without this interfering with our ability to access files such as
3730                  * /etc/localtime to copy into the container. Note that we use a fixed place for this
                 * (instead of a temporary directory, since we are living in our own mount namespace here
3732                  * already, and thus don't need to be afraid of colliding with anyone else's mounts). */
3733                 (void) mkdir_p("/run/systemd/nspawn-root", 0755);
3734 
3735                 r = mount_nofollow_verbose(LOG_ERR, "/", "/run/systemd/nspawn-root", NULL, MS_BIND|MS_REC, NULL);
3736                 if (r < 0)
3737                         return r;
3738 
3739                 directory = "/run/systemd/nspawn-root";
3740         }
3741 
3742         r = setup_pivot_root(
3743                         directory,
3744                         arg_pivot_root_new,
3745                         arg_pivot_root_old);
3746         if (r < 0)
3747                 return r;
3748 
3749         r = setup_volatile_mode(
3750                         directory,
3751                         arg_volatile_mode,
3752                         arg_uid_shift,
3753                         arg_selinux_apifs_context);
3754         if (r < 0)
3755                 return r;
3756 
3757         r = bind_user_prepare(
3758                         directory,
3759                         arg_bind_user,
3760                         arg_uid_shift,
3761                         arg_uid_range,
3762                         &arg_custom_mounts, &arg_n_custom_mounts,
3763                         &bind_user_context);
3764         if (r < 0)
3765                 return r;
3766 
3767         if (arg_userns_mode != USER_NAMESPACE_NO && bind_user_context) {
3768                 /* Send the user maps we determined to the parent, so that it installs it in our user
3769                  * namespace UID map table */
3770 
3771                 for (size_t i = 0; i < bind_user_context->n_data; i++)  {
3772                         uid_t map[] = {
3773                                 bind_user_context->data[i].payload_user->uid,
3774                                 bind_user_context->data[i].host_user->uid,
3775                                 (uid_t) bind_user_context->data[i].payload_group->gid,
3776                                 (uid_t) bind_user_context->data[i].host_group->gid,
3777                         };
3778 
3779                         l = send(uid_shift_socket, map, sizeof(map), MSG_NOSIGNAL);
3780                         if (l < 0)
3781                                 return log_error_errno(errno, "Failed to send user UID map: %m");
3782                         if (l != sizeof(map))
3783                                 return log_error_errno(SYNTHETIC_ERRNO(EIO),
3784                                                        "Short write while sending user UID map.");
3785                 }
3786         }
3787 
3788         r = mount_custom(
3789                         directory,
3790                         arg_custom_mounts,
3791                         arg_n_custom_mounts,
3792                         arg_uid_shift,
3793                         arg_uid_range,
3794                         arg_selinux_apifs_context,
3795                         MOUNT_ROOT_ONLY);
3796         if (r < 0)
3797                 return r;
3798 
3799         /* Make sure we always have a mount that we can move to root later on. */
3800         r = make_mount_point(directory);
3801         if (r < 0)
3802                 return r;
3803 
3804         if (arg_userns_mode != USER_NAMESPACE_NO &&
3805             IN_SET(arg_userns_ownership, USER_NAMESPACE_OWNERSHIP_MAP, USER_NAMESPACE_OWNERSHIP_AUTO) &&
3806             arg_uid_shift != 0) {
3807 
3808                 r = remount_idmap(directory, arg_uid_shift, arg_uid_range, REMOUNT_IDMAP_HOST_ROOT);
3809                 if (r == -EINVAL || ERRNO_IS_NOT_SUPPORTED(r)) {
3810                         /* This might fail because the kernel or file system doesn't support idmapping. We
3811                          * can't really distinguish this nicely, nor do we have any guarantees about the
3812                          * error codes we see, could be EOPNOTSUPP or EINVAL. */
3813                         if (arg_userns_ownership != USER_NAMESPACE_OWNERSHIP_AUTO)
3814                                 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3815                                                        "ID mapped mounts are apparently not available, sorry.");
3816 
3817                         log_debug("ID mapped mounts are apparently not available on this kernel or for the selected file system, reverting to recursive chown()ing.");
3818                         arg_userns_ownership = USER_NAMESPACE_OWNERSHIP_CHOWN;
3819                 } else if (r < 0)
3820                         return log_error_errno(r, "Failed to set up ID mapped mounts: %m");
3821                 else {
3822                         log_debug("ID mapped mounts available, making use of them.");
3823                         idmap = true;
3824                 }
3825         }
3826 
3827         if (dissected_image) {
3828                 /* Now we know the uid shift, let's now mount everything else that might be in the image. */
3829                 r = dissected_image_mount(
3830                                 dissected_image,
3831                                 directory,
3832                                 arg_uid_shift,
3833                                 arg_uid_range,
3834                                 DISSECT_IMAGE_MOUNT_NON_ROOT_ONLY|
3835                                 DISSECT_IMAGE_DISCARD_ON_LOOP|
3836                                 DISSECT_IMAGE_USR_NO_ROOT|
3837                                 (arg_read_only ? DISSECT_IMAGE_READ_ONLY : DISSECT_IMAGE_FSCK|DISSECT_IMAGE_GROWFS)|
3838                                 (idmap ? DISSECT_IMAGE_MOUNT_IDMAPPED : 0));
3839                 if (r == -EUCLEAN)
3840                         return log_error_errno(r, "File system check for image failed: %m");
3841                 if (r < 0)
3842                         return log_error_errno(r, "Failed to mount image file system: %m");
3843         }
3844 
3845         if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
3846                 /* OK, we don't know yet which cgroup mode to use yet. Let's figure it out, and tell the parent. */
3847 
3848                 r = detect_unified_cgroup_hierarchy_from_image(directory);
3849                 if (r < 0)
3850                         return r;
3851 
3852                 l = send(unified_cgroup_hierarchy_socket, &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), MSG_NOSIGNAL);
3853                 if (l < 0)
3854                         return log_error_errno(errno, "Failed to send cgroup mode: %m");
3855                 if (l != sizeof(arg_unified_cgroup_hierarchy))
3856                         return log_error_errno(SYNTHETIC_ERRNO(EIO),
3857                                                "Short write while sending cgroup mode.");
3858 
3859                 unified_cgroup_hierarchy_socket = safe_close(unified_cgroup_hierarchy_socket);
3860         }
3861 
3862         /* Mark everything as shared so our mounts get propagated down. This is required to make new bind
3863          * mounts available in systemd services inside the container that create a new mount namespace.  See
3864          * https://github.com/systemd/systemd/issues/3860 Further submounts (such as /dev) done after this
3865          * will inherit the shared propagation mode.
3866          *
3867          * IMPORTANT: Do not overmount the root directory anymore from now on to enable moving the root
3868          * directory mount to root later on.
3869          * https://github.com/systemd/systemd/issues/3847#issuecomment-562735251
3870          */
3871         r = mount_nofollow_verbose(LOG_ERR, NULL, directory, NULL, MS_SHARED|MS_REC, NULL);
3872         if (r < 0)
3873                 return r;
3874 
3875         r = recursive_chown(directory, arg_uid_shift, arg_uid_range);
3876         if (r < 0)
3877                 return r;
3878 
3879         r = base_filesystem_create(directory, arg_uid_shift, (gid_t) arg_uid_shift);
3880         if (r < 0)
3881                 return r;
3882 
3883         if (arg_read_only && arg_volatile_mode == VOLATILE_NO &&
3884                 !has_custom_root_mount(arg_custom_mounts, arg_n_custom_mounts)) {
3885                 r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
3886                 if (r < 0)
3887                         return log_error_errno(r, "Failed to make tree read-only: %m");
3888         }
3889 
3890         r = mount_all(directory,
3891                       arg_mount_settings,
3892                       arg_uid_shift,
3893                       arg_selinux_apifs_context);
3894         if (r < 0)
3895                 return r;
3896 
3897         r = copy_devnodes(directory);
3898         if (r < 0)
3899                 return r;
3900 
3901         r = make_extra_nodes(directory);
3902         if (r < 0)
3903                 return r;
3904 
3905         (void) dev_setup(directory, arg_uid_shift, arg_uid_shift);
3906 
3907         p = prefix_roota(directory, "/run/host");
3908         (void) make_inaccessible_nodes(p, arg_uid_shift, arg_uid_shift);
3909 
3910         r = setup_pts(directory);
3911         if (r < 0)
3912                 return r;
3913 
3914         r = setup_propagate(directory);
3915         if (r < 0)
3916                 return r;
3917 
3918         r = setup_keyring();
3919         if (r < 0)
3920                 return r;
3921 
3922         r = setup_credentials(directory);
3923         if (r < 0)
3924                 return r;
3925 
3926         r = bind_user_setup(bind_user_context, directory);
3927         if (r < 0)
3928                 return r;
3929 
3930         r = mount_custom(
3931                         directory,
3932                         arg_custom_mounts,
3933                         arg_n_custom_mounts,
3934                         arg_uid_shift,
3935                         arg_uid_range,
3936                         arg_selinux_apifs_context,
3937                         MOUNT_NON_ROOT_ONLY);
3938         if (r < 0)
3939                 return r;
3940 
3941         r = setup_timezone(directory);
3942         if (r < 0)
3943                 return r;
3944 
3945         r = setup_resolv_conf(directory);
3946         if (r < 0)
3947                 return r;
3948 
3949         r = setup_machine_id(directory);
3950         if (r < 0)
3951                 return r;
3952 
3953         r = setup_journal(directory);
3954         if (r < 0)
3955                 return r;
3956 
3957         /* The same stuff as the $container env var, but nicely readable for the entire payload */
3958         p = prefix_roota(directory, "/run/host/container-manager");
3959         (void) write_string_file(p, arg_container_service_name, WRITE_STRING_FILE_CREATE);
3960 
3961         /* The same stuff as the $container_uuid env var */
3962         p = prefix_roota(directory, "/run/host/container-uuid");
3963         (void) write_string_filef(p, WRITE_STRING_FILE_CREATE, SD_ID128_UUID_FORMAT_STR, SD_ID128_FORMAT_VAL(arg_uuid));
3964 
3965         if (!arg_use_cgns) {
3966                 r = mount_cgroups(
3967                                 directory,
3968                                 arg_unified_cgroup_hierarchy,
3969                                 arg_userns_mode != USER_NAMESPACE_NO,
3970                                 arg_uid_shift,
3971                                 arg_uid_range,
3972                                 arg_selinux_apifs_context,
3973                                 false);
3974                 if (r < 0)
3975                         return r;
3976         }
3977 
3978         r = mount_move_root(directory);
3979         if (r < 0)
3980                 return log_error_errno(r, "Failed to move root directory: %m");
3981 
3982         fd = setup_notify_child();
3983         if (fd < 0)
3984                 return fd;
3985 
3986         pid = raw_clone(SIGCHLD|CLONE_NEWNS|
3987                         arg_clone_ns_flags |
3988                         (arg_userns_mode != USER_NAMESPACE_NO ? CLONE_NEWUSER : 0));
3989         if (pid < 0)
3990                 return log_error_errno(errno, "Failed to fork inner child: %m");
3991         if (pid == 0) {
3992                 pid_socket = safe_close(pid_socket);
3993                 uuid_socket = safe_close(uuid_socket);
3994                 notify_socket = safe_close(notify_socket);
3995                 uid_shift_socket = safe_close(uid_shift_socket);
3996 
3997                 /* The inner child has all namespaces that are requested, so that we all are owned by the
3998                  * user if user namespaces are turned on. */
3999 
4000                 if (arg_network_namespace_path) {
4001                         r = namespace_enter(-1, -1, netns_fd, -1, -1);
4002                         if (r < 0)
4003                                 return log_error_errno(r, "Failed to join network namespace: %m");
4004                 }
4005 
4006                 r = inner_child(barrier, directory, secondary, kmsg_socket, rtnl_socket, master_pty_socket, fds, os_release_pairs);
4007                 if (r < 0)
4008                         _exit(EXIT_FAILURE);
4009 
4010                 _exit(EXIT_SUCCESS);
4011         }
4012 
4013         l = send(pid_socket, &pid, sizeof(pid), MSG_NOSIGNAL);
4014         if (l < 0)
4015                 return log_error_errno(errno, "Failed to send PID: %m");
4016         if (l != sizeof(pid))
4017                 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4018                                        "Short write while sending PID.");
4019 
4020         l = send(uuid_socket, &arg_uuid, sizeof(arg_uuid), MSG_NOSIGNAL);
4021         if (l < 0)
4022                 return log_error_errno(errno, "Failed to send machine ID: %m");
4023         if (l != sizeof(arg_uuid))
4024                 return log_error_errno(SYNTHETIC_ERRNO(EIO),
4025                                        "Short write while sending machine ID.");
4026 
4027         l = send_one_fd(notify_socket, fd, 0);
4028         if (l < 0)
4029                 return log_error_errno(l, "Failed to send notify fd: %m");
4030 
4031         pid_socket = safe_close(pid_socket);
4032         uuid_socket = safe_close(uuid_socket);
4033         notify_socket = safe_close(notify_socket);
4034         master_pty_socket = safe_close(master_pty_socket);
4035         kmsg_socket = safe_close(kmsg_socket);
4036         rtnl_socket = safe_close(rtnl_socket);
4037         netns_fd = safe_close(netns_fd);
4038 
4039         return 0;
4040 }
4041 
/* Picks a UID base for the container and takes a lock file on it.
 *
 * *shift supplies the first candidate. A candidate is accepted when it lies within
 * CONTAINER_UID_BASE_MIN..CONTAINER_UID_BASE_MAX, has its lower 16 bits clear, its lock file below
 * /run/systemd/nspawn-uid/ can be taken, and neither the first nor the 0xFFFE-th ID above it resolves
 * in the user or group database. Rejected candidates are replaced first by a hash of the machine name
 * (once, if a machine name is set) and then by random values.
 *
 * Returns 0 on success, with the chosen base in *shift and the held lock moved into *ret_lock_file.
 * Returns -EBUSY once the attempt budget is used up, or the error of make_lock_file() otherwise. */
static int uid_shift_pick(uid_t *shift, LockFile *ret_lock_file) {
        /* Fixed key for deriving a candidate from the machine name */
        static const uint8_t hash_key[] = {
                0xe1, 0x56, 0xe0, 0xf0, 0x4a, 0xf0, 0x41, 0xaf,
                0x96, 0x41, 0xcf, 0x41, 0x33, 0x94, 0xff, 0x72
        };
        unsigned attempts_left = 100;
        bool name_hash_used = false;
        uid_t base;
        int r;

        assert(shift);
        assert(ret_lock_file);
        assert(arg_userns_mode == USER_NAMESPACE_PICK);
        assert(arg_uid_range == 0x10000U);

        base = *shift;

        (void) mkdir("/run/systemd/nspawn-uid", 0755);

        for (;;) {
                char path[STRLEN("/run/systemd/nspawn-uid/") + DECIMAL_STR_MAX(uid_t) + 1];
                _cleanup_(release_lock_file) LockFile lock = LOCK_FILE_INIT;

                attempts_left--;
                if (attempts_left == 0)
                        return -EBUSY;

                /* Only bases inside the container window with the lower 16 bits clear qualify */
                if (base < CONTAINER_UID_BASE_MIN ||
                    base > CONTAINER_UID_BASE_MAX ||
                    (base & UINT32_C(0xFFFF)) != 0)
                        goto next;

                xsprintf(path, "/run/systemd/nspawn-uid/" UID_FMT, base);
                r = make_lock_file(path, LOCK_EX|LOCK_NB, &lock);
                if (r == -EBUSY) /* Somebody else (another nspawn instance) holds this range already */
                        goto next;
                if (r < 0)
                        return r;

                /* Superficial check: skip the range if either end of it is known to the user or group
                 * database. The lookups run in this order and stop at the first hit. */
                if (getpwuid(base) ||
                    getpwuid(base + UINT32_C(0xFFFE)) ||
                    getgrgid(base) ||
                    getgrgid(base + UINT32_C(0xFFFE)))
                        goto next;

                /* Hand the lock to the caller and disarm the local cleanup handler */
                *ret_lock_file = lock;
                lock = (struct LockFile) LOCK_FILE_INIT;
                *shift = base;
                return 0;

        next:
                if (!arg_machine || name_hash_used)
                        random_bytes(&base, sizeof(base));
                else {
                        /* First fallback: derive the base from the container name */
                        base = (uid_t) siphash24(arg_machine, strlen(arg_machine), hash_key);
                        name_hash_used = true;
                }

                /* Fold into the permitted window, then clear the lower 16 bits */
                base = (base % (CONTAINER_UID_BASE_MAX - CONTAINER_UID_BASE_MIN)) + CONTAINER_UID_BASE_MIN;
                base &= (uid_t) UINT32_C(0xFFFF0000);
        }
}
4110 
add_one_uid_map(char ** p,uid_t container_uid,uid_t host_uid,uid_t range)4111 static int add_one_uid_map(
4112                 char **p,
4113                 uid_t container_uid,
4114                 uid_t host_uid,
4115                 uid_t range) {
4116 
4117         return strextendf(p,
4118                        UID_FMT " " UID_FMT " " UID_FMT "\n",
4119                        container_uid, host_uid, range);
4120 }
4121 
make_uid_map_string(const uid_t bind_user_uid[],size_t n_bind_user_uid,size_t offset,char ** ret)4122 static int make_uid_map_string(
4123                 const uid_t bind_user_uid[],
4124                 size_t n_bind_user_uid,
4125                 size_t offset,
4126                 char **ret) {
4127 
4128         _cleanup_free_ char *s = NULL;
4129         uid_t previous_uid = 0;
4130         int r;
4131 
4132         assert(n_bind_user_uid == 0 || bind_user_uid);
4133         assert(IN_SET(offset, 0, 2)); /* used to switch between UID and GID map */
4134         assert(ret);
4135 
4136         /* The bind_user_uid[] array is a series of 4 uid_t values, for each --bind-user= entry one
4137          * quadruplet, consisting of host and container UID + GID. */
4138 
4139         for (size_t i = 0; i < n_bind_user_uid; i++) {
4140                 uid_t payload_uid = bind_user_uid[i*2+offset],
4141                         host_uid = bind_user_uid[i*2+offset+1];
4142 
4143                 assert(previous_uid <= payload_uid);
4144                 assert(payload_uid < arg_uid_range);
4145 
4146                 /* Add a range to close the gap to previous entry */
4147                 if (payload_uid > previous_uid) {
4148                         r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, payload_uid - previous_uid);
4149                         if (r < 0)
4150                                 return r;
4151                 }
4152 
4153                 /* Map this specific user */
4154                 r = add_one_uid_map(&s, payload_uid, host_uid, 1);
4155                 if (r < 0)
4156                         return r;
4157 
4158                 previous_uid = payload_uid + 1;
4159         }
4160 
4161         /* And add a range to close the gap to finish the range */
4162         if (arg_uid_range > previous_uid) {
4163                 r = add_one_uid_map(&s, previous_uid, arg_uid_shift + previous_uid, arg_uid_range - previous_uid);
4164                 if (r < 0)
4165                         return r;
4166         }
4167 
4168         assert(s);
4169 
4170         *ret = TAKE_PTR(s);
4171         return 0;
4172 }
4173 
setup_uid_map(pid_t pid,const uid_t bind_user_uid[],size_t n_bind_user_uid)4174 static int setup_uid_map(
4175                 pid_t pid,
4176                 const uid_t bind_user_uid[],
4177                 size_t n_bind_user_uid) {
4178 
4179         char uid_map[STRLEN("/proc//uid_map") + DECIMAL_STR_MAX(uid_t) + 1];
4180         _cleanup_free_ char *s = NULL;
4181         int r;
4182 
4183         assert(pid > 1);
4184 
4185         /* Build the UID map string */
4186         if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 0, &s) < 0) /* offset=0 contains the UID pair */
4187                 return log_oom();
4188 
4189         xsprintf(uid_map, "/proc/" PID_FMT "/uid_map", pid);
4190         r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
4191         if (r < 0)
4192                 return log_error_errno(r, "Failed to write UID map: %m");
4193 
4194         /* And now build the GID map string */
4195         s = mfree(s);
4196         if (make_uid_map_string(bind_user_uid, n_bind_user_uid, 2, &s) < 0) /* offset=2 contains the GID pair */
4197                 return log_oom();
4198 
4199         xsprintf(uid_map, "/proc/" PID_FMT "/gid_map", pid);
4200         r = write_string_file(uid_map, s, WRITE_STRING_FILE_DISABLE_BUFFER);
4201         if (r < 0)
4202                 return log_error_errno(r, "Failed to write GID map: %m");
4203 
4204         return 0;
4205 }
4206 
/* I/O event callback for the notification socket.
 *
 * 'userdata' carries the PID of the inner child, encoded as a pointer and decoded with PTR_TO_PID().
 * One datagram is read per invocation. Any file descriptors attached to it are closed right away. The
 * message is ignored unless its SCM_CREDENTIALS data names the inner child as sender.
 *
 * A "READY=1" line is forwarded via sd_notify(). A "STATUS=" line is forwarded with a
 * "Container running: " prefix.
 *
 * Returns 0 in all handled cases, or a negative error if reading the socket fails for a reason other
 * than a transient error or truncated control data, or if splitting the message runs out of memory. */
static int nspawn_dispatch_notify_fd(sd_event_source *source, int fd, uint32_t revents, void *userdata) {
        char message[NOTIFY_BUFFER_MAX+1];
        struct iovec iov = {
                .iov_base = message,
                .iov_len = sizeof(message)-1, /* leave room for the terminating NUL added below */
        };
        CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred)) +
                         CMSG_SPACE(sizeof(int) * NOTIFY_FD_MAX)) control;
        struct msghdr mh = {
                .msg_iov = &iov,
                .msg_iovlen = 1,
                .msg_control = &control,
                .msg_controllen = sizeof(control),
        };
        _cleanup_strv_free_ char **lines = NULL;
        struct ucred *sender;
        pid_t inner_child_pid;
        char *status;
        ssize_t len;
        int r;

        assert(userdata);

        inner_child_pid = PTR_TO_PID(userdata);

        if (revents != EPOLLIN) {
                log_warning("Got unexpected poll event for notify fd.");
                return 0;
        }

        len = recvmsg_safe(fd, &mh, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
        if (len < 0) {
                if (ERRNO_IS_TRANSIENT(len))
                        return 0;

                if (len == -EXFULL) {
                        log_warning("Got message with truncated control data (too many fds sent?), ignoring.");
                        return 0;
                }

                return log_warning_errno(len, "Couldn't read notification socket: %m");
        }

        /* We have no use for passed file descriptors, close them right away */
        cmsg_close_all(&mh);

        /* Only accept messages whose credentials name the inner child */
        sender = CMSG_FIND_DATA(&mh, SOL_SOCKET, SCM_CREDENTIALS, struct ucred);
        if (!sender || sender->pid != inner_child_pid) {
                log_debug("Received notify message without valid credentials. Ignoring.");
                return 0;
        }

        if ((size_t) len >= sizeof(message)) {
                log_warning("Received notify message exceeded maximum size. Ignoring.");
                return 0;
        }

        message[len] = 0;

        lines = strv_split(message, "\n\r");
        if (!lines)
                return log_oom();

        if (strv_contains(lines, "READY=1")) {
                r = sd_notify(false, "READY=1\n");
                if (r < 0)
                        log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
        }

        status = strv_find_startswith(lines, "STATUS=");
        if (status)
                (void) sd_notifyf(false, "STATUS=Container running: %s", status);

        return 0;
}
4278 
/* Registers nspawn_dispatch_notify_fd() as EPOLLIN handler for 'fd' on 'event' and names the new event
 * source "nspawn-notify". The source is returned in *notify_event_source.
 *
 * 'inner_child_pid' is passed through unchanged as the handler's userdata. The handler decodes it with
 * PTR_TO_PID(), so despite the pid_t* type this presumably has to be a PID encoded as a pointer rather
 * than the address of a pid_t variable. TODO: confirm against the caller.
 *
 * Returns 0 on success, or the logged negative error of sd_event_add_io(). */
static int setup_notify_parent(sd_event *event, int fd, pid_t *inner_child_pid, sd_event_source **notify_event_source) {
        int r = sd_event_add_io(event, notify_event_source, fd, EPOLLIN, nspawn_dispatch_notify_fd, inner_child_pid);
        if (r < 0)
                return log_error_errno(r, "Failed to allocate notify event source: %m");

        /* Purely cosmetic, hence failure is ignored */
        (void) sd_event_source_set_description(*notify_event_source, "nspawn-notify");
        return 0;
}
4290 
/* Merges the parsed settings into the global arg_* configuration.
 *
 * A setting is applied only if the matching SETTING_* bit is not set in arg_settings_mask, i.e. it was
 * not fixed on the command line, and only if the settings object actually carries a value for it. A
 * number of settings are additionally applied only when arg_settings_trusted is set; otherwise a
 * warning naming 'path' is logged and the setting is skipped.
 *
 * Ownership of allocated fields is moved out of 'settings' (strings, string lists, arrays, the seccomp
 * context, the property message), so 'settings' is modified by this call. Wherever an array is taken
 * over, the element count left behind in 'settings' is reset too, so that the object stays consistent
 * for its later cleanup.
 *
 * 'path' is used for log messages only. Always returns 0. */
static int merge_settings(Settings *settings, const char *path) {
        assert(settings);
        assert(path);

        /* Copy over bits from the settings, unless they have been explicitly masked by command line switches. Note
         * that this steals the fields of the Settings* structure, and hence modifies it. */

        if ((arg_settings_mask & SETTING_START_MODE) == 0 &&
            settings->start_mode >= 0) {
                arg_start_mode = settings->start_mode;
                strv_free_and_replace(arg_parameters, settings->parameters);
        }

        if ((arg_settings_mask & SETTING_EPHEMERAL) == 0 &&
            settings->ephemeral >= 0)
                arg_ephemeral = settings->ephemeral;

        if ((arg_settings_mask & SETTING_DIRECTORY) == 0 &&
            settings->root) {

                if (!arg_settings_trusted)
                        log_warning("Ignoring root directory setting, file %s is not trusted.", path);
                else
                        free_and_replace(arg_directory, settings->root);
        }

        if ((arg_settings_mask & SETTING_PIVOT_ROOT) == 0 &&
            settings->pivot_root_new) {
                free_and_replace(arg_pivot_root_new, settings->pivot_root_new);
                free_and_replace(arg_pivot_root_old, settings->pivot_root_old);
        }

        if ((arg_settings_mask & SETTING_WORKING_DIRECTORY) == 0 &&
            settings->working_directory)
                free_and_replace(arg_chdir, settings->working_directory);

        if ((arg_settings_mask & SETTING_ENVIRONMENT) == 0 &&
            settings->environment)
                strv_free_and_replace(arg_setenv, settings->environment);

        if ((arg_settings_mask & SETTING_USER) == 0) {

                if (settings->user)
                        free_and_replace(arg_user, settings->user);

                if (uid_is_valid(settings->uid))
                        arg_uid = settings->uid;
                if (gid_is_valid(settings->gid))
                        arg_gid = settings->gid;
                if (settings->n_supplementary_gids > 0) {
                        free_and_replace(arg_supplementary_gids, settings->supplementary_gids);
                        arg_n_supplementary_gids = settings->n_supplementary_gids;
                }
        }

        if ((arg_settings_mask & SETTING_CAPABILITY) == 0) {
                uint64_t plus, minus;
                uint64_t network_minus = 0;
                uint64_t ambient;

                /* Note that we copy both the simple plus/minus caps here, and the full quintet from the
                 * Settings structure */

                plus = settings->capability;
                minus = settings->drop_capability;

                /* Network configuration implies adding or removing CAP_NET_ADMIN */
                if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
                    settings_network_configured(settings)) {
                        if (settings_private_network(settings))
                                plus |= UINT64_C(1) << CAP_NET_ADMIN;
                        else
                                network_minus |= UINT64_C(1) << CAP_NET_ADMIN;
                }

                /* Adding capabilities requires trust. Only warn if the file asked for them explicitly, not
                 * if they are merely implied by the network configuration. */
                if (!arg_settings_trusted && plus != 0) {
                        if (settings->capability != 0)
                                log_warning("Ignoring Capability= setting, file %s is not trusted.", path);
                } else {
                        arg_caps_retain &= ~network_minus;
                        arg_caps_retain |= plus;
                }

                /* Dropping capabilities is applied regardless of trust */
                arg_caps_retain &= ~minus;

                /* Copy the full capabilities over too */
                if (capability_quintet_is_set(&settings->full_capabilities)) {
                        if (!arg_settings_trusted)
                                log_warning("Ignoring capability settings, file %s is not trusted.", path);
                        else
                                arg_full_capabilities = settings->full_capabilities;
                }

                ambient = settings->ambient_capability;
                if (!arg_settings_trusted && ambient != 0)
                        log_warning("Ignoring AmbientCapability= setting, file %s is not trusted.", path);
                else
                        arg_caps_ambient |= ambient;
        }

        if ((arg_settings_mask & SETTING_KILL_SIGNAL) == 0 &&
            settings->kill_signal > 0)
                arg_kill_signal = settings->kill_signal;

        if ((arg_settings_mask & SETTING_PERSONALITY) == 0 &&
            settings->personality != PERSONALITY_INVALID)
                arg_personality = settings->personality;

        if ((arg_settings_mask & SETTING_MACHINE_ID) == 0 &&
            !sd_id128_is_null(settings->machine_id)) {

                if (!arg_settings_trusted)
                        log_warning("Ignoring MachineID= setting, file %s is not trusted.", path);
                else
                        arg_uuid = settings->machine_id;
        }

        if ((arg_settings_mask & SETTING_READ_ONLY) == 0 &&
            settings->read_only >= 0)
                arg_read_only = settings->read_only;

        if ((arg_settings_mask & SETTING_VOLATILE_MODE) == 0 &&
            settings->volatile_mode != _VOLATILE_MODE_INVALID)
                arg_volatile_mode = settings->volatile_mode;

        if ((arg_settings_mask & SETTING_CUSTOM_MOUNTS) == 0 &&
            settings->n_custom_mounts > 0) {

                if (!arg_settings_trusted)
                        log_warning("Ignoring TemporaryFileSystem=, Bind= and BindReadOnly= settings, file %s is not trusted.", path);
                else {
                        custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
                        arg_custom_mounts = TAKE_PTR(settings->custom_mounts);
                        arg_n_custom_mounts = settings->n_custom_mounts;
                        settings->n_custom_mounts = 0;
                }
        }

        if ((arg_settings_mask & SETTING_NETWORK) == 0 &&
            settings_network_configured(settings)) {

                if (!arg_settings_trusted)
                        log_warning("Ignoring network settings, file %s is not trusted.", path);
                else {
                        arg_network_veth = settings_network_veth(settings);
                        arg_private_network = settings_private_network(settings);

                        strv_free_and_replace(arg_network_interfaces, settings->network_interfaces);
                        strv_free_and_replace(arg_network_macvlan, settings->network_macvlan);
                        strv_free_and_replace(arg_network_ipvlan, settings->network_ipvlan);
                        strv_free_and_replace(arg_network_veth_extra, settings->network_veth_extra);

                        free_and_replace(arg_network_bridge, settings->network_bridge);
                        free_and_replace(arg_network_zone, settings->network_zone);

                        free_and_replace(arg_network_namespace_path, settings->network_namespace_path);
                }
        }

        if ((arg_settings_mask & SETTING_EXPOSE_PORTS) == 0 &&
            settings->expose_ports) {

                if (!arg_settings_trusted)
                        log_warning("Ignoring Port= setting, file %s is not trusted.", path);
                else {
                        expose_port_free_all(arg_expose_ports);
                        arg_expose_ports = TAKE_PTR(settings->expose_ports);
                }
        }

        if ((arg_settings_mask & SETTING_USERNS) == 0 &&
            settings->userns_mode != _USER_NAMESPACE_MODE_INVALID) {

                if (!arg_settings_trusted)
                        log_warning("Ignoring PrivateUsers= and PrivateUsersChown= settings, file %s is not trusted.", path);
                else {
                        arg_userns_mode = settings->userns_mode;
                        arg_uid_shift = settings->uid_shift;
                        arg_uid_range = settings->uid_range;
                        arg_userns_ownership = settings->userns_ownership;
                }
        }

        if ((arg_settings_mask & SETTING_BIND_USER) == 0 &&
            !strv_isempty(settings->bind_user))
                strv_free_and_replace(arg_bind_user, settings->bind_user);

        if ((arg_settings_mask & SETTING_NOTIFY_READY) == 0 &&
            settings->notify_ready >= 0)
                arg_notify_ready = settings->notify_ready;

        if ((arg_settings_mask & SETTING_SYSCALL_FILTER) == 0) {

                if (!strv_isempty(settings->syscall_allow_list) || !strv_isempty(settings->syscall_deny_list)) {
                        /* Only a non-empty allow list requires trust; a deny list alone is always accepted */
                        if (!arg_settings_trusted && !strv_isempty(settings->syscall_allow_list))
                                log_warning("Ignoring SystemCallFilter= settings, file %s is not trusted.", path);
                        else {
                                strv_free_and_replace(arg_syscall_allow_list, settings->syscall_allow_list);
                                strv_free_and_replace(arg_syscall_deny_list, settings->syscall_deny_list);
                        }
                }

#if HAVE_SECCOMP
                if (settings->seccomp) {
                        if (!arg_settings_trusted)
                                log_warning("Ignoring SECCOMP filter, file %s is not trusted.", path);
                        else {
                                seccomp_release(arg_seccomp);
                                arg_seccomp = TAKE_PTR(settings->seccomp);
                        }
                }
#endif
        }

        /* Each resource limit has its own mask bit, counted up from SETTING_RLIMIT_FIRST */
        for (int rl = 0; rl < _RLIMIT_MAX; rl++) {
                if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)))
                        continue;

                if (!settings->rlimit[rl])
                        continue;

                if (!arg_settings_trusted) {
                        log_warning("Ignoring Limit%s= setting, file '%s' is not trusted.", rlimit_to_string(rl), path);
                        continue;
                }

                free_and_replace(arg_rlimit[rl], settings->rlimit[rl]);
        }

        if ((arg_settings_mask & SETTING_HOSTNAME) == 0 &&
            settings->hostname)
                free_and_replace(arg_hostname, settings->hostname);

        if ((arg_settings_mask & SETTING_NO_NEW_PRIVILEGES) == 0 &&
            settings->no_new_privileges >= 0)
                arg_no_new_privileges = settings->no_new_privileges;

        if ((arg_settings_mask & SETTING_OOM_SCORE_ADJUST) == 0 &&
            settings->oom_score_adjust_set) {

                if (!arg_settings_trusted)
                        log_warning("Ignoring OOMScoreAdjust= setting, file '%s' is not trusted.", path);
                else {
                        arg_oom_score_adjust = settings->oom_score_adjust;
                        arg_oom_score_adjust_set = true;
                }
        }

        if ((arg_settings_mask & SETTING_CPU_AFFINITY) == 0 &&
            settings->cpu_set.set) {

                if (!arg_settings_trusted)
                        log_warning("Ignoring CPUAffinity= setting, file '%s' is not trusted.", path);
                else {
                        cpu_set_reset(&arg_cpu_set);
                        arg_cpu_set = settings->cpu_set;
                        settings->cpu_set = (CPUSet) {};
                }
        }

        if ((arg_settings_mask & SETTING_RESOLV_CONF) == 0 &&
            settings->resolv_conf != _RESOLV_CONF_MODE_INVALID)
                arg_resolv_conf = settings->resolv_conf;

        if ((arg_settings_mask & SETTING_LINK_JOURNAL) == 0 &&
            settings->link_journal != _LINK_JOURNAL_INVALID) {

                if (!arg_settings_trusted)
                        log_warning("Ignoring journal link setting, file '%s' is not trusted.", path);
                else {
                        arg_link_journal = settings->link_journal;
                        arg_link_journal_try = settings->link_journal_try;
                }
        }

        if ((arg_settings_mask & SETTING_TIMEZONE) == 0 &&
            settings->timezone != _TIMEZONE_MODE_INVALID)
                arg_timezone = settings->timezone;

        if ((arg_settings_mask & SETTING_SLICE) == 0 &&
            settings->slice) {

                if (!arg_settings_trusted)
                        log_warning("Ignoring slice setting, file '%s' is not trusted.", path);
                else
                        free_and_replace(arg_slice, settings->slice);
        }

        if ((arg_settings_mask & SETTING_USE_CGNS) == 0 &&
            settings->use_cgns >= 0) {

                if (!arg_settings_trusted)
                        log_warning("Ignoring cgroup namespace setting, file '%s' is not trusted.", path);
                else
                        arg_use_cgns = settings->use_cgns;
        }

        if ((arg_settings_mask & SETTING_CLONE_NS_FLAGS) == 0 &&
            settings->clone_ns_flags != ULONG_MAX) {

                if (!arg_settings_trusted)
                        log_warning("Ignoring namespace setting, file '%s' is not trusted.", path);
                else
                        arg_clone_ns_flags = settings->clone_ns_flags;
        }

        if ((arg_settings_mask & SETTING_CONSOLE_MODE) == 0 &&
            settings->console_mode >= 0) {

                if (!arg_settings_trusted)
                        log_warning("Ignoring console mode setting, file '%s' is not trusted.", path);
                else
                        arg_console_mode = settings->console_mode;
        }

        if ((arg_settings_mask & SETTING_SUPPRESS_SYNC) == 0 &&
            settings->suppress_sync >= 0)
                arg_suppress_sync = settings->suppress_sync;

        /* The following properties can only be set through the OCI settings logic, not from the command line, hence we
         * don't consult arg_settings_mask for them. */

        sd_bus_message_unref(arg_property_message);
        arg_property_message = TAKE_PTR(settings->properties);

        arg_console_width = settings->console_width;
        arg_console_height = settings->console_height;

        device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
        arg_extra_nodes = TAKE_PTR(settings->extra_nodes);
        arg_n_extra_nodes = settings->n_extra_nodes;
        /* The array is gone from 'settings' now, so its count must go too (same as for the custom
         * mounts above). Otherwise cleanup of 'settings' would see a NULL array with a non-zero
         * element count. */
        settings->n_extra_nodes = 0;

        return 0;
}
4626 
load_settings(void)4627 static int load_settings(void) {
4628         _cleanup_(settings_freep) Settings *settings = NULL;
4629         _cleanup_fclose_ FILE *f = NULL;
4630         _cleanup_free_ char *p = NULL;
4631         int r;
4632 
4633         if (arg_oci_bundle)
4634                 return 0;
4635 
4636         /* If all settings are masked, there's no point in looking for
4637          * the settings file */
4638         if (FLAGS_SET(arg_settings_mask, _SETTINGS_MASK_ALL))
4639                 return 0;
4640 
4641         /* We first look in the admin's directories in /etc and /run */
4642         FOREACH_STRING(i, "/etc/systemd/nspawn", "/run/systemd/nspawn") {
4643                 _cleanup_free_ char *j = NULL;
4644 
4645                 j = path_join(i, arg_settings_filename);
4646                 if (!j)
4647                         return log_oom();
4648 
4649                 f = fopen(j, "re");
4650                 if (f) {
4651                         p = TAKE_PTR(j);
4652 
4653                         /* By default, we trust configuration from /etc and /run */
4654                         if (arg_settings_trusted < 0)
4655                                 arg_settings_trusted = true;
4656 
4657                         break;
4658                 }
4659 
4660                 if (errno != ENOENT)
4661                         return log_error_errno(errno, "Failed to open %s: %m", j);
4662         }
4663 
4664         if (!f) {
4665                 /* After that, let's look for a file next to the
4666                  * actual image we shall boot. */
4667 
4668                 if (arg_image) {
4669                         p = file_in_same_dir(arg_image, arg_settings_filename);
4670                         if (!p)
4671                                 return log_oom();
4672                 } else if (arg_directory && !path_equal(arg_directory, "/")) {
4673                         p = file_in_same_dir(arg_directory, arg_settings_filename);
4674                         if (!p)
4675                                 return log_oom();
4676                 }
4677 
4678                 if (p) {
4679                         f = fopen(p, "re");
4680                         if (!f && errno != ENOENT)
4681                                 return log_error_errno(errno, "Failed to open %s: %m", p);
4682 
4683                         /* By default, we do not trust configuration from /var/lib/machines */
4684                         if (arg_settings_trusted < 0)
4685                                 arg_settings_trusted = false;
4686                 }
4687         }
4688 
4689         if (!f)
4690                 return 0;
4691 
4692         log_debug("Settings are trusted: %s", yes_no(arg_settings_trusted));
4693 
4694         r = settings_load(f, p, &settings);
4695         if (r < 0)
4696                 return r;
4697 
4698         return merge_settings(settings, p);
4699 }
4700 
load_oci_bundle(void)4701 static int load_oci_bundle(void) {
4702         _cleanup_(settings_freep) Settings *settings = NULL;
4703         int r;
4704 
4705         if (!arg_oci_bundle)
4706                 return 0;
4707 
4708         /* By default let's trust OCI bundles */
4709         if (arg_settings_trusted < 0)
4710                 arg_settings_trusted = true;
4711 
4712         r = oci_load(NULL, arg_oci_bundle, &settings);
4713         if (r < 0)
4714                 return r;
4715 
4716         return merge_settings(settings, arg_oci_bundle);
4717 }
4718 
run_container(DissectedImage * dissected_image,bool secondary,FDSet * fds,char veth_name[IFNAMSIZ],bool * veth_created,struct ExposeArgs * expose_args,int * master,pid_t * pid,int * ret)4719 static int run_container(
4720                DissectedImage *dissected_image,
4721                bool secondary,
4722                FDSet *fds,
4723                char veth_name[IFNAMSIZ], bool *veth_created,
4724                struct ExposeArgs *expose_args,
4725                int *master, pid_t *pid, int *ret) {
4726 
4727         static const struct sigaction sa = {
4728                 .sa_handler = nop_signal_handler,
4729                 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
4730         };
4731 
4732         _cleanup_(release_lock_file) LockFile uid_shift_lock = LOCK_FILE_INIT;
4733         _cleanup_close_ int etc_passwd_lock = -1;
4734         _cleanup_close_pair_ int
4735                 kmsg_socket_pair[2] = { -1, -1 },
4736                 rtnl_socket_pair[2] = { -1, -1 },
4737                 pid_socket_pair[2] = { -1, -1 },
4738                 uuid_socket_pair[2] = { -1, -1 },
4739                 notify_socket_pair[2] = { -1, -1 },
4740                 uid_shift_socket_pair[2] = { -1, -1 },
4741                 master_pty_socket_pair[2] = { -1, -1 },
4742                 unified_cgroup_hierarchy_socket_pair[2] = { -1, -1};
4743 
4744         _cleanup_close_ int notify_socket = -1;
4745         _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
4746         _cleanup_(sd_event_source_unrefp) sd_event_source *notify_event_source = NULL;
4747         _cleanup_(sd_event_unrefp) sd_event *event = NULL;
4748         _cleanup_(pty_forward_freep) PTYForward *forward = NULL;
4749         _cleanup_(sd_netlink_unrefp) sd_netlink *rtnl = NULL;
4750         _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
4751         _cleanup_free_ uid_t *bind_user_uid = NULL;
4752         size_t n_bind_user_uid = 0;
4753         ContainerStatus container_status = 0;
4754         int ifi = 0, r;
4755         ssize_t l;
4756         sigset_t mask_chld;
4757         _cleanup_close_ int child_netns_fd = -1;
4758 
4759         assert_se(sigemptyset(&mask_chld) == 0);
4760         assert_se(sigaddset(&mask_chld, SIGCHLD) == 0);
4761 
4762         if (arg_userns_mode == USER_NAMESPACE_PICK) {
4763                 /* When we shall pick the UID/GID range, let's first lock /etc/passwd, so that we can safely
4764                  * check with getpwuid() if the specific user already exists. Note that /etc might be
4765                  * read-only, in which case this will fail with EROFS. But that's really OK, as in that case we
4766                  * can be reasonably sure that no users are going to be added. Note that getpwuid() checks are
4767                  * really just an extra safety net. We kinda assume that the UID range we allocate from is
4768                  * really ours. */
4769 
4770                 etc_passwd_lock = take_etc_passwd_lock(NULL);
4771                 if (etc_passwd_lock < 0 && etc_passwd_lock != -EROFS)
4772                         return log_error_errno(etc_passwd_lock, "Failed to take /etc/passwd lock: %m");
4773         }
4774 
4775         r = barrier_create(&barrier);
4776         if (r < 0)
4777                 return log_error_errno(r, "Cannot initialize IPC barrier: %m");
4778 
4779         if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, kmsg_socket_pair) < 0)
4780                 return log_error_errno(errno, "Failed to create kmsg socket pair: %m");
4781 
4782         if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, rtnl_socket_pair) < 0)
4783                 return log_error_errno(errno, "Failed to create rtnl socket pair: %m");
4784 
4785         if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, pid_socket_pair) < 0)
4786                 return log_error_errno(errno, "Failed to create pid socket pair: %m");
4787 
4788         if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uuid_socket_pair) < 0)
4789                 return log_error_errno(errno, "Failed to create id socket pair: %m");
4790 
4791         if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, notify_socket_pair) < 0)
4792                 return log_error_errno(errno, "Failed to create notify socket pair: %m");
4793 
4794         if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, master_pty_socket_pair) < 0)
4795                 return log_error_errno(errno, "Failed to create console socket pair: %m");
4796 
4797         if (arg_userns_mode != USER_NAMESPACE_NO)
4798                 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, uid_shift_socket_pair) < 0)
4799                         return log_error_errno(errno, "Failed to create uid shift socket pair: %m");
4800 
4801         if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN)
4802                 if (socketpair(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0, unified_cgroup_hierarchy_socket_pair) < 0)
4803                         return log_error_errno(errno, "Failed to create unified cgroup socket pair: %m");
4804 
4805         /* Child can be killed before execv(), so handle SIGCHLD in order to interrupt
4806          * parent's blocking calls and give it a chance to call wait() and terminate. */
4807         r = sigprocmask(SIG_UNBLOCK, &mask_chld, NULL);
4808         if (r < 0)
4809                 return log_error_errno(errno, "Failed to change the signal mask: %m");
4810 
4811         r = sigaction(SIGCHLD, &sa, NULL);
4812         if (r < 0)
4813                 return log_error_errno(errno, "Failed to install SIGCHLD handler: %m");
4814 
4815         if (arg_network_namespace_path) {
4816                 child_netns_fd = open(arg_network_namespace_path, O_RDONLY|O_NOCTTY|O_CLOEXEC);
4817                 if (child_netns_fd < 0)
4818                         return log_error_errno(errno, "Cannot open file %s: %m", arg_network_namespace_path);
4819 
4820                 r = fd_is_ns(child_netns_fd, CLONE_NEWNET);
4821                 if (r == -EUCLEAN)
4822                         log_debug_errno(r, "Cannot determine if passed network namespace path '%s' really refers to a network namespace, assuming it does.", arg_network_namespace_path);
4823                 else if (r < 0)
4824                         return log_error_errno(r, "Failed to check %s fs type: %m", arg_network_namespace_path);
4825                 else if (r == 0)
4826                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
4827                                                "Path %s doesn't refer to a network namespace, refusing.", arg_network_namespace_path);
4828         }
4829 
4830         *pid = raw_clone(SIGCHLD|CLONE_NEWNS);
4831         if (*pid < 0)
4832                 return log_error_errno(errno, "clone() failed%s: %m",
4833                                        errno == EINVAL ?
4834                                        ", do you have namespace support enabled in your kernel? (You need UTS, IPC, PID and NET namespacing built in)" : "");
4835 
4836         if (*pid == 0) {
4837                 /* The outer child only has a file system namespace. */
4838                 barrier_set_role(&barrier, BARRIER_CHILD);
4839 
4840                 kmsg_socket_pair[0] = safe_close(kmsg_socket_pair[0]);
4841                 rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
4842                 pid_socket_pair[0] = safe_close(pid_socket_pair[0]);
4843                 uuid_socket_pair[0] = safe_close(uuid_socket_pair[0]);
4844                 notify_socket_pair[0] = safe_close(notify_socket_pair[0]);
4845                 master_pty_socket_pair[0] = safe_close(master_pty_socket_pair[0]);
4846                 uid_shift_socket_pair[0] = safe_close(uid_shift_socket_pair[0]);
4847                 unified_cgroup_hierarchy_socket_pair[0] = safe_close(unified_cgroup_hierarchy_socket_pair[0]);
4848 
4849                 (void) reset_all_signal_handlers();
4850                 (void) reset_signal_mask();
4851 
4852                 r = outer_child(&barrier,
4853                                 arg_directory,
4854                                 dissected_image,
4855                                 secondary,
4856                                 pid_socket_pair[1],
4857                                 uuid_socket_pair[1],
4858                                 notify_socket_pair[1],
4859                                 kmsg_socket_pair[1],
4860                                 rtnl_socket_pair[1],
4861                                 uid_shift_socket_pair[1],
4862                                 master_pty_socket_pair[1],
4863                                 unified_cgroup_hierarchy_socket_pair[1],
4864                                 fds,
4865                                 child_netns_fd);
4866                 if (r < 0)
4867                         _exit(EXIT_FAILURE);
4868 
4869                 _exit(EXIT_SUCCESS);
4870         }
4871 
4872         barrier_set_role(&barrier, BARRIER_PARENT);
4873 
4874         fdset_close(fds);
4875 
4876         kmsg_socket_pair[1] = safe_close(kmsg_socket_pair[1]);
4877         rtnl_socket_pair[1] = safe_close(rtnl_socket_pair[1]);
4878         pid_socket_pair[1] = safe_close(pid_socket_pair[1]);
4879         uuid_socket_pair[1] = safe_close(uuid_socket_pair[1]);
4880         notify_socket_pair[1] = safe_close(notify_socket_pair[1]);
4881         master_pty_socket_pair[1] = safe_close(master_pty_socket_pair[1]);
4882         uid_shift_socket_pair[1] = safe_close(uid_shift_socket_pair[1]);
4883         unified_cgroup_hierarchy_socket_pair[1] = safe_close(unified_cgroup_hierarchy_socket_pair[1]);
4884 
4885         if (arg_userns_mode != USER_NAMESPACE_NO) {
4886                 /* The child just let us know the UID shift it might have read from the image. */
4887                 l = recv(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, 0);
4888                 if (l < 0)
4889                         return log_error_errno(errno, "Failed to read UID shift: %m");
4890                 if (l != sizeof arg_uid_shift)
4891                         return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading UID shift.");
4892 
4893                 if (arg_userns_mode == USER_NAMESPACE_PICK) {
4894                         /* If we are supposed to pick the UID shift, let's try to use the shift read from the
4895                          * image, but if that's already in use, pick a new one, and report back to the child,
4896                          * which one we now picked. */
4897 
4898                         r = uid_shift_pick(&arg_uid_shift, &uid_shift_lock);
4899                         if (r < 0)
4900                                 return log_error_errno(r, "Failed to pick suitable UID/GID range: %m");
4901 
4902                         l = send(uid_shift_socket_pair[0], &arg_uid_shift, sizeof arg_uid_shift, MSG_NOSIGNAL);
4903                         if (l < 0)
4904                                 return log_error_errno(errno, "Failed to send UID shift: %m");
4905                         if (l != sizeof arg_uid_shift)
4906                                 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short write while writing UID shift.");
4907                 }
4908 
4909                 n_bind_user_uid = strv_length(arg_bind_user);
4910                 if (n_bind_user_uid > 0) {
4911                         /* Right after the UID shift, we'll receive the list of UID mappings for the
4912                          * --bind-user= logic. Always a quadruplet of payload and host UID + GID. */
4913 
4914                         bind_user_uid = new(uid_t, n_bind_user_uid*4);
4915                         if (!bind_user_uid)
4916                                 return log_oom();
4917 
4918                         for (size_t i = 0; i < n_bind_user_uid; i++) {
4919                                 l = recv(uid_shift_socket_pair[0], bind_user_uid + i*4, sizeof(uid_t)*4, 0);
4920                                 if (l < 0)
4921                                         return log_error_errno(errno, "Failed to read user UID map pair: %m");
4922                                 if (l != sizeof(uid_t)*4)
4923                                         return log_full_errno(l == 0 ? LOG_DEBUG : LOG_WARNING,
4924                                                               SYNTHETIC_ERRNO(EIO),
4925                                                               "Short read while reading bind user UID pairs.");
4926                         }
4927                 }
4928         }
4929 
4930         if (arg_unified_cgroup_hierarchy == CGROUP_UNIFIED_UNKNOWN) {
4931                 /* The child let us know the support cgroup mode it might have read from the image. */
4932                 l = recv(unified_cgroup_hierarchy_socket_pair[0], &arg_unified_cgroup_hierarchy, sizeof(arg_unified_cgroup_hierarchy), 0);
4933                 if (l < 0)
4934                         return log_error_errno(errno, "Failed to read cgroup mode: %m");
4935                 if (l != sizeof(arg_unified_cgroup_hierarchy))
4936                         return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading cgroup mode (%zu bytes).%s",
4937                                                l, l == 0 ? " The child is most likely dead." : "");
4938         }
4939 
4940         /* Wait for the outer child. */
4941         r = wait_for_terminate_and_check("(sd-namespace)", *pid, WAIT_LOG_ABNORMAL);
4942         if (r < 0)
4943                 return r;
4944         if (r != EXIT_SUCCESS)
4945                 return -EIO;
4946 
4947         /* And now retrieve the PID of the inner child. */
4948         l = recv(pid_socket_pair[0], pid, sizeof *pid, 0);
4949         if (l < 0)
4950                 return log_error_errno(errno, "Failed to read inner child PID: %m");
4951         if (l != sizeof *pid)
4952                 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading inner child PID.");
4953 
4954         /* We also retrieve container UUID in case it was generated by outer child */
4955         l = recv(uuid_socket_pair[0], &arg_uuid, sizeof arg_uuid, 0);
4956         if (l < 0)
4957                 return log_error_errno(errno, "Failed to read container machine ID: %m");
4958         if (l != sizeof(arg_uuid))
4959                 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Short read while reading container machined ID.");
4960 
4961         /* We also retrieve the socket used for notifications generated by outer child */
4962         notify_socket = receive_one_fd(notify_socket_pair[0], 0);
4963         if (notify_socket < 0)
4964                 return log_error_errno(notify_socket,
4965                                        "Failed to receive notification socket from the outer child: %m");
4966 
4967         log_debug("Init process invoked as PID "PID_FMT, *pid);
4968 
4969         if (arg_userns_mode != USER_NAMESPACE_NO) {
4970                 if (!barrier_place_and_sync(&barrier)) /* #1 */
4971                         return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
4972 
4973                 r = setup_uid_map(*pid, bind_user_uid, n_bind_user_uid);
4974                 if (r < 0)
4975                         return r;
4976 
4977                 (void) barrier_place(&barrier); /* #2 */
4978         }
4979 
4980         if (arg_private_network) {
4981                 if (!arg_network_namespace_path) {
4982                         /* Wait until the child has unshared its network namespace. */
4983                         if (!barrier_place_and_sync(&barrier)) /* #3 */
4984                                 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early");
4985                 }
4986 
4987                 if (child_netns_fd < 0) {
4988                         /* Make sure we have an open file descriptor to the child's network
4989                          * namespace so it stays alive even if the child exits. */
4990                         r = namespace_open(*pid, NULL, NULL, &child_netns_fd, NULL, NULL);
4991                         if (r < 0)
4992                                 return log_error_errno(r, "Failed to open child network namespace: %m");
4993                 }
4994 
4995                 r = move_network_interfaces(child_netns_fd, arg_network_interfaces);
4996                 if (r < 0)
4997                         return r;
4998 
4999                 if (arg_network_veth) {
5000                         r = setup_veth(arg_machine, *pid, veth_name,
5001                                        arg_network_bridge || arg_network_zone);
5002                         if (r < 0)
5003                                 return r;
5004                         else if (r > 0)
5005                                 ifi = r;
5006 
5007                         if (arg_network_bridge) {
5008                                 /* Add the interface to a bridge */
5009                                 r = setup_bridge(veth_name, arg_network_bridge, false);
5010                                 if (r < 0)
5011                                         return r;
5012                                 if (r > 0)
5013                                         ifi = r;
5014                         } else if (arg_network_zone) {
5015                                 /* Add the interface to a bridge, possibly creating it */
5016                                 r = setup_bridge(veth_name, arg_network_zone, true);
5017                                 if (r < 0)
5018                                         return r;
5019                                 if (r > 0)
5020                                         ifi = r;
5021                         }
5022                 }
5023 
5024                 r = setup_veth_extra(arg_machine, *pid, arg_network_veth_extra);
5025                 if (r < 0)
5026                         return r;
5027 
5028                 /* We created the primary and extra veth links now; let's remember this, so that we know to
5029                    remove them later on. Note that we don't bother with removing veth links that were created
5030                    here when their setup failed half-way, because in that case the kernel should be able to
5031                    remove them on its own, since they cannot be referenced by anything yet. */
5032                 *veth_created = true;
5033 
5034                 r = setup_macvlan(arg_machine, *pid, arg_network_macvlan);
5035                 if (r < 0)
5036                         return r;
5037 
5038                 r = setup_ipvlan(arg_machine, *pid, arg_network_ipvlan);
5039                 if (r < 0)
5040                         return r;
5041         }
5042 
5043         if (arg_register || !arg_keep_unit) {
5044                 r = sd_bus_default_system(&bus);
5045                 if (r < 0)
5046                         return log_error_errno(r, "Failed to open system bus: %m");
5047 
5048                 r = sd_bus_set_close_on_exit(bus, false);
5049                 if (r < 0)
5050                         return log_error_errno(r, "Failed to disable close-on-exit behaviour: %m");
5051         }
5052 
5053         if (!arg_keep_unit) {
5054                 /* When a new scope is created for this container, then we'll be registered as its controller, in which
5055                  * case PID 1 will send us a friendly RequestStop signal, when it is asked to terminate the
5056                  * scope. Let's hook into that, and cleanly shut down the container, and print a friendly message. */
5057 
5058                 r = sd_bus_match_signal_async(
5059                                 bus,
5060                                 NULL,
5061                                 "org.freedesktop.systemd1",
5062                                 NULL,
5063                                 "org.freedesktop.systemd1.Scope",
5064                                 "RequestStop",
5065                                 on_request_stop, NULL, PID_TO_PTR(*pid));
5066                 if (r < 0)
5067                         return log_error_errno(r, "Failed to request RequestStop match: %m");
5068         }
5069 
5070         if (arg_register) {
5071                 r = register_machine(
5072                                 bus,
5073                                 arg_machine,
5074                                 *pid,
5075                                 arg_directory,
5076                                 arg_uuid,
5077                                 ifi,
5078                                 arg_slice,
5079                                 arg_custom_mounts, arg_n_custom_mounts,
5080                                 arg_kill_signal,
5081                                 arg_property,
5082                                 arg_property_message,
5083                                 arg_keep_unit,
5084                                 arg_container_service_name);
5085                 if (r < 0)
5086                         return r;
5087 
5088         } else if (!arg_keep_unit) {
5089                 r = allocate_scope(
5090                                 bus,
5091                                 arg_machine,
5092                                 *pid,
5093                                 arg_slice,
5094                                 arg_custom_mounts, arg_n_custom_mounts,
5095                                 arg_kill_signal,
5096                                 arg_property,
5097                                 arg_property_message);
5098                 if (r < 0)
5099                         return r;
5100 
5101         } else if (arg_slice || arg_property)
5102                 log_notice("Machine and scope registration turned off, --slice= and --property= settings will have no effect.");
5103 
5104         r = create_subcgroup(*pid, arg_keep_unit, arg_unified_cgroup_hierarchy);
5105         if (r < 0)
5106                 return r;
5107 
5108         r = sync_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
5109         if (r < 0)
5110                 return r;
5111 
5112         r = chown_cgroup(*pid, arg_unified_cgroup_hierarchy, arg_uid_shift);
5113         if (r < 0)
5114                 return r;
5115 
5116         /* Notify the child that the parent is ready with all
5117          * its setup (including cgroup-ification), and that
5118          * the child can now hand over control to the code to
5119          * run inside the container. */
5120         (void) barrier_place(&barrier); /* #4 */
5121 
5122         /* Block SIGCHLD here, before notifying child.
5123          * process_pty() will handle it with the other signals. */
5124         assert_se(sigprocmask(SIG_BLOCK, &mask_chld, NULL) >= 0);
5125 
5126         /* Reset signal to default */
5127         r = default_signals(SIGCHLD);
5128         if (r < 0)
5129                 return log_error_errno(r, "Failed to reset SIGCHLD: %m");
5130 
5131         r = sd_event_new(&event);
5132         if (r < 0)
5133                 return log_error_errno(r, "Failed to get default event source: %m");
5134 
5135         (void) sd_event_set_watchdog(event, true);
5136 
5137         if (bus) {
5138                 r = sd_bus_attach_event(bus, event, 0);
5139                 if (r < 0)
5140                         return log_error_errno(r, "Failed to attach bus to event loop: %m");
5141         }
5142 
5143         r = setup_notify_parent(event, notify_socket, PID_TO_PTR(*pid), &notify_event_source);
5144         if (r < 0)
5145                 return r;
5146 
5147         /* Let the child know that we are ready and wait that the child is completely ready now. */
5148         if (!barrier_place_and_sync(&barrier)) /* #5 */
5149                 return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "Child died too early.");
5150 
5151         /* At this point we have made use of the UID we picked, and thus nss-systemd/systemd-machined.service
5152          * will make them appear in getpwuid(), thus we can release the /etc/passwd lock. */
5153         etc_passwd_lock = safe_close(etc_passwd_lock);
5154 
5155         (void) sd_notifyf(false,
5156                           "STATUS=Container running.\n"
5157                           "X_NSPAWN_LEADER_PID=" PID_FMT, *pid);
5158         if (!arg_notify_ready) {
5159                 r = sd_notify(false, "READY=1\n");
5160                 if (r < 0)
5161                         log_warning_errno(r, "Failed to send readiness notification, ignoring: %m");
5162         }
5163 
5164         if (arg_kill_signal > 0) {
5165                 /* Try to kill the init system on SIGINT or SIGTERM */
5166                 (void) sd_event_add_signal(event, NULL, SIGINT, on_orderly_shutdown, PID_TO_PTR(*pid));
5167                 (void) sd_event_add_signal(event, NULL, SIGTERM, on_orderly_shutdown, PID_TO_PTR(*pid));
5168         } else {
5169                 /* Immediately exit */
5170                 (void) sd_event_add_signal(event, NULL, SIGINT, NULL, NULL);
5171                 (void) sd_event_add_signal(event, NULL, SIGTERM, NULL, NULL);
5172         }
5173 
5174         /* Exit when the child exits */
5175         (void) sd_event_add_signal(event, NULL, SIGCHLD, on_sigchld, PID_TO_PTR(*pid));
5176 
5177         if (arg_expose_ports) {
5178                 r = expose_port_watch_rtnl(event, rtnl_socket_pair[0], on_address_change, expose_args, &rtnl);
5179                 if (r < 0)
5180                         return r;
5181 
5182                 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5183                 (void) expose_port_execute(rtnl, &expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
5184         }
5185 
5186         rtnl_socket_pair[0] = safe_close(rtnl_socket_pair[0]);
5187 
5188         if (arg_console_mode != CONSOLE_PIPE) {
5189                 _cleanup_close_ int fd = -1;
5190                 PTYForwardFlags flags = 0;
5191 
5192                 /* Retrieve the master pty allocated by inner child */
5193                 fd = receive_one_fd(master_pty_socket_pair[0], 0);
5194                 if (fd < 0)
5195                         return log_error_errno(fd, "Failed to receive master pty from the inner child: %m");
5196 
5197                 switch (arg_console_mode) {
5198 
5199                 case CONSOLE_READ_ONLY:
5200                         flags |= PTY_FORWARD_READ_ONLY;
5201 
5202                         _fallthrough_;
5203 
5204                 case CONSOLE_INTERACTIVE:
5205                         flags |= PTY_FORWARD_IGNORE_VHANGUP;
5206 
5207                         r = pty_forward_new(event, fd, flags, &forward);
5208                         if (r < 0)
5209                                 return log_error_errno(r, "Failed to create PTY forwarder: %m");
5210 
5211                         if (arg_console_width != UINT_MAX || arg_console_height != UINT_MAX)
5212                                 (void) pty_forward_set_width_height(forward,
5213                                                                     arg_console_width,
5214                                                                     arg_console_height);
5215                         break;
5216 
5217                 default:
5218                         assert(arg_console_mode == CONSOLE_PASSIVE);
5219                 }
5220 
5221                 *master = TAKE_FD(fd);
5222         }
5223 
5224         r = sd_event_loop(event);
5225         if (r < 0)
5226                 return log_error_errno(r, "Failed to run event loop: %m");
5227 
5228         if (forward) {
5229                 char last_char = 0;
5230 
5231                 (void) pty_forward_get_last_char(forward, &last_char);
5232                 forward = pty_forward_free(forward);
5233 
5234                 if (!arg_quiet && last_char != '\n')
5235                         putc('\n', stdout);
5236         }
5237 
5238         /* Kill if it is not dead yet anyway */
5239         if (!arg_register && !arg_keep_unit && bus)
5240                 terminate_scope(bus, arg_machine);
5241 
5242         /* Normally redundant, but better safe than sorry */
5243         (void) kill(*pid, SIGKILL);
5244 
5245         if (arg_private_network) {
5246                 /* Move network interfaces back to the parent network namespace. We use `safe_fork`
5247                  * to avoid having to move the parent to the child network namespace. */
5248                 r = safe_fork(NULL, FORK_RESET_SIGNALS|FORK_DEATHSIG|FORK_WAIT|FORK_LOG, NULL);
5249                 if (r < 0)
5250                         return r;
5251 
5252                 if (r == 0) {
5253                         _cleanup_close_ int parent_netns_fd = -1;
5254 
5255                         r = namespace_open(getpid(), NULL, NULL, &parent_netns_fd, NULL, NULL);
5256                         if (r < 0) {
5257                                 log_error_errno(r, "Failed to open parent network namespace: %m");
5258                                 _exit(EXIT_FAILURE);
5259                         }
5260 
5261                         r = namespace_enter(-1, -1, child_netns_fd, -1, -1);
5262                         if (r < 0) {
5263                                 log_error_errno(r, "Failed to enter child network namespace: %m");
5264                                 _exit(EXIT_FAILURE);
5265                         }
5266 
5267                         r = move_network_interfaces(parent_netns_fd, arg_network_interfaces);
5268                         if (r < 0)
5269                                 log_error_errno(r, "Failed to move network interfaces back to parent network namespace: %m");
5270 
5271                         _exit(r < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
5272                 }
5273         }
5274 
5275         r = wait_for_container(TAKE_PID(*pid), &container_status);
5276 
5277         /* Tell machined that we are gone. */
5278         if (bus)
5279                 (void) unregister_machine(bus, arg_machine);
5280 
5281         if (r < 0)
5282                 /* We failed to wait for the container, or the container exited abnormally. */
5283                 return r;
5284         if (r > 0 || container_status == CONTAINER_TERMINATED) {
5285                 /* r > 0 → The container exited with a non-zero status.
5286                  *         As a special case, we need to replace 133 with a different value,
5287                  *         because 133 is special-cased in the service file to reboot the container.
5288                  * otherwise → The container exited with zero status and a reboot was not requested.
5289                  */
5290                 if (r == EXIT_FORCE_RESTART)
5291                         r = EXIT_FAILURE; /* replace 133 with the general failure code */
5292                 *ret = r;
5293                 return 0; /* finito */
5294         }
5295 
5296         /* CONTAINER_REBOOTED, loop again */
5297 
5298         if (arg_keep_unit) {
5299                 /* Special handling if we are running as a service: instead of simply
5300                  * restarting the machine we want to restart the entire service, so let's
5301                  * inform systemd about this with the special exit code 133. The service
5302                  * file uses RestartForceExitStatus=133 so that this results in a full
5303                  * nspawn restart. This is necessary since we might have cgroup parameters
5304                  * set we want to have flushed out. */
5305                 *ret = EXIT_FORCE_RESTART;
5306                 return 0; /* finito */
5307         }
5308 
5309         expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET, &expose_args->address4);
5310         expose_port_flush(&expose_args->fw_ctx, arg_expose_ports, AF_INET6, &expose_args->address6);
5311 
5312         (void) remove_veth_links(veth_name, arg_network_veth_extra);
5313         *veth_created = false;
5314         return 1; /* loop again */
5315 }
5316 
initialize_rlimits(void)5317 static int initialize_rlimits(void) {
5318         /* The default resource limits the kernel passes to PID 1, as per kernel 5.16. Let's pass our container payload
5319          * the same values as the kernel originally passed to PID 1, in order to minimize differences between host and
5320          * container execution environments. */
5321 
5322         static const struct rlimit kernel_defaults[_RLIMIT_MAX] = {
5323                 [RLIMIT_AS]       = { RLIM_INFINITY,          RLIM_INFINITY          },
5324                 [RLIMIT_CORE]     = { 0,                      RLIM_INFINITY          },
5325                 [RLIMIT_CPU]      = { RLIM_INFINITY,          RLIM_INFINITY          },
5326                 [RLIMIT_DATA]     = { RLIM_INFINITY,          RLIM_INFINITY          },
5327                 [RLIMIT_FSIZE]    = { RLIM_INFINITY,          RLIM_INFINITY          },
5328                 [RLIMIT_LOCKS]    = { RLIM_INFINITY,          RLIM_INFINITY          },
5329                 [RLIMIT_MEMLOCK]  = { DEFAULT_RLIMIT_MEMLOCK, DEFAULT_RLIMIT_MEMLOCK },
5330                 [RLIMIT_MSGQUEUE] = { 819200,                 819200                 },
5331                 [RLIMIT_NICE]     = { 0,                      0                      },
5332                 [RLIMIT_NOFILE]   = { 1024,                   4096                   },
5333                 [RLIMIT_RSS]      = { RLIM_INFINITY,          RLIM_INFINITY          },
5334                 [RLIMIT_RTPRIO]   = { 0,                      0                      },
5335                 [RLIMIT_RTTIME]   = { RLIM_INFINITY,          RLIM_INFINITY          },
5336                 [RLIMIT_STACK]    = { 8388608,                RLIM_INFINITY          },
5337 
5338                 /* The kernel scales the default for RLIMIT_NPROC and RLIMIT_SIGPENDING based on the system's amount of
5339                  * RAM. To provide best compatibility we'll read these limits off PID 1 instead of hardcoding them
5340                  * here. This is safe as we know that PID 1 doesn't change these two limits and thus the original
5341                  * kernel's initialization should still be valid during runtime — at least if PID 1 is systemd. Note
5342                  * that PID 1 changes a number of other resource limits during early initialization which is why we
5343                  * don't read the other limits from PID 1 but prefer the static table above. */
5344         };
5345 
5346         int rl;
5347 
5348         for (rl = 0; rl < _RLIMIT_MAX; rl++) {
5349                 /* Let's only fill in what the user hasn't explicitly configured anyway */
5350                 if ((arg_settings_mask & (SETTING_RLIMIT_FIRST << rl)) == 0) {
5351                         const struct rlimit *v;
5352                         struct rlimit buffer;
5353 
5354                         if (IN_SET(rl, RLIMIT_NPROC, RLIMIT_SIGPENDING)) {
5355                                 /* For these two let's read the limits off PID 1. See above for an explanation. */
5356 
5357                                 if (prlimit(1, rl, NULL, &buffer) < 0)
5358                                         return log_error_errno(errno, "Failed to read resource limit RLIMIT_%s of PID 1: %m", rlimit_to_string(rl));
5359 
5360                                 v = &buffer;
5361                         } else if (rl == RLIMIT_NOFILE) {
5362                                 /* We nowadays bump RLIMIT_NOFILE's hard limit early in PID 1 for all
5363                                  * userspace. Given that nspawn containers are often run without our PID 1,
5364                                  * let's grant the containers a raised RLIMIT_NOFILE hard limit by default,
5365                                  * so that container userspace gets similar resources as host userspace
5366                                  * gets. */
5367                                 buffer = kernel_defaults[rl];
5368                                 buffer.rlim_max = MIN((rlim_t) read_nr_open(), (rlim_t) HIGH_RLIMIT_NOFILE);
5369                                 v = &buffer;
5370                         } else
5371                                 v = kernel_defaults + rl;
5372 
5373                         arg_rlimit[rl] = newdup(struct rlimit, v, 1);
5374                         if (!arg_rlimit[rl])
5375                                 return log_oom();
5376                 }
5377 
5378                 if (DEBUG_LOGGING) {
5379                         _cleanup_free_ char *k = NULL;
5380 
5381                         (void) rlimit_format(arg_rlimit[rl], &k);
5382                         log_debug("Setting RLIMIT_%s to %s.", rlimit_to_string(rl), k);
5383                 }
5384         }
5385 
5386         return 0;
5387 }
5388 
cant_be_in_netns(void)5389 static int cant_be_in_netns(void) {
5390         union sockaddr_union sa = {
5391                 .un = {
5392                         .sun_family = AF_UNIX,
5393                         .sun_path = "/run/udev/control",
5394                 },
5395         };
5396         char udev_path[STRLEN("/proc//ns/net") + DECIMAL_STR_MAX(pid_t)];
5397         _cleanup_free_ char *udev_ns = NULL, *our_ns = NULL;
5398         _cleanup_close_ int fd = -1;
5399         struct ucred ucred;
5400         int r;
5401 
5402         /* Check if we are in the same netns as udev. If we aren't, then device monitoring (and thus waiting
5403          * for loopback block devices) won't work, and we will hang. Detect this case and exit early with a
5404          * nice message. */
5405 
5406         if (!arg_image) /* only matters if --image= us used, i.e. we actually need to use loopback devices */
5407                 return 0;
5408 
5409         fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_NONBLOCK|SOCK_CLOEXEC, 0);
5410         if (fd < 0)
5411                 return log_error_errno(errno, "Failed to allocate udev control socket: %m");
5412 
5413         if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) {
5414 
5415                 if (errno == ENOENT || ERRNO_IS_DISCONNECT(errno))
5416                         return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5417                                                "Sorry, but --image= requires access to the host's /run/ hierarchy, since we need access to udev.");
5418 
5419                 return log_error_errno(errno, "Failed to connect socket to udev control socket: %m");
5420         }
5421 
5422         r = getpeercred(fd, &ucred);
5423         if (r < 0)
5424                 return log_error_errno(r, "Failed to determine peer of udev control socket: %m");
5425 
5426         xsprintf(udev_path, "/proc/" PID_FMT "/ns/net", ucred.pid);
5427         r = readlink_malloc(udev_path, &udev_ns);
5428         if (r < 0)
5429                 return log_error_errno(r, "Failed to read network namespace of udev: %m");
5430 
5431         r = readlink_malloc("/proc/self/ns/net", &our_ns);
5432         if (r < 0)
5433                 return log_error_errno(r, "Failed to read our own network namespace: %m");
5434 
5435         if (!streq(our_ns, udev_ns))
5436                 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
5437                                        "Sorry, but --image= is only supported in the main network namespace, since we need access to udev/AF_NETLINK.");
5438         return 0;
5439 }
5440 
run(int argc,char * argv[])5441 static int run(int argc, char *argv[]) {
5442         bool secondary = false, remove_directory = false, remove_image = false,
5443                 veth_created = false, remove_tmprootdir = false;
5444         _cleanup_close_ int master = -1;
5445         _cleanup_fdset_free_ FDSet *fds = NULL;
5446         int r, n_fd_passed, ret = EXIT_SUCCESS;
5447         char veth_name[IFNAMSIZ] = "";
5448         struct ExposeArgs expose_args = {};
5449         _cleanup_(release_lock_file) LockFile tree_global_lock = LOCK_FILE_INIT, tree_local_lock = LOCK_FILE_INIT;
5450         char tmprootdir[] = "/tmp/nspawn-root-XXXXXX";
5451         _cleanup_(loop_device_unrefp) LoopDevice *loop = NULL;
5452         _cleanup_(decrypted_image_unrefp) DecryptedImage *decrypted_image = NULL;
5453         _cleanup_(dissected_image_unrefp) DissectedImage *dissected_image = NULL;
5454         _cleanup_(fw_ctx_freep) FirewallContext *fw_ctx = NULL;
5455         pid_t pid = 0;
5456 
5457         log_parse_environment();
5458         log_open();
5459 
5460         r = parse_argv(argc, argv);
5461         if (r <= 0)
5462                 goto finish;
5463 
5464         if (geteuid() != 0) {
5465                 r = log_warning_errno(SYNTHETIC_ERRNO(EPERM),
5466                                       argc >= 2 ? "Need to be root." :
5467                                       "Need to be root (and some arguments are usually required).\nHint: try --help");
5468                 goto finish;
5469         }
5470 
5471         r = cant_be_in_netns();
5472         if (r < 0)
5473                 goto finish;
5474 
5475         r = initialize_rlimits();
5476         if (r < 0)
5477                 goto finish;
5478 
5479         r = load_oci_bundle();
5480         if (r < 0)
5481                 goto finish;
5482 
5483         r = determine_names();
5484         if (r < 0)
5485                 goto finish;
5486 
5487         r = load_settings();
5488         if (r < 0)
5489                 goto finish;
5490 
5491         r = cg_unified();
5492         if (r < 0) {
5493                 log_error_errno(r, "Failed to determine whether the unified cgroups hierarchy is used: %m");
5494                 goto finish;
5495         }
5496 
5497         r = verify_arguments();
5498         if (r < 0)
5499                 goto finish;
5500 
5501         /* Reapply environment settings. */
5502         (void) detect_unified_cgroup_hierarchy_from_environment();
5503 
5504         /* Ignore SIGPIPE here, because we use splice() on the ptyfwd stuff and that will generate SIGPIPE if
5505          * the result is closed. Note that the container payload child will reset signal mask+handler anyway,
5506          * so just turning this off here means we only turn it off in nspawn itself, not any children. */
5507         (void) ignore_signals(SIGPIPE);
5508 
5509         n_fd_passed = sd_listen_fds(false);
5510         if (n_fd_passed > 0) {
5511                 r = fdset_new_listen_fds(&fds, false);
5512                 if (r < 0) {
5513                         log_error_errno(r, "Failed to collect file descriptors: %m");
5514                         goto finish;
5515                 }
5516         }
5517 
5518         /* The "default" umask. This is appropriate for most file and directory
5519         * operations performed by nspawn, and is the umask that will be used for
5520         * the child. Functions like copy_devnodes() change the umask temporarily. */
5521         umask(0022);
5522 
5523         if (arg_directory) {
5524                 assert(!arg_image);
5525 
5526                 /* Safety precaution: let's not allow running images from the live host OS image, as long as
5527                  * /var from the host will propagate into container dynamically (because bad things happen if
5528                  * two systems write to the same /var). Let's allow it for the special cases where /var is
5529                  * either copied (i.e. --ephemeral) or replaced (i.e. --volatile=yes|state). */
5530                 if (path_equal(arg_directory, "/") && !(arg_ephemeral || IN_SET(arg_volatile_mode, VOLATILE_YES, VOLATILE_STATE))) {
5531                         log_error("Spawning container on root directory is not supported. Consider using --ephemeral, --volatile=yes or --volatile=state.");
5532                         r = -EINVAL;
5533                         goto finish;
5534                 }
5535 
5536                 if (arg_ephemeral) {
5537                         _cleanup_free_ char *np = NULL;
5538 
5539                         r = chase_symlinks_and_update(&arg_directory, 0);
5540                         if (r < 0)
5541                                 goto finish;
5542 
5543                         /* If the specified path is a mount point we generate the new snapshot immediately
5544                          * inside it under a random name. However if the specified is not a mount point we
5545                          * create the new snapshot in the parent directory, just next to it. */
5546                         r = path_is_mount_point(arg_directory, NULL, 0);
5547                         if (r < 0) {
5548                                 log_error_errno(r, "Failed to determine whether directory %s is mount point: %m", arg_directory);
5549                                 goto finish;
5550                         }
5551                         if (r > 0)
5552                                 r = tempfn_random_child(arg_directory, "machine.", &np);
5553                         else
5554                                 r = tempfn_random(arg_directory, "machine.", &np);
5555                         if (r < 0) {
5556                                 log_error_errno(r, "Failed to generate name for directory snapshot: %m");
5557                                 goto finish;
5558                         }
5559 
5560                         /* We take an exclusive lock on this image, since it's our private, ephemeral copy
5561                          * only owned by us and no one else. */
5562                         r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
5563                         if (r < 0) {
5564                                 log_error_errno(r, "Failed to lock %s: %m", np);
5565                                 goto finish;
5566                         }
5567 
5568                         {
5569                                 BLOCK_SIGNALS(SIGINT);
5570                                 r = btrfs_subvol_snapshot(arg_directory, np,
5571                                                           (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5572                                                           BTRFS_SNAPSHOT_FALLBACK_COPY |
5573                                                           BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5574                                                           BTRFS_SNAPSHOT_RECURSIVE |
5575                                                           BTRFS_SNAPSHOT_QUOTA |
5576                                                           BTRFS_SNAPSHOT_SIGINT);
5577                         }
5578                         if (r == -EINTR) {
5579                                 log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", np);
5580                                 goto finish;
5581                         }
5582                         if (r < 0) {
5583                                 log_error_errno(r, "Failed to create snapshot %s from %s: %m", np, arg_directory);
5584                                 goto finish;
5585                         }
5586 
5587                         free_and_replace(arg_directory, np);
5588                         remove_directory = true;
5589                 } else {
5590                         r = chase_symlinks_and_update(&arg_directory, arg_template ? CHASE_NONEXISTENT : 0);
5591                         if (r < 0)
5592                                 goto finish;
5593 
5594                         r = image_path_lock(arg_directory, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5595                         if (r == -EBUSY) {
5596                                 log_error_errno(r, "Directory tree %s is currently busy.", arg_directory);
5597                                 goto finish;
5598                         }
5599                         if (r < 0) {
5600                                 log_error_errno(r, "Failed to lock %s: %m", arg_directory);
5601                                 goto finish;
5602                         }
5603 
5604                         if (arg_template) {
5605                                 r = chase_symlinks_and_update(&arg_template, 0);
5606                                 if (r < 0)
5607                                         goto finish;
5608 
5609                                 {
5610                                         BLOCK_SIGNALS(SIGINT);
5611                                         r = btrfs_subvol_snapshot(arg_template, arg_directory,
5612                                                                   (arg_read_only ? BTRFS_SNAPSHOT_READ_ONLY : 0) |
5613                                                                   BTRFS_SNAPSHOT_FALLBACK_COPY |
5614                                                                   BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
5615                                                                   BTRFS_SNAPSHOT_FALLBACK_IMMUTABLE |
5616                                                                   BTRFS_SNAPSHOT_RECURSIVE |
5617                                                                   BTRFS_SNAPSHOT_QUOTA |
5618                                                                   BTRFS_SNAPSHOT_SIGINT);
5619                                 }
5620                                 if (r == -EEXIST)
5621                                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5622                                                  "Directory %s already exists, not populating from template %s.", arg_directory, arg_template);
5623                                 else if (r == -EINTR) {
5624                                         log_error_errno(r, "Interrupted while copying file system tree to %s, removed again.", arg_directory);
5625                                         goto finish;
5626                                 } else if (r < 0) {
5627                                         log_error_errno(r, "Couldn't create snapshot %s from %s: %m", arg_directory, arg_template);
5628                                         goto finish;
5629                                 } else
5630                                         log_full(arg_quiet ? LOG_DEBUG : LOG_INFO,
5631                                                  "Populated %s from template %s.", arg_directory, arg_template);
5632                         }
5633                 }
5634 
5635                 if (arg_start_mode == START_BOOT) {
5636                         _cleanup_free_ char *b = NULL;
5637                         const char *p;
5638 
5639                         if (arg_pivot_root_new) {
5640                                 b = path_join(arg_directory, arg_pivot_root_new);
5641                                 if (!b)
5642                                         return log_oom();
5643 
5644                                 p = b;
5645                         } else
5646                                 p = arg_directory;
5647 
5648                         if (path_is_os_tree(p) <= 0) {
5649                                 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5650                                                     "Directory %s doesn't look like an OS root directory (os-release file is missing). Refusing.", p);
5651                                 goto finish;
5652                         }
5653                 } else {
5654                         _cleanup_free_ char *p = NULL;
5655 
5656                         if (arg_pivot_root_new)
5657                                 p = path_join(arg_directory, arg_pivot_root_new, "/usr/");
5658                         else
5659                                 p = path_join(arg_directory, "/usr/");
5660                         if (!p)
5661                                 return log_oom();
5662 
5663                         if (laccess(p, F_OK) < 0) {
5664                                 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
5665                                                     "Directory %s doesn't look like it has an OS tree (/usr/ directory is missing). Refusing.", arg_directory);
5666                                 goto finish;
5667                         }
5668                 }
5669 
5670         } else {
5671                 DissectImageFlags dissect_image_flags =
5672                         DISSECT_IMAGE_GENERIC_ROOT |
5673                         DISSECT_IMAGE_REQUIRE_ROOT |
5674                         DISSECT_IMAGE_RELAX_VAR_CHECK |
5675                         DISSECT_IMAGE_USR_NO_ROOT;
5676                 assert(arg_image);
5677                 assert(!arg_template);
5678 
5679                 r = chase_symlinks_and_update(&arg_image, 0);
5680                 if (r < 0)
5681                         goto finish;
5682 
5683                 if (arg_ephemeral)  {
5684                         _cleanup_free_ char *np = NULL;
5685 
5686                         r = tempfn_random(arg_image, "machine.", &np);
5687                         if (r < 0) {
5688                                 log_error_errno(r, "Failed to generate name for image snapshot: %m");
5689                                 goto finish;
5690                         }
5691 
5692                         /* Always take an exclusive lock on our own ephemeral copy. */
5693                         r = image_path_lock(np, LOCK_EX|LOCK_NB, &tree_global_lock, &tree_local_lock);
5694                         if (r < 0) {
5695                                 r = log_error_errno(r, "Failed to create image lock: %m");
5696                                 goto finish;
5697                         }
5698 
5699                         {
5700                                 BLOCK_SIGNALS(SIGINT);
5701                                 r = copy_file(arg_image, np, O_EXCL, arg_read_only ? 0400 : 0600, FS_NOCOW_FL, FS_NOCOW_FL, COPY_REFLINK|COPY_CRTIME|COPY_SIGINT);
5702                         }
5703                         if (r == -EINTR) {
5704                                 log_error_errno(r, "Interrupted while copying image file to %s, removed again.", np);
5705                                 goto finish;
5706                         }
5707                         if (r < 0) {
5708                                 r = log_error_errno(r, "Failed to copy image file: %m");
5709                                 goto finish;
5710                         }
5711 
5712                         free_and_replace(arg_image, np);
5713                         remove_image = true;
5714                 } else {
5715                         r = image_path_lock(arg_image, (arg_read_only ? LOCK_SH : LOCK_EX) | LOCK_NB, &tree_global_lock, &tree_local_lock);
5716                         if (r == -EBUSY) {
5717                                 r = log_error_errno(r, "Disk image %s is currently busy.", arg_image);
5718                                 goto finish;
5719                         }
5720                         if (r < 0) {
5721                                 r = log_error_errno(r, "Failed to create image lock: %m");
5722                                 goto finish;
5723                         }
5724 
5725                         r = verity_settings_load(
5726                                         &arg_verity_settings,
5727                                         arg_image, NULL, NULL);
5728                         if (r < 0) {
5729                                 log_error_errno(r, "Failed to read verity artefacts for %s: %m", arg_image);
5730                                 goto finish;
5731                         }
5732 
5733                         if (arg_verity_settings.data_path)
5734                                 dissect_image_flags |= DISSECT_IMAGE_NO_PARTITION_TABLE;
5735                 }
5736 
5737                 if (!mkdtemp(tmprootdir)) {
5738                         r = log_error_errno(errno, "Failed to create temporary directory: %m");
5739                         goto finish;
5740                 }
5741 
5742                 remove_tmprootdir = true;
5743 
5744                 arg_directory = strdup(tmprootdir);
5745                 if (!arg_directory) {
5746                         r = log_oom();
5747                         goto finish;
5748                 }
5749 
5750                 r = loop_device_make_by_path(
5751                                 arg_image,
5752                                 arg_read_only ? O_RDONLY : O_RDWR,
5753                                 FLAGS_SET(dissect_image_flags, DISSECT_IMAGE_NO_PARTITION_TABLE) ? 0 : LO_FLAGS_PARTSCAN,
5754                                 &loop);
5755                 if (r < 0) {
5756                         log_error_errno(r, "Failed to set up loopback block device: %m");
5757                         goto finish;
5758                 }
5759 
5760                 /* Take a LOCK_SH lock on the device, so that udevd doesn't issue BLKRRPART in our back */
5761                 r = loop_device_flock(loop, LOCK_SH);
5762                 if (r < 0) {
5763                         log_error_errno(r, "Failed to take lock on loopback block device: %m");
5764                         goto finish;
5765                 }
5766 
5767                 r = dissect_image_and_warn(
5768                                 loop->fd,
5769                                 arg_image,
5770                                 &arg_verity_settings,
5771                                 NULL,
5772                                 loop->diskseq,
5773                                 loop->uevent_seqnum_not_before,
5774                                 loop->timestamp_not_before,
5775                                 dissect_image_flags,
5776                                 &dissected_image);
5777                 if (r == -ENOPKG) {
5778                         /* dissected_image_and_warn() already printed a brief error message. Extend on that with more details */
5779                         log_notice("Note that the disk image needs to\n"
5780                                    "    a) either contain only a single MBR partition of type 0x83 that is marked bootable\n"
5781                                    "    b) or contain a single GPT partition of type 0FC63DAF-8483-4772-8E79-3D69D8477DE4\n"
5782                                    "    c) or follow https://systemd.io/DISCOVERABLE_PARTITIONS\n"
5783                                    "    d) or contain a file system without a partition table\n"
5784                                    "in order to be bootable with systemd-nspawn.");
5785                         goto finish;
5786                 }
5787                 if (r < 0)
5788                         goto finish;
5789 
5790                 r = dissected_image_load_verity_sig_partition(
5791                                 dissected_image,
5792                                 loop->fd,
5793                                 &arg_verity_settings);
5794                 if (r < 0)
5795                         goto finish;
5796 
5797                 if (dissected_image->has_verity && !arg_verity_settings.root_hash && !dissected_image->has_verity_sig)
5798                         log_notice("Note: image %s contains verity information, but no root hash specified and no embedded "
5799                                    "root hash signature found! Proceeding without integrity checking.", arg_image);
5800 
5801                 r = dissected_image_decrypt_interactively(
5802                                 dissected_image,
5803                                 NULL,
5804                                 &arg_verity_settings,
5805                                 0,
5806                                 &decrypted_image);
5807                 if (r < 0)
5808                         goto finish;
5809 
5810                 /* Now that we mounted the image, let's try to remove it again, if it is ephemeral */
5811                 if (remove_image && unlink(arg_image) >= 0)
5812                         remove_image = false;
5813         }
5814 
5815         r = custom_mount_prepare_all(arg_directory, arg_custom_mounts, arg_n_custom_mounts);
5816         if (r < 0)
5817                 goto finish;
5818 
5819         if (arg_console_mode < 0)
5820                 arg_console_mode =
5821                         isatty(STDIN_FILENO) > 0 &&
5822                         isatty(STDOUT_FILENO) > 0 ? CONSOLE_INTERACTIVE : CONSOLE_READ_ONLY;
5823 
5824         if (arg_console_mode == CONSOLE_PIPE) /* if we pass STDERR on to the container, don't add our own logs into it too */
5825                 arg_quiet = true;
5826 
5827         if (!arg_quiet)
5828                 log_info("Spawning container %s on %s.\nPress ^] three times within 1s to kill container.",
5829                          arg_machine, arg_image ?: arg_directory);
5830 
5831         assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGCHLD, SIGWINCH, SIGTERM, SIGINT, -1) >= 0);
5832 
5833         if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0) < 0) {
5834                 r = log_error_errno(errno, "Failed to become subreaper: %m");
5835                 goto finish;
5836         }
5837 
5838         if (arg_expose_ports) {
5839                 r = fw_ctx_new(&fw_ctx);
5840                 if (r < 0) {
5841                         log_error_errno(r, "Cannot expose configured ports, firewall initialization failed: %m");
5842                         goto finish;
5843                 }
5844                 expose_args.fw_ctx = fw_ctx;
5845         }
5846         for (;;) {
5847                 r = run_container(dissected_image,
5848                                   secondary,
5849                                   fds,
5850                                   veth_name, &veth_created,
5851                                   &expose_args, &master,
5852                                   &pid, &ret);
5853                 if (r <= 0)
5854                         break;
5855         }
5856 
5857 finish:
5858         (void) sd_notify(false,
5859                          r == 0 && ret == EXIT_FORCE_RESTART ? "STOPPING=1\nSTATUS=Restarting..." :
5860                                                                "STOPPING=1\nSTATUS=Terminating...");
5861 
5862         if (pid > 0)
5863                 (void) kill(pid, SIGKILL);
5864 
5865         /* Try to flush whatever is still queued in the pty */
5866         if (master >= 0) {
5867                 (void) copy_bytes(master, STDOUT_FILENO, UINT64_MAX, 0);
5868                 master = safe_close(master);
5869         }
5870 
5871         if (pid > 0)
5872                 (void) wait_for_terminate(pid, NULL);
5873 
5874         pager_close();
5875 
5876         if (remove_directory && arg_directory) {
5877                 int k;
5878 
5879                 k = rm_rf(arg_directory, REMOVE_ROOT|REMOVE_PHYSICAL|REMOVE_SUBVOLUME);
5880                 if (k < 0)
5881                         log_warning_errno(k, "Cannot remove '%s', ignoring: %m", arg_directory);
5882         }
5883 
5884         if (remove_image && arg_image) {
5885                 if (unlink(arg_image) < 0)
5886                         log_warning_errno(errno, "Can't remove image file '%s', ignoring: %m", arg_image);
5887         }
5888 
5889         if (remove_tmprootdir) {
5890                 if (rmdir(tmprootdir) < 0)
5891                         log_debug_errno(errno, "Can't remove temporary root directory '%s', ignoring: %m", tmprootdir);
5892         }
5893 
5894         if (arg_machine) {
5895                 const char *p;
5896 
5897                 p = strjoina("/run/systemd/nspawn/propagate/", arg_machine);
5898                 (void) rm_rf(p, REMOVE_ROOT);
5899         }
5900 
5901         expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET,  &expose_args.address4);
5902         expose_port_flush(&fw_ctx, arg_expose_ports, AF_INET6, &expose_args.address6);
5903 
5904         if (veth_created)
5905                 (void) remove_veth_links(veth_name, arg_network_veth_extra);
5906         (void) remove_bridge(arg_network_zone);
5907 
5908         custom_mount_free_all(arg_custom_mounts, arg_n_custom_mounts);
5909         expose_port_free_all(arg_expose_ports);
5910         rlimit_free_all(arg_rlimit);
5911         device_node_array_free(arg_extra_nodes, arg_n_extra_nodes);
5912         credential_free_all(arg_credentials, arg_n_credentials);
5913 
5914         if (r < 0)
5915                 return r;
5916 
5917         return ret;
5918 }
5919 
/* Program entry point: wraps run() using the main-function helper macro.
 *
 * run() returns a negative value when 'r' is negative at the end of its cleanup path, and
 * otherwise returns 'ret', the status value filled in by run_container().
 *
 * NOTE(review): the macro is defined outside this file (presumably in "main-func.h"). Judging
 * by its name, it treats positive return values of run() as failure exit codes rather than as
 * success -- confirm against the macro definition. */
5920 DEFINE_MAIN_FUNCTION_WITH_POSITIVE_FAILURE(run);
5921