1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2 
3 #include <sys/mount.h>
4 #include <linux/magic.h>
5 
6 #include "alloc-util.h"
7 #include "chase-symlinks.h"
8 #include "escape.h"
9 #include "fd-util.h"
10 #include "format-util.h"
11 #include "fs-util.h"
12 #include "label.h"
13 #include "mkdir-label.h"
14 #include "mount-util.h"
15 #include "mountpoint-util.h"
16 #include "nspawn-mount.h"
17 #include "parse-util.h"
18 #include "path-util.h"
19 #include "rm-rf.h"
20 #include "set.h"
21 #include "sort-util.h"
22 #include "stat-util.h"
23 #include "string-util.h"
24 #include "strv.h"
25 #include "tmpfile-util.h"
26 #include "user-util.h"
27 
/* Appends one new entry of type 't' to the array *l, which currently holds *n entries. All fields of the new
 * entry other than the type are zero-initialized. Returns a pointer to the new entry, or NULL if the array
 * could not be enlarged, in which case *l and *n are left unmodified. */
CustomMount* custom_mount_add(CustomMount **l, size_t *n, CustomMountType t) {
        CustomMount *grown, *slot;

        assert(l);
        assert(n);
        assert(t >= 0);
        assert(t < _CUSTOM_MOUNT_TYPE_MAX);

        grown = reallocarray(*l, *n + 1, sizeof(CustomMount));
        if (!grown)
                return NULL;

        /* The fresh element sits right behind what used to be the last one */
        slot = grown + *n;
        *slot = (CustomMount) {
                .type = t
        };

        *l = grown;
        *n += 1;

        return slot;
}
50 
/* Releases an array of 'n' custom mount entries, including all strings they own. Work directories and
 * temporary directories recorded in the entries are removed from disk (best-effort) before their paths are
 * freed. Finally the array itself is freed. */
void custom_mount_free_all(CustomMount *l, size_t n) {
        size_t i = 0;

        while (i < n) {
                CustomMount *entry = &l[i++];

                /* Directories we own are deleted from disk first; failures are deliberately ignored */
                if (entry->work_dir) {
                        (void) rm_rf(entry->work_dir, REMOVE_ROOT|REMOVE_PHYSICAL);
                        free(entry->work_dir);
                }

                if (entry->rm_rf_tmpdir) {
                        (void) rm_rf(entry->rm_rf_tmpdir, REMOVE_ROOT|REMOVE_PHYSICAL);
                        free(entry->rm_rf_tmpdir);
                }

                /* Everything else is plain heap memory */
                free(entry->source);
                free(entry->destination);
                free(entry->options);
                free(entry->type_argument);
                strv_free(entry->lower);
        }

        free(l);
}
75 
custom_mount_compare(const CustomMount * a,const CustomMount * b)76 static int custom_mount_compare(const CustomMount *a, const CustomMount *b) {
77         int r;
78 
79         r = path_compare(a->destination, b->destination);
80         if (r != 0)
81                 return r;
82 
83         return CMP(a->type, b->type);
84 }
85 
/* A source path is acceptable if it is absolute, optionally prefixed with a single "+". */
static bool source_path_is_valid(const char *p) {
        assert(p);

        return path_is_absolute(p[0] == '+' ? p + 1 : p);
}
94 
/* Returns a newly allocated copy of 'source'. If 'source' starts with "+", the remainder is joined onto
 * 'dest' instead. Returns NULL if 'source' is NULL or if the allocation fails. */
static char *resolve_source_path(const char *dest, const char *source) {
        if (!source)
                return NULL;

        return source[0] == '+' ? path_join(dest, source + 1) : strdup(source);
}
104 
/* Creates a fresh temporary directory below /var/tmp plus a "src" subdirectory inside it. The temporary
 * directory is recorded in m->rm_rf_tmpdir and the subdirectory in m->source. Returns 0 on success, or a
 * negative error after logging. If the temporary directory itself cannot be created, m->rm_rf_tmpdir is
 * reset to NULL again. */
static int allocate_temporary_source(CustomMount *m) {
        assert(m);
        assert(!m->source);
        assert(!m->rm_rf_tmpdir);

        /* mkdtemp() fills in the template in place, hence it needs a writable heap copy */
        m->rm_rf_tmpdir = strdup("/var/tmp/nspawn-temp-XXXXXX");
        if (!m->rm_rf_tmpdir)
                return log_oom();

        if (mkdtemp(m->rm_rf_tmpdir) == NULL) {
                m->rm_rf_tmpdir = mfree(m->rm_rf_tmpdir);
                return log_error_errno(errno, "Failed to acquire temporary directory: %m");
        }

        m->source = path_join(m->rm_rf_tmpdir, "src");
        if (!m->source)
                return log_oom();

        if (mkdir(m->source, 0755) >= 0)
                return 0;

        return log_error_errno(errno, "Failed to create %s: %m", m->source);
}
128 
/* Sorts the 'n' custom mounts in 'l' and prepares each of them: "+"-prefixed paths are resolved against
 * 'dest', and missing source/work directories are allocated. Returns 0 on success, or a negative error
 * after logging. */
int custom_mount_prepare_all(const char *dest, CustomMount *l, size_t n) {
        int r;

        /* Prepare all custom mounts. This makes sure we know all temporary directories. This is called in
         * the parent process, so that we know the temporary directories to remove on exit before we fork
         * off the children. */

        assert(l || n == 0);

        /* Bring the mounts into a defined order first */
        typesafe_qsort(l, n, custom_mount_compare);

        for (size_t i = 0; i < n; i++) {
                CustomMount *entry = &l[i];

                /* /proc is mounted in the inner child, i.e. once CLONE_NEWPID was acquired. All other mounts
                 * are established in the outer child already, i.e. before CLONE_NEWPID and in particular
                 * before CLONE_NEWUSER. Hence custom mounts below /proc have to be done in the inner child
                 * too, which is what we record here. */
                entry->in_userns = path_startswith(entry->destination, "/proc");

                switch (entry->type) {

                case CUSTOM_MOUNT_BIND:
                        if (!entry->source) {
                                /* No source given: back the mount by a throw-away directory in /var/tmp */
                                r = allocate_temporary_source(entry);
                                if (r < 0)
                                        return r;
                        } else {
                                char *resolved;

                                resolved = resolve_source_path(dest, entry->source);
                                if (!resolved)
                                        return log_oom();

                                free_and_replace(entry->source, resolved);
                        }

                        break;

                case CUSTOM_MOUNT_OVERLAY:
                        /* First the lower layers ... */
                        STRV_FOREACH(layer, entry->lower) {
                                char *resolved;

                                resolved = resolve_source_path(dest, *layer);
                                if (!resolved)
                                        return log_oom();

                                free_and_replace(*layer, resolved);
                        }

                        /* ... then the upper layer, allocating a temporary one if none was configured ... */
                        if (!entry->source) {
                                r = allocate_temporary_source(entry);
                                if (r < 0)
                                        return r;
                        } else {
                                char *resolved;

                                resolved = resolve_source_path(dest, entry->source);
                                if (!resolved)
                                        return log_oom();

                                free_and_replace(entry->source, resolved);
                        }

                        /* ... and finally the work directory, whose name is derived from the upper layer's
                         * path unless one was configured explicitly. */
                        if (!entry->work_dir) {
                                r = tempfn_random(entry->source, NULL, &entry->work_dir);
                                if (r < 0)
                                        return log_error_errno(r, "Failed to acquire working directory: %m");
                        } else {
                                char *resolved;

                                resolved = resolve_source_path(dest, entry->work_dir);
                                if (!resolved)
                                        return log_oom();

                                free_and_replace(entry->work_dir, resolved);
                        }

                        /* Best-effort: a failure to create the directory is ignored here */
                        (void) mkdir_label(entry->work_dir, 0700);
                        break;

                default:
                        /* Other mount types need no preparation */
                        break;
                }
        }

        return 0;
}
213 
/* Parses a bind mount specification of the form "SOURCE[:DESTINATION[:OPTIONS]]" and appends a matching
 * CUSTOM_MOUNT_BIND entry to the array *l. If only one field is given it is used as both source and
 * destination (the destination without a leading "+"). An empty source field leaves the source unset.
 * Returns 0 on success, -EINVAL on malformed input, -ENOMEM on allocation failure, or the error returned by
 * the word extraction. */
int bind_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
        _cleanup_free_ char *src = NULL, *dst = NULL, *options = NULL;
        const char *rest = s;
        CustomMount *entry;
        int n_words;

        assert(l);
        assert(n);

        n_words = extract_many_words(&rest, ":", EXTRACT_DONT_COALESCE_SEPARATORS, &src, &dst, NULL);
        if (n_words < 0)
                return n_words;

        switch (n_words) {

        case 0:
                return -EINVAL;

        case 1:
                /* Only one path given: it doubles as destination, minus the "+" prefix */
                dst = strdup(src[0] == '+' ? src + 1 : src);
                if (!dst)
                        return -ENOMEM;
                break;

        case 2:
                /* Whatever follows the second field is taken verbatim as option string */
                if (!isempty(rest)) {
                        options = strdup(rest);
                        if (!options)
                                return -ENOMEM;
                }
                break;

        default:
                break;
        }

        if (!isempty(src)) {
                if (!source_path_is_valid(src))
                        return -EINVAL;
        } else
                src = mfree(src);

        if (!path_is_absolute(dst))
                return -EINVAL;

        entry = custom_mount_add(l, n, CUSTOM_MOUNT_BIND);
        if (!entry)
                return -ENOMEM;

        entry->read_only = read_only;
        entry->source = TAKE_PTR(src);
        entry->destination = TAKE_PTR(dst);
        entry->options = TAKE_PTR(options);

        return 0;
}
258 
/* Parses a tmpfs mount specification of the form "PATH[:OPTIONS]" and appends a matching CUSTOM_MOUNT_TMPFS
 * entry to the array *l. If no options are given "mode=0755" is used. Returns 0 on success, -EINVAL if the
 * path is missing or not absolute, -ENOMEM on allocation failure, or the error returned by the word
 * extraction. */
int tmpfs_mount_parse(CustomMount **l, size_t *n, const char *s) {
        _cleanup_free_ char *where = NULL, *options = NULL;
        const char *rest = s;
        CustomMount *entry;
        int r;

        assert(l);
        assert(n);
        assert(s);

        r = extract_first_word(&rest, &where, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
        if (r < 0)
                return r;
        if (r == 0)
                return -EINVAL;

        /* Everything after the first separator is the option string */
        options = strdup(isempty(rest) ? "mode=0755" : rest);
        if (!options)
                return -ENOMEM;

        if (!path_is_absolute(where))
                return -EINVAL;

        entry = custom_mount_add(l, n, CUSTOM_MOUNT_TMPFS);
        if (!entry)
                return -ENOMEM;

        entry->options = TAKE_PTR(options);
        entry->destination = TAKE_PTR(where);

        return 0;
}
294 
/* Parses a colon-separated overlay mount specification and appends a matching CUSTOM_MOUNT_OVERLAY entry to
 * the array *l. With two fields the format is "LOWER:UPPER", and the upper directory (without "+" prefix)
 * is also the destination. With more fields the format is "LOWER...:UPPER:DESTINATION", where an empty
 * UPPER leaves the upper directory unset. Returns 0 on success, -EADDRNOTAVAIL if fewer than two fields are
 * given, -EINVAL on invalid paths, -ENOMEM on allocation failure, or the error returned by the split. */
int overlay_mount_parse(CustomMount **l, size_t *n, const char *s, bool read_only) {
        _cleanup_free_ char *upper_dir = NULL, *where = NULL;
        _cleanup_strv_free_ char **layers = NULL;
        CustomMount *entry;
        int n_fields;

        n_fields = strv_split_full(&layers, s, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
        if (n_fields < 0)
                return n_fields;
        if (n_fields < 2)
                return -EADDRNOTAVAIL;

        if (n_fields > 2) {
                /* More than two fields: the last one is the destination, the one before it the "upper"
                 * directory, and everything in front of that the "lower" directories.
                 *
                 * Note the ownership hand-over: stealing the second-to-last element leaves a NULL in its
                 * slot, which terminates the string list right there. The last element thus is no longer
                 * part of the list and is owned by 'where' from here on. */
                where = layers[n_fields - 1];
                upper_dir = TAKE_PTR(layers[n_fields - 2]);

                /* Only the remaining lower directories are iterated here */
                STRV_FOREACH(i, layers)
                        if (!source_path_is_valid(*i))
                                return -EINVAL;

                /* An unspecified upper directory is left unset */
                if (!isempty(upper_dir)) {
                        if (!source_path_is_valid(upper_dir))
                                return -EINVAL;
                } else
                        upper_dir = mfree(upper_dir);

                if (!path_is_absolute(where))
                        return -EINVAL;
        } else {
                /* Exactly two fields: the first is the lower, the second the upper directory. The
                 * destination mount point is the same as the upper directory. */
                if (!(source_path_is_valid(layers[0]) &&
                      source_path_is_valid(layers[1])))
                        return -EINVAL;

                upper_dir = TAKE_PTR(layers[1]);

                /* The destination never carries the "+" prefix */
                where = strdup(upper_dir[0] == '+' ? upper_dir + 1 : upper_dir);
                if (!where)
                        return -ENOMEM;
        }

        entry = custom_mount_add(l, n, CUSTOM_MOUNT_OVERLAY);
        if (!entry)
                return -ENOMEM;

        entry->read_only = read_only;
        entry->lower = TAKE_PTR(layers);
        entry->source = TAKE_PTR(upper_dir);
        entry->destination = TAKE_PTR(where);

        return 0;
}
352 
/* Appends a CUSTOM_MOUNT_INACCESSIBLE entry for the absolute path 's' to the array *l. Returns 0 on success,
 * -EINVAL if the path is not absolute, or -ENOMEM on allocation failure. */
int inaccessible_mount_parse(CustomMount **l, size_t *n, const char *s) {
        _cleanup_free_ char *where = NULL;
        CustomMount *entry;

        assert(l);
        assert(n);
        assert(s);

        if (!path_is_absolute(s))
                return -EINVAL;

        where = strdup(s);
        if (!where)
                return -ENOMEM;

        entry = custom_mount_add(l, n, CUSTOM_MOUNT_INACCESSIBLE);
        if (!entry)
                return -ENOMEM;

        entry->destination = TAKE_PTR(where);

        return 0;
}
375 
tmpfs_patch_options(const char * options,uid_t uid_shift,const char * selinux_apifs_context,char ** ret)376 int tmpfs_patch_options(
377                 const char *options,
378                 uid_t uid_shift,
379                 const char *selinux_apifs_context,
380                 char **ret) {
381 
382         _cleanup_free_ char *buf = NULL;
383 
384         assert(ret);
385 
386         if (options) {
387                 buf = strdup(options);
388                 if (!buf)
389                         return -ENOMEM;
390         }
391 
392         if (uid_shift != UID_INVALID)
393                 if (strextendf_with_separator(&buf, ",", "uid=" UID_FMT ",gid=" UID_FMT, uid_shift, uid_shift) < 0)
394                         return -ENOMEM;
395 
396 #if HAVE_SELINUX
397         if (selinux_apifs_context)
398                 if (strextendf_with_separator(&buf, ",", "context=\"%s\"", selinux_apifs_context) < 0)
399                         return -ENOMEM;
400 #endif
401 
402         *ret = TAKE_PTR(buf);
403         return !!*ret;
404 }
405 
/* Sets up <dest>/sys unless it already is a sysfs mount. A full sysfs instance is mounted temporarily on
 * <dest>/sys/full, a fixed set of its subdirectories is bind mounted into <dest>/sys, and the temporary
 * instance is unmounted and its mount point removed again. Finally, <dest>/sys itself is remounted with the
 * restricted flag set. MS_RDONLY is added to all these mounts if MOUNT_APPLY_APIVFS_RO is set in
 * 'mount_settings'. Returns 0 if nothing had to be done, otherwise the result of the final remount, or a
 * negative error after logging. */
int mount_sysfs(const char *dest, MountSettingsMask mount_settings) {
        const unsigned long ro_flag = FLAGS_SET(mount_settings, MOUNT_APPLY_APIVFS_RO) ? MS_RDONLY : 0;
        const char *sys_dir, *staging, *cgroup_dir;
        int r;

        sys_dir = prefix_roota(dest, "/sys");

        r = path_is_fs_type(sys_dir, SYSFS_MAGIC);
        if (r < 0)
                return log_error_errno(r, "Failed to determine filesystem type of %s: %m", sys_dir);
        if (r > 0)
                /* /sys might already be mounted as sysfs by the outer child in the !netns case. In this
                 * case, it's all good. Don't touch it because we don't have the right to do so, see
                 * https://github.com/systemd/systemd/issues/1555. */
                return 0;

        /* Temporary place for the complete sysfs instance */
        staging = prefix_roota(sys_dir, "/full");
        (void) mkdir(staging, 0755);

        r = mount_nofollow_verbose(LOG_ERR, "sysfs", staging, "sysfs",
                                   MS_NOSUID|MS_NOEXEC|MS_NODEV|ro_flag, NULL);
        if (r < 0)
                return r;

        /* Expose only the selected subtrees: bind mount each one, then remount it with the restricted
         * flags */
        FOREACH_STRING(name, "block", "bus", "class", "dev", "devices", "kernel") {
                _cleanup_free_ char *src = NULL, *dst = NULL;

                src = path_join(staging, name);
                if (!src)
                        return log_oom();

                dst = path_join(sys_dir, name);
                if (!dst)
                        return log_oom();

                (void) mkdir(dst, 0755);

                r = mount_nofollow_verbose(LOG_ERR, src, dst, NULL, MS_BIND, NULL);
                if (r < 0)
                        return r;

                r = mount_nofollow_verbose(LOG_ERR, NULL, dst, NULL,
                                           MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|ro_flag, NULL);
                if (r < 0)
                        return r;
        }

        /* The complete instance is not needed anymore, get rid of it and of its mount point */
        r = umount_verbose(LOG_ERR, staging, UMOUNT_NOFOLLOW);
        if (r < 0)
                return r;

        if (rmdir(staging) < 0)
                return log_error_errno(errno, "Failed to remove %s: %m", staging);

        /* Create mountpoint for cgroups. Otherwise we are not allowed since we remount /sys read-only. */
        cgroup_dir = prefix_roota(sys_dir, "/fs/cgroup");
        (void) mkdir_p(cgroup_dir, 0755);

        return mount_nofollow_verbose(LOG_ERR, NULL, sys_dir, NULL,
                                      MS_BIND|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT|ro_flag, NULL);
}
473 
/* Establishes the built-in mounts listed in mount_table below the root directory 'dest'.
 *
 * Each invocation handles one of two phases: only table entries whose MOUNT_IN_USERNS flag matches the
 * MOUNT_IN_USERNS bit of 'mount_settings' are processed. Entries tagged MOUNT_APPLY_APIVFS_NETNS,
 * MOUNT_APPLY_APIVFS_RO or MOUNT_APPLY_TMPFS_TMP are additionally skipped unless the same bit is set in
 * 'mount_settings'. Entries are processed strictly in table order.
 *
 * 'uid_shift' is used for the ownership of created mount points (outer phase with MOUNT_USE_USERNS only)
 * and is passed on to tmpfs_patch_options() for tmpfs entries; 'selinux_apifs_context' is passed on there as
 * well.
 *
 * Returns 0 on success or a negative error. Failures of entries without MOUNT_FATAL do not abort the
 * function. */
int mount_all(const char *dest,
              MountSettingsMask mount_settings,
              uid_t uid_shift,
              const char *selinux_apifs_context) {

/* Expands to two table entries: bind mount the "inaccessible" regular file over 'path', then remount that
 * bind mount with MS_RDONLY and the other restricted flags. */
#define PROC_INACCESSIBLE_REG(path)                                     \
        { "/run/systemd/inaccessible/reg", (path), NULL, NULL, MS_BIND, \
          MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
        { NULL, (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
          MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */

/* Expands to two table entries: bind mount 'path' onto itself, then remount that bind mount with MS_RDONLY
 * and the other restricted flags. */
#define PROC_READ_ONLY(path)                                            \
        { (path), (path), NULL, NULL, MS_BIND,                          \
          MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO }, /* Bind mount first ... */ \
        { NULL,   (path), NULL, NULL, MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT, \
          MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO } /* Then, make it r/o */

        /* One row of the mount table */
        typedef struct MountPoint {
                const char *what;                 /* mount source; NULL for remount-only entries */
                const char *where;                /* mount point, resolved relative to 'dest' */
                const char *type;                 /* file system type, may be NULL */
                const char *options;              /* mount option string, may be NULL */
                unsigned long flags;              /* MS_xyz mount flags */
                MountSettingsMask mount_settings; /* MOUNT_xyz flags controlling how/when the row applies */
        } MountPoint;

        static const MountPoint mount_table[] = {
                /* First we list inner child mounts (i.e. mounts applied *after* entering user namespacing) */
                { "proc",            "/proc",           "proc",  NULL,        MS_NOSUID|MS_NOEXEC|MS_NODEV,
                  MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_MKDIR|MOUNT_FOLLOW_SYMLINKS }, /* we follow symlinks here since not following them requires /proc/ already being mounted, which we don't have here. */

                { "/proc/sys",       "/proc/sys",       NULL,    NULL,        MS_BIND,
                  MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO },                          /* Bind mount first ... */

                { "/proc/sys/net",   "/proc/sys/net",   NULL,    NULL,        MS_BIND,
                  MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO|MOUNT_APPLY_APIVFS_NETNS }, /* (except for this) */

                { NULL,              "/proc/sys",       NULL,    NULL,        MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
                  MOUNT_FATAL|MOUNT_IN_USERNS|MOUNT_APPLY_APIVFS_RO },                          /* ... then, make it r/o */

                /* Make these files inaccessible to container payloads: they potentially leak information about kernel
                 * internals or the host's execution environment to the container */
                PROC_INACCESSIBLE_REG("/proc/kallsyms"),
                PROC_INACCESSIBLE_REG("/proc/kcore"),
                PROC_INACCESSIBLE_REG("/proc/keys"),
                PROC_INACCESSIBLE_REG("/proc/sysrq-trigger"),
                PROC_INACCESSIBLE_REG("/proc/timer_list"),

                /* Make these directories read-only to container payloads: they show hardware information, and in some
                 * cases contain tunables the container really shouldn't have access to. */
                PROC_READ_ONLY("/proc/acpi"),
                PROC_READ_ONLY("/proc/apm"),
                PROC_READ_ONLY("/proc/asound"),
                PROC_READ_ONLY("/proc/bus"),
                PROC_READ_ONLY("/proc/fs"),
                PROC_READ_ONLY("/proc/irq"),
                PROC_READ_ONLY("/proc/scsi"),

                { "mqueue",                 "/dev/mqueue",                  "mqueue", NULL,                            MS_NOSUID|MS_NOEXEC|MS_NODEV,
                  MOUNT_IN_USERNS|MOUNT_MKDIR },

                /* Then we list outer child mounts (i.e. mounts applied *before* entering user namespacing) */
                { "tmpfs",                  "/tmp",                         "tmpfs", "mode=1777" NESTED_TMPFS_LIMITS,  MS_NOSUID|MS_NODEV|MS_STRICTATIME,
                  MOUNT_FATAL|MOUNT_APPLY_TMPFS_TMP|MOUNT_MKDIR },
                { "tmpfs",                  "/sys",                         "tmpfs", "mode=555" TMPFS_LIMITS_SYS,      MS_NOSUID|MS_NOEXEC|MS_NODEV,
                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_NETNS|MOUNT_MKDIR },
                { "sysfs",                  "/sys",                         "sysfs", NULL,                             MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV,
                  MOUNT_FATAL|MOUNT_APPLY_APIVFS_RO|MOUNT_MKDIR },    /* skipped if above was mounted */
                { "sysfs",                  "/sys",                         "sysfs", NULL,                             MS_NOSUID|MS_NOEXEC|MS_NODEV,
                  MOUNT_FATAL|MOUNT_MKDIR },                          /* skipped if above was mounted */
                { "tmpfs",                  "/dev",                         "tmpfs", "mode=755" TMPFS_LIMITS_DEV,      MS_NOSUID|MS_STRICTATIME,
                  MOUNT_FATAL|MOUNT_MKDIR },
                { "tmpfs",                  "/dev/shm",                     "tmpfs", "mode=1777" NESTED_TMPFS_LIMITS,  MS_NOSUID|MS_NODEV|MS_STRICTATIME,
                  MOUNT_FATAL|MOUNT_MKDIR },
                { "tmpfs",                  "/run",                         "tmpfs", "mode=755" TMPFS_LIMITS_RUN,      MS_NOSUID|MS_NODEV|MS_STRICTATIME,
                  MOUNT_FATAL|MOUNT_MKDIR },
                { "/run/host",              "/run/host",                    NULL,    NULL,                             MS_BIND,
                  MOUNT_FATAL|MOUNT_MKDIR|MOUNT_PREFIX_ROOT }, /* Prepare this so that we can make it read-only when we are done */
                { "/etc/os-release",        "/run/host/os-release",         NULL,    NULL,                             MS_BIND,
                  MOUNT_TOUCH }, /* As per kernel interface requirements, bind mount first (creating mount points) and make read-only later */
                { "/usr/lib/os-release",    "/run/host/os-release",         NULL,    NULL,                             MS_BIND,
                  MOUNT_FATAL }, /* If /etc/os-release doesn't exist use the version in /usr/lib as fallback */
                { NULL,                     "/run/host/os-release",         NULL,    NULL,                             MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
                  MOUNT_FATAL },
                { NULL,                     "/run/host",                    NULL,    NULL,                             MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
                  MOUNT_FATAL|MOUNT_IN_USERNS },
#if HAVE_SELINUX
                { "/sys/fs/selinux",        "/sys/fs/selinux",              NULL,    NULL,                             MS_BIND,
                  MOUNT_MKDIR },  /* Bind mount first (mkdir/chown the mount point in case /sys/ is mounted as minimal skeleton tmpfs) */
                { NULL,                     "/sys/fs/selinux",              NULL,    NULL,                             MS_BIND|MS_RDONLY|MS_NOSUID|MS_NOEXEC|MS_NODEV|MS_REMOUNT,
                  0 },            /* Then, make it r/o (don't mkdir/chown the mount point here, the previous entry already did that) */
#endif
        };

        /* Decode the caller's settings once, up front */
        bool use_userns = FLAGS_SET(mount_settings, MOUNT_USE_USERNS);
        bool netns = FLAGS_SET(mount_settings, MOUNT_APPLY_APIVFS_NETNS);
        bool ro = FLAGS_SET(mount_settings, MOUNT_APPLY_APIVFS_RO);
        bool in_userns = FLAGS_SET(mount_settings, MOUNT_IN_USERNS);
        bool tmpfs_tmp = FLAGS_SET(mount_settings, MOUNT_APPLY_TMPFS_TMP);
        int r;

        for (size_t k = 0; k < ELEMENTSOF(mount_table); k++) {
                _cleanup_free_ char *where = NULL, *options = NULL, *prefixed = NULL;
                bool fatal = FLAGS_SET(mount_table[k].mount_settings, MOUNT_FATAL);
                const char *o;

                /* Only handle the entries that belong to the phase we are called for */
                if (in_userns != FLAGS_SET(mount_table[k].mount_settings, MOUNT_IN_USERNS))
                        continue;

                /* Skip entries that are conditioned on a setting the caller did not request */
                if (!netns && FLAGS_SET(mount_table[k].mount_settings, MOUNT_APPLY_APIVFS_NETNS))
                        continue;

                if (!ro && FLAGS_SET(mount_table[k].mount_settings, MOUNT_APPLY_APIVFS_RO))
                        continue;

                if (!tmpfs_tmp && FLAGS_SET(mount_table[k].mount_settings, MOUNT_APPLY_TMPFS_TMP))
                        continue;

                /* Resolve the mount point below 'dest'; it is fine if it does not exist yet */
                r = chase_symlinks(mount_table[k].where, dest, CHASE_NONEXISTENT|CHASE_PREFIX_ROOT, &where, NULL);
                if (r < 0)
                        return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].where);

                /* Skip this entry if it is not a remount. */
                if (mount_table[k].what) {
                        r = path_is_mount_point(where, NULL, 0);
                        if (r < 0 && r != -ENOENT)
                                return log_error_errno(r, "Failed to detect whether %s is a mount point: %m", where);
                        if (r > 0)
                                continue;
                }

                if ((mount_table[k].mount_settings & (MOUNT_MKDIR|MOUNT_TOUCH)) != 0) {
                        /* Created directories get an explicit owner only in the outer phase with user
                         * namespacing enabled */
                        uid_t u = (use_userns && !in_userns) ? uid_shift : UID_INVALID;

                        /* For MOUNT_TOUCH only the parent directories are created here; the mount point
                         * itself is created as a file further down */
                        if (FLAGS_SET(mount_table[k].mount_settings, MOUNT_TOUCH))
                                r = mkdir_parents_safe(dest, where, 0755, u, u, 0);
                        else
                                r = mkdir_p_safe(dest, where, 0755, u, u, 0);
                        if (r < 0 && r != -EEXIST) {
                                if (fatal && r != -EROFS)
                                        return log_error_errno(r, "Failed to create directory %s: %m", where);

                                log_debug_errno(r, "Failed to create directory %s: %m", where);

                                /* If we failed mkdir() or chown() due to the root directory being read only,
                                 * attempt to mount this fs anyway and let mount_verbose log any errors */
                                if (r != -EROFS)
                                        continue;
                        }
                }

                if (FLAGS_SET(mount_table[k].mount_settings, MOUNT_TOUCH)) {
                        /* Same error policy as for the directory creation above */
                        r = touch(where);
                        if (r < 0 && r != -EEXIST) {
                                if (fatal && r != -EROFS)
                                        return log_error_errno(r, "Failed to create file %s: %m", where);

                                log_debug_errno(r, "Failed to create file %s: %m", where);
                                if (r != -EROFS)
                                        continue;
                        }
                }

                /* For tmpfs mounts extend the static option string; inside the user namespace UID 0 is
                 * passed instead of the shift */
                o = mount_table[k].options;
                if (streq_ptr(mount_table[k].type, "tmpfs")) {
                        r = tmpfs_patch_options(o, in_userns ? 0 : uid_shift, selinux_apifs_context, &options);
                        if (r < 0)
                                return log_oom();
                        if (r > 0)
                                o = options;
                }

                if (FLAGS_SET(mount_table[k].mount_settings, MOUNT_PREFIX_ROOT)) {
                        /* Optionally prefix the mount source with the root dir. This is useful in bind
                         * mounts to be created within the container image before we transition into it. Note
                         * that MOUNT_IN_USERNS is run after we transitioned hence prefixing is not necessary
                         * for those. */
                        r = chase_symlinks(mount_table[k].what, dest, CHASE_PREFIX_ROOT, &prefixed, NULL);
                        if (r < 0)
                                return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, mount_table[k].what);
                }

                /* Failures of non-fatal entries are only logged at debug level and otherwise ignored */
                r = mount_verbose_full(
                                fatal ? LOG_ERR : LOG_DEBUG,
                                prefixed ?: mount_table[k].what,
                                where,
                                mount_table[k].type,
                                mount_table[k].flags,
                                o,
                                FLAGS_SET(mount_table[k].mount_settings, MOUNT_FOLLOW_SYMLINKS));
                if (r < 0 && fatal)
                        return r;
        }

        return 0;
}
670 
parse_mount_bind_options(const char * options,unsigned long * mount_flags,char ** mount_opts,bool * idmapped)671 static int parse_mount_bind_options(const char *options, unsigned long *mount_flags, char **mount_opts, bool *idmapped) {
672         unsigned long flags = *mount_flags;
673         char *opts = NULL;
674         bool flag_idmapped = *idmapped;
675         int r;
676 
677         assert(options);
678 
679         for (;;) {
680                 _cleanup_free_ char *word = NULL;
681 
682                 r = extract_first_word(&options, &word, ",", 0);
683                 if (r < 0)
684                         return log_error_errno(r, "Failed to extract mount option: %m");
685                 if (r == 0)
686                         break;
687 
688                 if (streq(word, "rbind"))
689                         flags |= MS_REC;
690                 else if (streq(word, "norbind"))
691                         flags &= ~MS_REC;
692                 else if (streq(word, "idmap"))
693                         flag_idmapped = true;
694                 else if (streq(word, "noidmap"))
695                         flag_idmapped = false;
696                 else
697                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
698                                                "Invalid bind mount option: %s", word);
699         }
700 
701         *mount_flags = flags;
702         *idmapped = flag_idmapped;
703         /* in the future mount_opts will hold string options for mount(2) */
704         *mount_opts = opts;
705 
706         return 0;
707 }
708 
/* Establishes the bind mount described by 'm' below the container root 'dest'.
 *
 * The option string of the mount (if there is one) may toggle MS_REC and request ID mapping. A
 * missing mount point is created so that it matches the kind of the source (directory vs. anything
 * else); an already existing mount point has to match that kind. Once mounted, the mount is
 * optionally turned read-only and optionally ID-mapped.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int mount_bind(const char *dest, CustomMount *m, uid_t uid_shift, uid_t uid_range) {
        _cleanup_free_ char *extra_opts = NULL, *target = NULL;
        unsigned long flags = MS_BIND | MS_REC;
        bool want_idmap = false, src_is_dir, dst_is_dir;
        struct stat src_st, dst_st;
        int r;

        assert(dest);
        assert(m);

        if (m->options) {
                r = parse_mount_bind_options(m->options, &flags, &extra_opts, &want_idmap);
                if (r < 0)
                        return r;
        }

        /* A source that lives in a temporary directory is handed over to the container's root UID,
         * as it would otherwise always appear to be owned by "nobody" when user namespacing is on. */
        if (m->rm_rf_tmpdir)
                if (chown(m->source, uid_shift, uid_shift) < 0)
                        return log_error_errno(errno, "Failed to chown %s: %m", m->source);

        if (stat(m->source, &src_st) < 0)
                return log_error_errno(errno, "Failed to stat %s: %m", m->source);

        src_is_dir = S_ISDIR(src_st.st_mode);

        r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &target, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);

        if (r == 0) {
                /* The mount point is missing, hence create it, including its parents. */
                r = mkdir_parents_label(target, 0755);
                if (r < 0)
                        return log_error_errno(r, "Failed to make parents of %s: %m", target);

                /* Directories go on directories. Everything else (regular file, fifo, socket, char or
                 * block device) can be mounted on any non-directory, so a plain file will do. */
                r = src_is_dir ? mkdir_label(target, 0755) : touch(target);
                if (r < 0)
                        return log_error_errno(r, "Failed to create mount point %s: %m", target);
        } else {
                /* The mount point is already there, make sure its kind fits the source. */
                if (stat(target, &dst_st) < 0)
                        return log_error_errno(errno, "Failed to stat %s: %m", target);

                dst_is_dir = S_ISDIR(dst_st.st_mode);

                if (src_is_dir && !dst_is_dir)
                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
                                               "Cannot bind mount directory %s on file %s.",
                                               m->source, target);

                if (dst_is_dir && !src_is_dir)
                        return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
                                               "Cannot bind mount file %s on directory %s.",
                                               m->source, target);
        }

        r = mount_nofollow_verbose(LOG_ERR, m->source, target, NULL, flags, extra_opts);
        if (r < 0)
                return r;

        if (m->read_only) {
                r = bind_remount_recursive(target, MS_RDONLY, MS_RDONLY, NULL);
                if (r < 0)
                        return log_error_errno(r, "Read-only bind mount failed: %m");
        }

        if (!want_idmap)
                return 0;

        r = remount_idmap(target, uid_shift, uid_range, REMOUNT_IDMAP_HOST_ROOT);
        if (r < 0)
                return log_error_errno(r, "Failed to map ids for bind mount %s: %m", target);

        return 0;
}
786 
/* Mounts a fresh tmpfs on the destination of 'm' below the container root 'dest', creating the mount
 * point first if it does not exist. The option string of 'm' is passed through
 * tmpfs_patch_options() together with the UID shift and the SELinux context; if that yields a
 * patched string it is used instead of the original one.
 *
 * Returns the result of the mount call, or a negative errno-style value on earlier failure. */
static int mount_tmpfs(const char *dest, CustomMount *m, uid_t uid_shift, const char *selinux_apifs_context) {
        _cleanup_free_ char *patched = NULL, *target = NULL;
        const char *effective_options;
        uid_t owner;
        int r;

        assert(dest);
        assert(m);

        r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &target, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
        if (r == 0) {
                /* Mount point is missing, create it along with its parents. */
                r = mkdir_p_label(target, 0755);
                if (r < 0)
                        return log_error_errno(r, "Creating mount point for tmpfs %s failed: %m", target);
        }

        /* A shift of zero is handed over as UID_INVALID. */
        owner = uid_shift == 0 ? UID_INVALID : uid_shift;

        r = tmpfs_patch_options(m->options, owner, selinux_apifs_context, &patched);
        if (r < 0)
                return log_oom();

        if (r > 0)
                effective_options = patched;
        else
                effective_options = m->options;

        return mount_nofollow_verbose(LOG_ERR, "tmpfs", target, "tmpfs", MS_NODEV|MS_STRICTATIME, effective_options);
}
811 
joined_and_escaped_lower_dirs(char ** lower)812 static char *joined_and_escaped_lower_dirs(char **lower) {
813         _cleanup_strv_free_ char **sv = NULL;
814 
815         sv = strv_copy(lower);
816         if (!sv)
817                 return NULL;
818 
819         strv_reverse(sv);
820 
821         if (!strv_shell_escape(sv, ",:"))
822                 return NULL;
823 
824         return strv_join(sv, ":");
825 }
826 
/* Mounts an overlayfs on the destination of 'm' below the container root 'dest'.
 *
 * In read-only mode the source of 'm' plus the lower directories together form "lowerdir=", and the
 * mount is done with MS_RDONLY. Otherwise the lower directories form "lowerdir=", the source becomes
 * "upperdir=" and the work directory of 'm' becomes "workdir=". All paths are passed through
 * shell_escape() with "," and ":" before they are placed in the option string.
 *
 * Returns the result of the mount call, or a negative errno-style value on earlier failure. */
static int mount_overlay(const char *dest, CustomMount *m) {
        _cleanup_free_ char *lower_dirs = NULL, *target = NULL, *source_esc = NULL, *work_esc = NULL;
        const char *overlay_options;
        unsigned long flags;
        int r;

        assert(dest);
        assert(m);

        r = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &target, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to resolve %s/%s: %m", dest, m->destination);
        if (r == 0) {
                /* Mount point is missing, create it (but not its parents). */
                r = mkdir_label(target, 0755);
                if (r < 0)
                        return log_error_errno(r, "Creating mount point for overlay %s failed: %m", target);
        }

        /* Best effort only, the result is ignored on purpose. */
        (void) mkdir_p_label(m->source, 0755);

        lower_dirs = joined_and_escaped_lower_dirs(m->lower);
        if (!lower_dirs)
                return log_oom();

        source_esc = shell_escape(m->source, ",:");
        if (!source_esc)
                return log_oom();

        if (!m->read_only) {
                work_esc = shell_escape(m->work_dir, ",:");
                if (!work_esc)
                        return log_oom();

                overlay_options = strjoina("lowerdir=", lower_dirs, ",upperdir=", source_esc, ",workdir=", work_esc);
                flags = 0;
        } else {
                overlay_options = strjoina("lowerdir=", source_esc, ":", lower_dirs);
                flags = MS_RDONLY;
        }

        return mount_nofollow_verbose(LOG_ERR, "overlay", target, "overlay", flags, overlay_options);
}
868 
/* Hides the destination of 'm' below the container root 'dest': a node picked by
 * mode_to_inaccessible_node() according to the file mode of the destination is bind mounted on top
 * of it, and that bind mount is then remounted read-only. If the read-only remount fails the bind
 * mount is taken down again.
 *
 * If 'm' is marked graceful, failures are logged at debug level only and reported as success (0).
 * Otherwise failures are logged at error level and the negative errno-style value is returned. */
static int mount_inaccessible(const char *dest, CustomMount *m) {
        _cleanup_free_ char *target = NULL, *node = NULL;
        struct stat target_st;
        int level, r;

        assert(dest);
        assert(m);

        level = m->graceful ? LOG_DEBUG : LOG_ERR;

        r = chase_symlinks_and_stat(m->destination, dest, CHASE_PREFIX_ROOT, &target, &target_st, NULL);
        if (r < 0) {
                log_full_errno(level, r, "Failed to resolve %s/%s: %m", dest, m->destination);
                goto failed;
        }

        r = mode_to_inaccessible_node(NULL, target_st.st_mode, &node);
        if (r < 0)
                goto failed;

        r = mount_nofollow_verbose(level, node, target, NULL, MS_BIND, NULL);
        if (r < 0)
                goto failed;

        r = mount_nofollow_verbose(level, NULL, target, NULL, MS_BIND|MS_RDONLY|MS_REMOUNT, NULL);
        if (r < 0) {
                /* Don't leave a writable bind mount behind. */
                (void) umount_verbose(level, target, UMOUNT_NOFOLLOW);
                goto failed;
        }

        return 0;

failed:
        return m->graceful ? 0 : r;
}
899 
/* Mounts the file system described by 'm' (source, file system type and option string, no mount
 * flags) on its destination below the container root 'dest'. A missing mount point is created,
 * parents included.
 *
 * Returns the result of the mount call, or a negative errno-style value on earlier failure. */
static int mount_arbitrary(const char *dest, CustomMount *m) {
        _cleanup_free_ char *target = NULL;
        int k;

        assert(dest);
        assert(m);

        k = chase_symlinks(m->destination, dest, CHASE_PREFIX_ROOT|CHASE_NONEXISTENT, &target, NULL);
        if (k < 0)
                return log_error_errno(k, "Failed to resolve %s/%s: %m", dest, m->destination);

        if (k == 0) {
                /* Nothing there yet, so create the mount point. */
                k = mkdir_p_label(target, 0755);
                if (k < 0)
                        return log_error_errno(k, "Creating mount point for mount %s failed: %m", target);
        }

        return mount_nofollow_verbose(LOG_ERR, m->source, target, m->type_argument, 0, m->options);
}
918 
/* Tells whether the custom mount 'm' shall be processed in a pass that runs with the given mount
 * settings: its in_userns flag has to agree with MOUNT_IN_USERNS, and MOUNT_ROOT_ONLY and
 * MOUNT_NON_ROOT_ONLY restrict the pass to mounts on "/" and to mounts elsewhere, respectively. */
static bool custom_mount_matches_settings(const CustomMount *m, MountSettingsMask mount_settings) {
        return FLAGS_SET(mount_settings, MOUNT_IN_USERNS) == m->in_userns &&
                !(FLAGS_SET(mount_settings, MOUNT_ROOT_ONLY) && !path_equal(m->destination, "/")) &&
                !(FLAGS_SET(mount_settings, MOUNT_NON_ROOT_ONLY) && path_equal(m->destination, "/"));
}

/* Goes through the 'n' custom mounts in 'mounts' in order and establishes those that are selected by
 * 'mount_settings' below the container root 'dest', dispatching on the type of each mount.
 *
 * Stops at the first mount that fails and returns its negative errno-style value; returns 0 if all
 * selected mounts were set up. */
int mount_custom(
                const char *dest,
                CustomMount *mounts, size_t n,
                uid_t uid_shift,
                uid_t uid_range,
                const char *selinux_apifs_context,
                MountSettingsMask mount_settings) {

        assert(dest);

        for (size_t idx = 0; idx < n; idx++) {
                CustomMount *cur = &mounts[idx];
                int k;

                if (!custom_mount_matches_settings(cur, mount_settings))
                        continue;

                switch (cur->type) {

                case CUSTOM_MOUNT_BIND:
                        k = mount_bind(dest, cur, uid_shift, uid_range);
                        break;

                case CUSTOM_MOUNT_TMPFS:
                        k = mount_tmpfs(dest, cur, uid_shift, selinux_apifs_context);
                        break;

                case CUSTOM_MOUNT_OVERLAY:
                        k = mount_overlay(dest, cur);
                        break;

                case CUSTOM_MOUNT_INACCESSIBLE:
                        k = mount_inaccessible(dest, cur);
                        break;

                case CUSTOM_MOUNT_ARBITRARY:
                        k = mount_arbitrary(dest, cur);
                        break;

                default:
                        assert_not_reached();
                }

                if (k < 0)
                        return k;
        }

        return 0;
}
974 
has_custom_root_mount(const CustomMount * mounts,size_t n)975 bool has_custom_root_mount(const CustomMount *mounts, size_t n) {
976         for (size_t i = 0; i < n; i++)
977                 if (path_equal(mounts[i].destination, "/"))
978                         return true;
979 
980         return false;
981 }
982 
setup_volatile_state(const char * directory,uid_t uid_shift,const char * selinux_apifs_context)983 static int setup_volatile_state(const char *directory, uid_t uid_shift, const char *selinux_apifs_context) {
984         _cleanup_free_ char *buf = NULL;
985         const char *p, *options;
986         int r;
987 
988         assert(directory);
989 
990         /* --volatile=state means we simply overmount /var with a tmpfs, and the rest read-only. */
991 
992         r = bind_remount_recursive(directory, MS_RDONLY, MS_RDONLY, NULL);
993         if (r < 0)
994                 return log_error_errno(r, "Failed to remount %s read-only: %m", directory);
995 
996         p = prefix_roota(directory, "/var");
997         r = mkdir(p, 0755);
998         if (r < 0 && errno != EEXIST)
999                 return log_error_errno(errno, "Failed to create %s: %m", directory);
1000 
1001         options = "mode=755" TMPFS_LIMITS_VOLATILE_STATE;
1002         r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
1003         if (r < 0)
1004                 return log_oom();
1005         if (r > 0)
1006                 options = buf;
1007 
1008         return mount_nofollow_verbose(LOG_ERR, "tmpfs", p, "tmpfs", MS_STRICTATIME, options);
1009 }
1010 
setup_volatile_yes(const char * directory,uid_t uid_shift,const char * selinux_apifs_context)1011 static int setup_volatile_yes(const char *directory, uid_t uid_shift, const char *selinux_apifs_context) {
1012         bool tmpfs_mounted = false, bind_mounted = false;
1013         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1014         _cleanup_free_ char *buf = NULL, *bindir = NULL;
1015         const char *f, *t, *options;
1016         struct stat st;
1017         int r;
1018 
1019         assert(directory);
1020 
1021         /* --volatile=yes means we mount a tmpfs to the root dir, and the original /usr to use inside it, and
1022          * that read-only. Before we start setting this up let's validate if the image has the /usr merge
1023          * implemented, and let's output a friendly log message if it hasn't. */
1024 
1025         bindir = path_join(directory, "/bin");
1026         if (!bindir)
1027                 return log_oom();
1028         if (lstat(bindir, &st) < 0) {
1029                 if (errno != ENOENT)
1030                         return log_error_errno(errno, "Failed to stat /bin directory below image: %m");
1031 
1032                 /* ENOENT is fine, just means the image is probably just a naked /usr and we can create the
1033                  * rest. */
1034         } else if (S_ISDIR(st.st_mode))
1035                 return log_error_errno(SYNTHETIC_ERRNO(EISDIR),
1036                                        "Sorry, --volatile=yes mode is not supported with OS images that have not merged /bin/, /sbin/, /lib/, /lib64/ into /usr/. "
1037                                        "Please work with your distribution and help them adopt the merged /usr scheme.");
1038         else if (!S_ISLNK(st.st_mode))
1039                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1040                                        "Error starting image: if --volatile=yes is used /bin must be a symlink (for merged /usr support) or non-existent (in which case a symlink is created automatically).");
1041 
1042         if (!mkdtemp(template))
1043                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1044 
1045         options = "mode=755" TMPFS_LIMITS_ROOTFS;
1046         r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
1047         if (r < 0)
1048                 goto fail;
1049         if (r > 0)
1050                 options = buf;
1051 
1052         r = mount_nofollow_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options);
1053         if (r < 0)
1054                 goto fail;
1055 
1056         tmpfs_mounted = true;
1057 
1058         f = prefix_roota(directory, "/usr");
1059         t = prefix_roota(template, "/usr");
1060 
1061         r = mkdir(t, 0755);
1062         if (r < 0 && errno != EEXIST) {
1063                 r = log_error_errno(errno, "Failed to create %s: %m", t);
1064                 goto fail;
1065         }
1066 
1067         r = mount_nofollow_verbose(LOG_ERR, f, t, NULL, MS_BIND|MS_REC, NULL);
1068         if (r < 0)
1069                 goto fail;
1070 
1071         bind_mounted = true;
1072 
1073         r = bind_remount_recursive(t, MS_RDONLY, MS_RDONLY, NULL);
1074         if (r < 0) {
1075                 log_error_errno(r, "Failed to remount %s read-only: %m", t);
1076                 goto fail;
1077         }
1078 
1079         r = mount_nofollow_verbose(LOG_ERR, template, directory, NULL, MS_MOVE, NULL);
1080         if (r < 0)
1081                 goto fail;
1082 
1083         (void) rmdir(template);
1084 
1085         return 0;
1086 
1087 fail:
1088         if (bind_mounted)
1089                 (void) umount_verbose(LOG_ERR, t, UMOUNT_NOFOLLOW);
1090 
1091         if (tmpfs_mounted)
1092                 (void) umount_verbose(LOG_ERR, template, UMOUNT_NOFOLLOW);
1093 
1094         (void) rmdir(template);
1095         return r;
1096 }
1097 
setup_volatile_overlay(const char * directory,uid_t uid_shift,const char * selinux_apifs_context)1098 static int setup_volatile_overlay(const char *directory, uid_t uid_shift, const char *selinux_apifs_context) {
1099         _cleanup_free_ char *buf = NULL, *escaped_directory = NULL, *escaped_upper = NULL, *escaped_work = NULL;
1100         char template[] = "/tmp/nspawn-volatile-XXXXXX";
1101         const char *upper, *work, *options;
1102         bool tmpfs_mounted = false;
1103         int r;
1104 
1105         assert(directory);
1106 
1107         /* --volatile=overlay means we mount an overlayfs to the root dir. */
1108 
1109         if (!mkdtemp(template))
1110                 return log_error_errno(errno, "Failed to create temporary directory: %m");
1111 
1112         options = "mode=755" TMPFS_LIMITS_ROOTFS;
1113         r = tmpfs_patch_options(options, uid_shift == 0 ? UID_INVALID : uid_shift, selinux_apifs_context, &buf);
1114         if (r < 0)
1115                 goto finish;
1116         if (r > 0)
1117                 options = buf;
1118 
1119         r = mount_nofollow_verbose(LOG_ERR, "tmpfs", template, "tmpfs", MS_STRICTATIME, options);
1120         if (r < 0)
1121                 goto finish;
1122 
1123         tmpfs_mounted = true;
1124 
1125         upper = strjoina(template, "/upper");
1126         work = strjoina(template, "/work");
1127 
1128         if (mkdir(upper, 0755) < 0) {
1129                 r = log_error_errno(errno, "Failed to create %s: %m", upper);
1130                 goto finish;
1131         }
1132         if (mkdir(work, 0755) < 0) {
1133                 r = log_error_errno(errno, "Failed to create %s: %m", work);
1134                 goto finish;
1135         }
1136 
1137         /* And now, let's overmount the root dir with an overlayfs that uses the root dir as lower dir. It's kinda nice
1138          * that the kernel allows us to do that without going through some mount point rearrangements. */
1139 
1140         escaped_directory = shell_escape(directory, ",:");
1141         escaped_upper = shell_escape(upper, ",:");
1142         escaped_work = shell_escape(work, ",:");
1143         if (!escaped_directory || !escaped_upper || !escaped_work) {
1144                 r = -ENOMEM;
1145                 goto finish;
1146         }
1147 
1148         options = strjoina("lowerdir=", escaped_directory, ",upperdir=", escaped_upper, ",workdir=", escaped_work);
1149         r = mount_nofollow_verbose(LOG_ERR, "overlay", directory, "overlay", 0, options);
1150 
1151 finish:
1152         if (tmpfs_mounted)
1153                 (void) umount_verbose(LOG_ERR, template, UMOUNT_NOFOLLOW);
1154 
1155         (void) rmdir(template);
1156         return r;
1157 }
1158 
/* Applies the requested volatile mode to the tree at 'directory' by handing over to the matching
 * setup_volatile_*() helper. For any mode other than VOLATILE_YES, VOLATILE_STATE and
 * VOLATILE_OVERLAY nothing is done and 0 is returned.
 *
 * Returns whatever the selected helper returns. */
int setup_volatile_mode(
                const char *directory,
                VolatileMode mode,
                uid_t uid_shift,
                const char *selinux_apifs_context) {

        if (mode == VOLATILE_YES)
                return setup_volatile_yes(directory, uid_shift, selinux_apifs_context);

        if (mode == VOLATILE_STATE)
                return setup_volatile_state(directory, uid_shift, selinux_apifs_context);

        if (mode == VOLATILE_OVERLAY)
                return setup_volatile_overlay(directory, uid_shift, selinux_apifs_context);

        /* Nothing to set up for the remaining modes. */
        return 0;
}
1180 
1181 /* Expects *pivot_root_new and *pivot_root_old to be initialised to allocated memory or NULL. */
pivot_root_parse(char ** pivot_root_new,char ** pivot_root_old,const char * s)1182 int pivot_root_parse(char **pivot_root_new, char **pivot_root_old, const char *s) {
1183         _cleanup_free_ char *root_new = NULL, *root_old = NULL;
1184         const char *p = s;
1185         int r;
1186 
1187         assert(pivot_root_new);
1188         assert(pivot_root_old);
1189 
1190         r = extract_first_word(&p, &root_new, ":", EXTRACT_DONT_COALESCE_SEPARATORS);
1191         if (r < 0)
1192                 return r;
1193         if (r == 0)
1194                 return -EINVAL;
1195 
1196         if (isempty(p))
1197                 root_old = NULL;
1198         else {
1199                 root_old = strdup(p);
1200                 if (!root_old)
1201                         return -ENOMEM;
1202         }
1203 
1204         if (!path_is_absolute(root_new))
1205                 return -EINVAL;
1206         if (root_old && !path_is_absolute(root_old))
1207                 return -EINVAL;
1208 
1209         free_and_replace(*pivot_root_new, root_new);
1210         free_and_replace(*pivot_root_old, root_old);
1211 
1212         return 0;
1213 }
1214 
setup_pivot_root(const char * directory,const char * pivot_root_new,const char * pivot_root_old)1215 int setup_pivot_root(const char *directory, const char *pivot_root_new, const char *pivot_root_old) {
1216         _cleanup_free_ char *directory_pivot_root_new = NULL;
1217         _cleanup_free_ char *pivot_tmp_pivot_root_old = NULL;
1218         char pivot_tmp[] = "/tmp/nspawn-pivot-XXXXXX";
1219         bool remove_pivot_tmp = false;
1220         int r;
1221 
1222         assert(directory);
1223 
1224         if (!pivot_root_new)
1225                 return 0;
1226 
1227         /* Pivot pivot_root_new to / and the existing / to pivot_root_old.
1228          * If pivot_root_old is NULL, the existing / disappears.
1229          * This requires a temporary directory, pivot_tmp, which is
1230          * not a child of either.
1231          *
1232          * This is typically used for OSTree-style containers, where
1233          * the root partition contains several sysroots which could be
1234          * run. Normally, one would be chosen by the bootloader and
1235          * pivoted to / by initramfs.
1236          *
1237          * For example, for an OSTree deployment, pivot_root_new
1238          * would be: /ostree/deploy/$os/deploy/$checksum. Note that this
1239          * code doesn’t do the /var mount which OSTree expects: use
1240          * --bind +/sysroot/ostree/deploy/$os/var:/var for that.
1241          *
1242          * So in the OSTree case, we’ll end up with something like:
1243          *  - directory = /tmp/nspawn-root-123456
1244          *  - pivot_root_new = /ostree/deploy/os/deploy/123abc
1245          *  - pivot_root_old = /sysroot
1246          *  - directory_pivot_root_new =
1247          *       /tmp/nspawn-root-123456/ostree/deploy/os/deploy/123abc
1248          *  - pivot_tmp = /tmp/nspawn-pivot-123456
1249          *  - pivot_tmp_pivot_root_old = /tmp/nspawn-pivot-123456/sysroot
1250          *
1251          * Requires all file systems at directory and below to be mounted
1252          * MS_PRIVATE or MS_SLAVE so they can be moved.
1253          */
1254         directory_pivot_root_new = path_join(directory, pivot_root_new);
1255         if (!directory_pivot_root_new)
1256                 return log_oom();
1257 
1258         /* Remount directory_pivot_root_new to make it movable. */
1259         r = mount_nofollow_verbose(LOG_ERR, directory_pivot_root_new, directory_pivot_root_new, NULL, MS_BIND, NULL);
1260         if (r < 0)
1261                 goto done;
1262 
1263         if (pivot_root_old) {
1264                 if (!mkdtemp(pivot_tmp)) {
1265                         r = log_error_errno(errno, "Failed to create temporary directory: %m");
1266                         goto done;
1267                 }
1268 
1269                 remove_pivot_tmp = true;
1270                 pivot_tmp_pivot_root_old = path_join(pivot_tmp, pivot_root_old);
1271                 if (!pivot_tmp_pivot_root_old) {
1272                         r = log_oom();
1273                         goto done;
1274                 }
1275 
1276                 r = mount_nofollow_verbose(LOG_ERR, directory_pivot_root_new, pivot_tmp, NULL, MS_MOVE, NULL);
1277                 if (r < 0)
1278                         goto done;
1279 
1280                 r = mount_nofollow_verbose(LOG_ERR, directory, pivot_tmp_pivot_root_old, NULL, MS_MOVE, NULL);
1281                 if (r < 0)
1282                         goto done;
1283 
1284                 r = mount_nofollow_verbose(LOG_ERR, pivot_tmp, directory, NULL, MS_MOVE, NULL);
1285                 if (r < 0)
1286                         goto done;
1287         } else {
1288                 r = mount_nofollow_verbose(LOG_ERR, directory_pivot_root_new, directory, NULL, MS_MOVE, NULL);
1289                 if (r < 0)
1290                         goto done;
1291         }
1292 
1293 done:
1294         if (remove_pivot_tmp)
1295                 (void) rmdir(pivot_tmp);
1296 
1297         return r;
1298 }
1299