1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2 
3 #include <errno.h>
4 #include <stdlib.h>
5 #include <sys/mount.h>
6 #include <sys/stat.h>
7 #include <sys/statvfs.h>
8 #include <unistd.h>
9 #include <linux/loop.h>
10 #include <linux/fs.h>
11 
12 #include "alloc-util.h"
13 #include "chase-symlinks.h"
14 #include "dissect-image.h"
15 #include "exec-util.h"
16 #include "extract-word.h"
17 #include "fd-util.h"
18 #include "fileio.h"
19 #include "fs-util.h"
20 #include "hashmap.h"
21 #include "label.h"
22 #include "libmount-util.h"
23 #include "missing_mount.h"
24 #include "missing_syscall.h"
25 #include "mkdir-label.h"
26 #include "mount-util.h"
27 #include "mountpoint-util.h"
28 #include "namespace-util.h"
29 #include "parse-util.h"
30 #include "path-util.h"
31 #include "process-util.h"
32 #include "set.h"
33 #include "stat-util.h"
34 #include "stdio-util.h"
35 #include "string-util.h"
36 #include "strv.h"
37 #include "tmpfile-util.h"
38 #include "user-util.h"
39 
/* Calls mount() with the target given as the path FORMAT_PROC_FD_PATH() generates for 'target_fd'; all
 * other arguments are handed to mount() unchanged.
 *
 * Returns 0 on success and a negative errno-style value on failure. If mount() fails with ENOENT and
 * proc_mounted() reports 0, -ENOSYS is returned instead of -ENOENT. */
int mount_fd(const char *source,
             int target_fd,
             const char *filesystemtype,
             unsigned long mountflags,
             const void *data) {

        if (mount(source, FORMAT_PROC_FD_PATH(target_fd), filesystemtype, mountflags, data) >= 0)
                return 0;

        if (errno != ENOENT)
                return -errno;

        /* ENOENT is ambiguous: the source may be missing, or /proc/ may not be mounted at all. Tell the
         * two cases apart so that callers can generate a more helpful error message. */
        return proc_mounted() == 0 ? -ENOSYS : -ENOENT;
}
60 
mount_nofollow(const char * source,const char * target,const char * filesystemtype,unsigned long mountflags,const void * data)61 int mount_nofollow(
62                 const char *source,
63                 const char *target,
64                 const char *filesystemtype,
65                 unsigned long mountflags,
66                 const void *data) {
67 
68         _cleanup_close_ int fd = -1;
69 
70         /* In almost all cases we want to manipulate the mount table without following symlinks, hence
71          * mount_nofollow() is usually the way to go. The only exceptions are environments where /proc/ is
72          * not available yet, since we need /proc/self/fd/ for this logic to work. i.e. during the early
73          * initialization of namespacing/container stuff where /proc is not yet mounted (and maybe even the
74          * fs to mount) we can only use traditional mount() directly.
75          *
76          * Note that this disables following only for the final component of the target, i.e symlinks within
77          * the path of the target are honoured, as are symlinks in the source path everywhere. */
78 
79         fd = open(target, O_PATH|O_CLOEXEC|O_NOFOLLOW);
80         if (fd < 0)
81                 return -errno;
82 
83         return mount_fd(source, fd, filesystemtype, mountflags, data);
84 }
85 
umount_recursive(const char * prefix,int flags)86 int umount_recursive(const char *prefix, int flags) {
87         int n = 0, r;
88         bool again;
89 
90         /* Try to umount everything recursively below a directory. Also, take care of stacked mounts, and
91          * keep unmounting them until they are gone. */
92 
93         do {
94                 _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
95                 _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
96 
97                 again = false;
98 
99                 r = libmount_parse("/proc/self/mountinfo", NULL, &table, &iter);
100                 if (r < 0)
101                         return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");
102 
103                 for (;;) {
104                         struct libmnt_fs *fs;
105                         const char *path;
106 
107                         r = mnt_table_next_fs(table, iter, &fs);
108                         if (r == 1)
109                                 break;
110                         if (r < 0)
111                                 return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");
112 
113                         path = mnt_fs_get_target(fs);
114                         if (!path)
115                                 continue;
116 
117                         if (!path_startswith(path, prefix))
118                                 continue;
119 
120                         if (umount2(path, flags | UMOUNT_NOFOLLOW) < 0) {
121                                 log_debug_errno(errno, "Failed to umount %s, ignoring: %m", path);
122                                 continue;
123                         }
124 
125                         log_debug("Successfully unmounted %s", path);
126 
127                         again = true;
128                         n++;
129 
130                         break;
131                 }
132         } while (again);
133 
134         return n;
135 }
136 
137 #define MS_CONVERTIBLE_FLAGS (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_NOSYMFOLLOW)
138 
ms_flags_to_mount_attr(unsigned long a)139 static uint64_t ms_flags_to_mount_attr(unsigned long a) {
140         uint64_t f = 0;
141 
142         if (FLAGS_SET(a, MS_RDONLY))
143                 f |= MOUNT_ATTR_RDONLY;
144 
145         if (FLAGS_SET(a, MS_NOSUID))
146                 f |= MOUNT_ATTR_NOSUID;
147 
148         if (FLAGS_SET(a, MS_NODEV))
149                 f |= MOUNT_ATTR_NODEV;
150 
151         if (FLAGS_SET(a, MS_NOEXEC))
152                 f |= MOUNT_ATTR_NOEXEC;
153 
154         if (FLAGS_SET(a, MS_NOSYMFOLLOW))
155                 f |= MOUNT_ATTR_NOSYMFOLLOW;
156 
157         return f;
158 }
159 
/* Set to true once mount_setattr() failed with an errno that ERRNO_IS_NOT_SUPPORTED() matches. While set,
 * the bind_remount_*() functions in this file skip the mount_setattr() shortcut and go straight to the
 * classic remount logic. */
static bool skip_mount_set_attr = false;

/* Use this function only if you do not have direct access to /proc/self/mountinfo but the caller can open it
 * for you. This is the case when /proc is masked or not mounted. Otherwise, use bind_remount_recursive.
 *
 * Changes the mount flags of 'prefix' and of the mounts below it.
 *
 *   prefix              - top-level directory to operate on; must not be NULL.
 *   new_flags           - MS_* flags that shall be set. Note that the classic code path ORs these in
 *                         without masking them by 'flags_mask'.
 *   flags_mask          - MS_* flags that shall be changed: these bits are dropped from the flags a mount
 *                         currently has before 'new_flags' is applied.
 *   deny_list           - optional list of subtrees to leave alone, may be NULL/empty.
 *   proc_self_mountinfo - already opened mount table stream, or NULL, in which case
 *                         /proc/self/mountinfo is opened (and closed again) by this function.
 *
 * Returns 0 on success, -EBUSY if the mount table still had unprocessed entries after 32 passes, or
 * another negative errno-style value on failure. */
int bind_remount_recursive_with_mountinfo(
                const char *prefix,
                unsigned long new_flags,
                unsigned long flags_mask,
                char **deny_list,
                FILE *proc_self_mountinfo) {

        _cleanup_fclose_ FILE *proc_self_mountinfo_opened = NULL;
        _cleanup_set_free_ Set *done = NULL;
        unsigned n_tries = 0;
        int r;

        assert(prefix);

        /* The shortcut is only taken if every flag to change is convertible, no deny list is in effect
         * and mount_setattr() has not been found unsupported before. */
        if ((flags_mask & ~MS_CONVERTIBLE_FLAGS) == 0 && strv_isempty(deny_list) && !skip_mount_set_attr) {
                /* Let's take a shortcut for all the flags we know how to convert into mount_setattr() flags */

                if (mount_setattr(AT_FDCWD, prefix, AT_SYMLINK_NOFOLLOW|AT_RECURSIVE,
                                  &(struct mount_attr) {
                                          .attr_set = ms_flags_to_mount_attr(new_flags & flags_mask),
                                          .attr_clr = ms_flags_to_mount_attr(~new_flags & flags_mask),
                                  }, MOUNT_ATTR_SIZE_VER0) < 0) {

                        log_debug_errno(errno, "mount_setattr() failed, falling back to classic remounting: %m");

                        /* We fall through to classic behaviour if not supported (i.e. kernel < 5.12). We
                         * also do this for all other kinds of errors since there are so many different
                         * ones, and mount_setattr() has no graceful mode where it continues despite seeing
                         * errors on some mounts, but we want that. Moreover mount_setattr() only works on
                         * the mount point inode itself, not a non-mount point inode, and we want to support
                         * arbitrary prefixes here. */

                        if (ERRNO_IS_NOT_SUPPORTED(errno)) /* if not supported, then don't bother at all anymore */
                                skip_mount_set_attr = true;
                } else
                        return 0; /* Nice, this worked! */
        }

        /* If the caller did not hand us a mount table stream, open one ourselves. It is closed
         * automatically when we return. */
        if (!proc_self_mountinfo) {
                r = fopen_unlocked("/proc/self/mountinfo", "re", &proc_self_mountinfo_opened);
                if (r < 0)
                        return r;

                proc_self_mountinfo = proc_self_mountinfo_opened;
        }

        /* Recursively remount a directory (and all its submounts) with desired flags (MS_RDONLY,
         * MS_NOSUID, MS_NOEXEC). If the directory is already mounted, we reuse the mount and simply mark it
         * MS_BIND|MS_RDONLY (or remove the MS_RDONLY for read-write operation), ditto for other flags. If it
         * isn't we first make it one. Afterwards we apply (or remove) the flags to all submounts we can
         * access, too. When mounts are stacked on the same mount point we only care for each individual
         * "top-level" mount on each point, as we cannot influence/access the underlying mounts anyway. We do
         * not have any effect on future submounts that might get propagated, they might be writable
         * etc. This includes future submounts that have been triggered via autofs. Also note that we can't
         * operate atomically here. Mounts established while we process the tree might or might not get
         * noticed and thus might or might not be covered.
         *
         * If the "deny_list" parameter is specified it may contain a list of subtrees to exclude from the
         * remount operation. Note that we'll ignore the deny list for the top-level path. */

        /* Each iteration of this outer loop is one full pass: re-read the mount table, collect the mounts
         * not handled yet in 'todo', then remount them one by one, recording them in 'done'. */
        for (;;) {
                _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
                _cleanup_(mnt_free_iterp) struct libmnt_iter *iter = NULL;
                _cleanup_hashmap_free_ Hashmap *todo = NULL;
                bool top_autofs = false;

                if (n_tries++ >= 32) /* Let's not retry this loop forever */
                        return -EBUSY;

                rewind(proc_self_mountinfo);

                r = libmount_parse("/proc/self/mountinfo", proc_self_mountinfo, &table, &iter);
                if (r < 0)
                        return log_debug_errno(r, "Failed to parse /proc/self/mountinfo: %m");

                /* Phase 1: walk the mount table and fill 'todo' with path → current mount flags */
                for (;;) {
                        _cleanup_free_ char *d = NULL;
                        const char *path, *type, *opts;
                        unsigned long flags = 0;
                        struct libmnt_fs *fs;

                        r = mnt_table_next_fs(table, iter, &fs);
                        if (r == 1) /* EOF */
                                break;
                        if (r < 0)
                                return log_debug_errno(r, "Failed to get next entry from /proc/self/mountinfo: %m");

                        path = mnt_fs_get_target(fs);
                        if (!path)
                                continue;

                        if (!path_startswith(path, prefix))
                                continue;

                        type = mnt_fs_get_fstype(fs);
                        if (!type)
                                continue;

                        /* Let's ignore autofs mounts. If they aren't triggered yet, we want to avoid
                         * triggering them, as we don't make any guarantees for future submounts anyway. If
                         * they are already triggered, then we will find another entry for this. */
                        if (streq(type, "autofs")) {
                                top_autofs = top_autofs || path_equal(path, prefix);
                                continue;
                        }

                        if (set_contains(done, path))
                                continue;

                        /* Ignore this mount if it is deny-listed, but only if it isn't the top-level mount
                         * we shall operate on. */
                        if (!path_equal(path, prefix)) {
                                bool deny_listed = false;

                                STRV_FOREACH(i, deny_list) {
                                        /* Deny list entries that are the prefix itself or lie outside of
                                         * it are not considered. */
                                        if (path_equal(*i, prefix))
                                                continue;

                                        if (!path_startswith(*i, prefix))
                                                continue;

                                        if (path_startswith(path, *i)) {
                                                deny_listed = true;
                                                log_debug("Not remounting %s deny-listed by %s, called for %s", path, *i, prefix);
                                                break;
                                        }
                                }

                                if (deny_listed)
                                        continue;
                        }

                        /* Determine the flags the mount currently has. If that fails, 'flags' stays as
                         * whatever mnt_optstr_get_flags() left it at (0 if untouched). */
                        opts = mnt_fs_get_vfs_options(fs);
                        if (opts) {
                                r = mnt_optstr_get_flags(opts, &flags, mnt_get_builtin_optmap(MNT_LINUX_MAP));
                                if (r < 0)
                                        log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path);
                        }

                        d = strdup(path);
                        if (!d)
                                return -ENOMEM;

                        r = hashmap_ensure_put(&todo, &path_hash_ops_free, d, ULONG_TO_PTR(flags));
                        if (r == -EEXIST)
                                /* If the same path was recorded, but with different mount flags, update it:
                                 * it means a mount point is overmounted, and libmount returns the "bottom" (or
                                 * older one) first, but we want to reapply the flags from the "top" (or newer
                                 * one). See: https://github.com/systemd/systemd/issues/20032
                                 * Note that this shouldn't really fail, as we were just told that the key
                                 * exists, and it's an update so we want 'd' to be freed immediately. */
                                r = hashmap_update(todo, d, ULONG_TO_PTR(flags));
                        if (r < 0)
                                return r;
                        if (r > 0)
                                TAKE_PTR(d);
                }

                /* Check if the top-level directory was among what we have seen so far. For that check both
                 * 'done' and 'todo'. Also check 'top_autofs' because if the top-level dir is an autofs we'll
                 * not include it in either set but will set this bool. */
                if (!set_contains(done, prefix) &&
                    !(top_autofs || hashmap_contains(todo, prefix))) {

                        /* The prefix directory itself is not yet a mount, make it one. */
                        r = mount_nofollow(prefix, prefix, NULL, MS_BIND|MS_REC, NULL);
                        if (r < 0)
                                return r;

                        /* Immediately rescan, so that we pick up the new mount's flags */
                        continue;
                }

                /* If we have no submounts to process anymore, we are done */
                if (hashmap_isempty(todo))
                        return 0;

                /* Phase 2: drain 'todo', remounting each entry and moving its path over to 'done' */
                for (;;) {
                        unsigned long flags;
                        char *x = NULL;

                        /* Take the first mount from our list of mounts to still process */
                        flags = PTR_TO_ULONG(hashmap_steal_first_key_and_value(todo, (void**) &x));
                        if (!x)
                                break;

                        /* Hand 'x' over to 'done'. After this call 'x' is only used if it was actually
                         * inserted (r > 0). */
                        r = set_ensure_consume(&done, &path_hash_ops_free, x);
                        if (IN_SET(r, 0, -EEXIST))
                                continue; /* Already done */
                        if (r < 0)
                                return r;

                        /* Now, remount this with the new flags set, but exclude MS_RELATIME from it. (It's
                         * the default anyway, thus redundant, and in userns we'll get an error if we try to
                         * explicitly enable it) */
                        r = mount_nofollow(NULL, x, NULL, ((flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags) & ~MS_RELATIME, NULL);
                        if (r < 0) {
                                int q;

                                /* OK, so the remount of this entry failed. We'll ultimately ignore this in
                                 * almost all cases (there are simply so many reasons why this can fail,
                                 * think autofs, NFS, FUSE, …), but let's generate useful debug messages at
                                 * the very least. */

                                q = path_is_mount_point(x, NULL, 0);
                                if (IN_SET(q, 0, -ENOENT)) {
                                        /* Hmm, whaaaa? The mount point is not actually a mount point? Then
                                         * it is either obstructed by a later mount or somebody has been
                                         * racing against us and removed it. Either way the mount point
                                         * doesn't matter to us, let's ignore it hence. */
                                        log_debug_errno(r, "Mount point '%s' to remount is not a mount point anymore, ignoring remount failure: %m", x);
                                        continue;
                                }
                                if (q < 0) /* Any other error on this? Just log and continue */
                                        log_debug_errno(q, "Failed to determine whether '%s' is a mount point or not, ignoring: %m", x);

                                if (((flags ^ new_flags) & flags_mask & ~MS_RELATIME) == 0) { /* ignore MS_RELATIME while comparing */
                                        log_debug_errno(r, "Couldn't remount '%s', but the flags already match what we want, hence ignoring: %m", x);
                                        continue;
                                }

                                /* Make this fatal if this is the top-level mount */
                                if (path_equal(x, prefix))
                                        return r;

                                /* If this is not the top-level mount, then handle this gracefully: log but
                                 * otherwise ignore. With NFS, FUSE, autofs there are just too many reasons
                                 * this might fail without a chance for us to do anything about it, let's
                                 * hence be strict on the top-level mount and lenient on the inner ones. */
                                log_debug_errno(r, "Couldn't remount submount '%s' for unexpected reason, ignoring: %m", x);
                                continue;
                        }

                        log_debug("Remounted %s.", x);
                }
        }
}
401 
bind_remount_one_with_mountinfo(const char * path,unsigned long new_flags,unsigned long flags_mask,FILE * proc_self_mountinfo)402 int bind_remount_one_with_mountinfo(
403                 const char *path,
404                 unsigned long new_flags,
405                 unsigned long flags_mask,
406                 FILE *proc_self_mountinfo) {
407 
408         _cleanup_(mnt_free_tablep) struct libmnt_table *table = NULL;
409         unsigned long flags = 0;
410         struct libmnt_fs *fs;
411         const char *opts;
412         int r;
413 
414         assert(path);
415         assert(proc_self_mountinfo);
416 
417         if ((flags_mask & ~MS_CONVERTIBLE_FLAGS) == 0 && !skip_mount_set_attr) {
418                 /* Let's take a shortcut for all the flags we know how to convert into mount_setattr() flags */
419 
420                 if (mount_setattr(AT_FDCWD, path, AT_SYMLINK_NOFOLLOW,
421                                   &(struct mount_attr) {
422                                           .attr_set = ms_flags_to_mount_attr(new_flags & flags_mask),
423                                           .attr_clr = ms_flags_to_mount_attr(~new_flags & flags_mask),
424                                   }, MOUNT_ATTR_SIZE_VER0) < 0) {
425 
426                         log_debug_errno(errno, "mount_setattr() didn't work, falling back to classic remounting: %m");
427 
428                         if (ERRNO_IS_NOT_SUPPORTED(errno)) /* if not supported, then don't bother at all anymore */
429                                 skip_mount_set_attr = true;
430                 } else
431                         return 0; /* Nice, this worked! */
432         }
433 
434         rewind(proc_self_mountinfo);
435 
436         table = mnt_new_table();
437         if (!table)
438                 return -ENOMEM;
439 
440         r = mnt_table_parse_stream(table, proc_self_mountinfo, "/proc/self/mountinfo");
441         if (r < 0)
442                 return r;
443 
444         fs = mnt_table_find_target(table, path, MNT_ITER_FORWARD);
445         if (!fs) {
446                 if (laccess(path, F_OK) < 0) /* Hmm, it's not in the mount table, but does it exist at all? */
447                         return -errno;
448 
449                 return -EINVAL; /* Not a mount point we recognize */
450         }
451 
452         opts = mnt_fs_get_vfs_options(fs);
453         if (opts) {
454                 r = mnt_optstr_get_flags(opts, &flags, mnt_get_builtin_optmap(MNT_LINUX_MAP));
455                 if (r < 0)
456                         log_debug_errno(r, "Could not get flags for '%s', ignoring: %m", path);
457         }
458 
459         r = mount_nofollow(NULL, path, NULL, ((flags & ~flags_mask)|MS_BIND|MS_REMOUNT|new_flags) & ~MS_RELATIME, NULL);
460         if (r < 0) {
461                 if (((flags ^ new_flags) & flags_mask & ~MS_RELATIME) != 0) /* Ignore MS_RELATIME again,
462                                                                              * since kernel adds it in
463                                                                              * everywhere, because it's the
464                                                                              * default. */
465                         return r;
466 
467                 /* Let's handle redundant remounts gracefully */
468                 log_debug_errno(r, "Failed to remount '%s' but flags already match what we want, ignoring: %m", path);
469         }
470 
471         return 0;
472 }
473 
/* Makes 'path' the new root: changes into it, moves the mount at 'path' onto "/" (MS_MOVE), chroots into
 * the current directory and finally changes to "/". Stops at the first step that fails.
 *
 * Returns 0 on success, a negative errno-style value of the failing step otherwise. */
int mount_move_root(const char *path) {
        assert(path);

        /* The || chain short-circuits, hence errno is the one of whichever call failed first */
        if (chdir(path) < 0 ||
            mount(path, "/", NULL, MS_MOVE, NULL) < 0 ||
            chroot(".") < 0)
                return -errno;

        return RET_NERRNO(chdir("/"));
}
488 
/* Calls umount2() on 'path' with 'flags' over and over until it fails, so that all mounts stacked on the
 * same mount point are removed.
 *
 * If the terminating failure is EINVAL, returns 1 if at least one unmount succeeded and 0 if none did.
 * Any other failure is returned as a negative errno-style value. */
int repeat_unmount(const char *path, int flags) {
        bool unmounted_any = false;

        assert(path);

        while (umount2(path, flags) >= 0)
                unmounted_any = true;

        /* umount2() failed, which is the only way the loop above ends */
        if (errno == EINVAL)
                return unmounted_any;

        return -errno;
}
509 
mode_to_inaccessible_node(const char * runtime_dir,mode_t mode,char ** ret)510 int mode_to_inaccessible_node(
511                 const char *runtime_dir,
512                 mode_t mode,
513                 char **ret) {
514 
515         /* This function maps a node type to a corresponding inaccessible file node. These nodes are created
516          * during early boot by PID 1. In some cases we lacked the privs to create the character and block
517          * devices (maybe because we run in an userns environment, or miss CAP_SYS_MKNOD, or run with a
518          * devices policy that excludes device nodes with major and minor of 0), but that's fine, in that
519          * case we use an AF_UNIX file node instead, which is not the same, but close enough for most
520          * uses. And most importantly, the kernel allows bind mounts from socket nodes to any non-directory
521          * file nodes, and that's the most important thing that matters.
522          *
523          * Note that the runtime directory argument shall be the top-level runtime directory, i.e. /run/ if
524          * we operate in system context and $XDG_RUNTIME_DIR if we operate in user context. */
525 
526         _cleanup_free_ char *d = NULL;
527         const char *node = NULL;
528 
529         assert(ret);
530 
531         if (!runtime_dir)
532                 runtime_dir = "/run";
533 
534         switch (mode & S_IFMT) {
535                 case S_IFREG:
536                         node = "/systemd/inaccessible/reg";
537                         break;
538 
539                 case S_IFDIR:
540                         node = "/systemd/inaccessible/dir";
541                         break;
542 
543                 case S_IFCHR:
544                         node = "/systemd/inaccessible/chr";
545                         break;
546 
547                 case S_IFBLK:
548                         node = "/systemd/inaccessible/blk";
549                         break;
550 
551                 case S_IFIFO:
552                         node = "/systemd/inaccessible/fifo";
553                         break;
554 
555                 case S_IFSOCK:
556                         node = "/systemd/inaccessible/sock";
557                         break;
558         }
559         if (!node)
560                 return -EINVAL;
561 
562         d = path_join(runtime_dir, node);
563         if (!d)
564                 return -ENOMEM;
565 
566         /* On new kernels unprivileged users are permitted to create 0:0 char device nodes (because they also
567          * act as whiteout inode for overlayfs), but no other char or block device nodes. On old kernels no
568          * device node whatsoever may be created by unprivileged processes. Hence, if the caller asks for the
569          * inaccessible block device node let's see if the block device node actually exists, and if not,
570          * fall back to the character device node. From there fall back to the socket device node. This means
571          * in the best case we'll get the right device node type — but if not we'll hopefully at least get a
572          * device node at all. */
573 
574         if (S_ISBLK(mode) &&
575             access(d, F_OK) < 0 && errno == ENOENT) {
576                 free(d);
577                 d = path_join(runtime_dir, "/systemd/inaccessible/chr");
578                 if (!d)
579                         return -ENOMEM;
580         }
581 
582         if (IN_SET(mode & S_IFMT, S_IFBLK, S_IFCHR) &&
583             access(d, F_OK) < 0 && errno == ENOENT) {
584                 free(d);
585                 d = path_join(runtime_dir, "/systemd/inaccessible/sock");
586                 if (!d)
587                         return -ENOMEM;
588         }
589 
590         *ret = TAKE_PTR(d);
591         return 0;
592 }
593 
mount_flags_to_string(unsigned long flags,char ** ret)594 int mount_flags_to_string(unsigned long flags, char **ret) {
595         static const struct {
596                 unsigned long flag;
597                 const char *name;
598         } map[] = {
599                 { .flag = MS_RDONLY,      .name = "MS_RDONLY",      },
600                 { .flag = MS_NOSUID,      .name = "MS_NOSUID",      },
601                 { .flag = MS_NODEV,       .name = "MS_NODEV",       },
602                 { .flag = MS_NOEXEC,      .name = "MS_NOEXEC",      },
603                 { .flag = MS_SYNCHRONOUS, .name = "MS_SYNCHRONOUS", },
604                 { .flag = MS_REMOUNT,     .name = "MS_REMOUNT",     },
605                 { .flag = MS_MANDLOCK,    .name = "MS_MANDLOCK",    },
606                 { .flag = MS_DIRSYNC,     .name = "MS_DIRSYNC",     },
607                 { .flag = MS_NOSYMFOLLOW, .name = "MS_NOSYMFOLLOW", },
608                 { .flag = MS_NOATIME,     .name = "MS_NOATIME",     },
609                 { .flag = MS_NODIRATIME,  .name = "MS_NODIRATIME",  },
610                 { .flag = MS_BIND,        .name = "MS_BIND",        },
611                 { .flag = MS_MOVE,        .name = "MS_MOVE",        },
612                 { .flag = MS_REC,         .name = "MS_REC",         },
613                 { .flag = MS_SILENT,      .name = "MS_SILENT",      },
614                 { .flag = MS_POSIXACL,    .name = "MS_POSIXACL",    },
615                 { .flag = MS_UNBINDABLE,  .name = "MS_UNBINDABLE",  },
616                 { .flag = MS_PRIVATE,     .name = "MS_PRIVATE",     },
617                 { .flag = MS_SLAVE,       .name = "MS_SLAVE",       },
618                 { .flag = MS_SHARED,      .name = "MS_SHARED",      },
619                 { .flag = MS_RELATIME,    .name = "MS_RELATIME",    },
620                 { .flag = MS_KERNMOUNT,   .name = "MS_KERNMOUNT",   },
621                 { .flag = MS_I_VERSION,   .name = "MS_I_VERSION",   },
622                 { .flag = MS_STRICTATIME, .name = "MS_STRICTATIME", },
623                 { .flag = MS_LAZYTIME,    .name = "MS_LAZYTIME",    },
624         };
625         _cleanup_free_ char *str = NULL;
626 
627         assert(ret);
628 
629         for (size_t i = 0; i < ELEMENTSOF(map); i++)
630                 if (flags & map[i].flag) {
631                         if (!strextend_with_separator(&str, "|", map[i].name))
632                                 return -ENOMEM;
633                         flags &= ~map[i].flag;
634                 }
635 
636         if (!str || flags != 0)
637                 if (strextendf_with_separator(&str, "|", "%lx", flags) < 0)
638                         return -ENOMEM;
639 
640         *ret = TAKE_PTR(str);
641         return 0;
642 }
643 
mount_verbose_full(int error_log_level,const char * what,const char * where,const char * type,unsigned long flags,const char * options,bool follow_symlink)644 int mount_verbose_full(
645                 int error_log_level,
646                 const char *what,
647                 const char *where,
648                 const char *type,
649                 unsigned long flags,
650                 const char *options,
651                 bool follow_symlink) {
652 
653         _cleanup_free_ char *fl = NULL, *o = NULL;
654         unsigned long f;
655         int r;
656 
657         r = mount_option_mangle(options, flags, &f, &o);
658         if (r < 0)
659                 return log_full_errno(error_log_level, r,
660                                       "Failed to mangle mount options %s: %m",
661                                       strempty(options));
662 
663         (void) mount_flags_to_string(f, &fl);
664 
665         if ((f & MS_REMOUNT) && !what && !type)
666                 log_debug("Remounting %s (%s \"%s\")...",
667                           where, strnull(fl), strempty(o));
668         else if (!what && !type)
669                 log_debug("Mounting %s (%s \"%s\")...",
670                           where, strnull(fl), strempty(o));
671         else if ((f & MS_BIND) && !type)
672                 log_debug("Bind-mounting %s on %s (%s \"%s\")...",
673                           what, where, strnull(fl), strempty(o));
674         else if (f & MS_MOVE)
675                 log_debug("Moving mount %s → %s (%s \"%s\")...",
676                           what, where, strnull(fl), strempty(o));
677         else
678                 log_debug("Mounting %s (%s) on %s (%s \"%s\")...",
679                           strna(what), strna(type), where, strnull(fl), strempty(o));
680 
681         if (follow_symlink)
682                 r = RET_NERRNO(mount(what, where, type, f, o));
683         else
684                 r = mount_nofollow(what, where, type, f, o);
685         if (r < 0)
686                 return log_full_errno(error_log_level, r,
687                                       "Failed to mount %s (type %s) on %s (%s \"%s\"): %m",
688                                       strna(what), strna(type), where, strnull(fl), strempty(o));
689         return 0;
690 }
691 
/* Wrapper around umount2(2) that announces the operation at debug level and logs a failure at
 * 'error_log_level'. 'flags' is passed to umount2() unmodified.
 *
 * Returns 0 on success, or the negative error produced by log_full_errno() on failure. */
int umount_verbose(
                int error_log_level,
                const char *what,
                int flags) {

        assert(what);

        log_debug("Umounting %s...", what);

        if (umount2(what, flags) >= 0)
                return 0;

        return log_full_errno(error_log_level, errno,
                              "Failed to unmount %s: %m", what);
}
707 
mount_option_mangle(const char * options,unsigned long mount_flags,unsigned long * ret_mount_flags,char ** ret_remaining_options)708 int mount_option_mangle(
709                 const char *options,
710                 unsigned long mount_flags,
711                 unsigned long *ret_mount_flags,
712                 char **ret_remaining_options) {
713 
714         const struct libmnt_optmap *map;
715         _cleanup_free_ char *ret = NULL;
716         int r;
717 
718         /* This extracts mount flags from the mount options, and store
719          * non-mount-flag options to '*ret_remaining_options'.
720          * E.g.,
721          * "rw,nosuid,nodev,relatime,size=1630748k,mode=700,uid=1000,gid=1000"
722          * is split to MS_NOSUID|MS_NODEV|MS_RELATIME and
723          * "size=1630748k,mode=700,uid=1000,gid=1000".
724          * See more examples in test-mount-utils.c.
725          *
726          * Note that if 'options' does not contain any non-mount-flag options,
727          * then '*ret_remaining_options' is set to NULL instead of empty string.
728          * Note that this does not check validity of options stored in
729          * '*ret_remaining_options'.
730          * Note that if 'options' is NULL, then this just copies 'mount_flags'
731          * to '*ret_mount_flags'. */
732 
733         assert(ret_mount_flags);
734         assert(ret_remaining_options);
735 
736         map = mnt_get_builtin_optmap(MNT_LINUX_MAP);
737         if (!map)
738                 return -EINVAL;
739 
740         for (const char *p = options;;) {
741                 _cleanup_free_ char *word = NULL;
742                 const struct libmnt_optmap *ent;
743 
744                 r = extract_first_word(&p, &word, ",", EXTRACT_KEEP_QUOTE);
745                 if (r < 0)
746                         return r;
747                 if (r == 0)
748                         break;
749 
750                 for (ent = map; ent->name; ent++) {
751                         /* All entries in MNT_LINUX_MAP do not take any argument.
752                          * Thus, ent->name does not contain "=" or "[=]". */
753                         if (!streq(word, ent->name))
754                                 continue;
755 
756                         if (!(ent->mask & MNT_INVERT))
757                                 mount_flags |= ent->id;
758                         else if (mount_flags & ent->id)
759                                 mount_flags ^= ent->id;
760 
761                         break;
762                 }
763 
764                 /* If 'word' is not a mount flag, then store it in '*ret_remaining_options'. */
765                 if (!ent->name && !strextend_with_separator(&ret, ",", word))
766                         return -ENOMEM;
767         }
768 
769         *ret_mount_flags = mount_flags;
770         *ret_remaining_options = TAKE_PTR(ret);
771 
772         return 0;
773 }
774 
mount_in_namespace(pid_t target,const char * propagate_path,const char * incoming_path,const char * src,const char * dest,bool read_only,bool make_file_or_directory,const MountOptions * options,bool is_image)775 static int mount_in_namespace(
776                 pid_t target,
777                 const char *propagate_path,
778                 const char *incoming_path,
779                 const char *src,
780                 const char *dest,
781                 bool read_only,
782                 bool make_file_or_directory,
783                 const MountOptions *options,
784                 bool is_image) {
785 
786         _cleanup_close_pair_ int errno_pipe_fd[2] = { -1, -1 };
787         _cleanup_close_ int self_mntns_fd = -1, mntns_fd = -1, root_fd = -1, pidns_fd = -1, chased_src_fd = -1;
788         char mount_slave[] = "/tmp/propagate.XXXXXX", *mount_tmp, *mount_outside, *p;
789         bool mount_slave_created = false, mount_slave_mounted = false,
790                 mount_tmp_created = false, mount_tmp_mounted = false,
791                 mount_outside_created = false, mount_outside_mounted = false;
792         _cleanup_free_ char *chased_src_path = NULL;
793         struct stat st, self_mntns_st;
794         pid_t child;
795         int r;
796 
797         assert(target > 0);
798         assert(propagate_path);
799         assert(incoming_path);
800         assert(src);
801         assert(dest);
802         assert(!options || is_image);
803 
804         r = namespace_open(target, &pidns_fd, &mntns_fd, NULL, NULL, &root_fd);
805         if (r < 0)
806                 return log_debug_errno(r, "Failed to retrieve FDs of the target process' namespace: %m");
807 
808         if (fstat(mntns_fd, &st) < 0)
809                 return log_debug_errno(errno, "Failed to fstat mount namespace FD of target process: %m");
810 
811         r = namespace_open(0, NULL, &self_mntns_fd, NULL, NULL, NULL);
812         if (r < 0)
813                 return log_debug_errno(r, "Failed to retrieve FDs of systemd's namespace: %m");
814 
815         if (fstat(self_mntns_fd, &self_mntns_st) < 0)
816                 return log_debug_errno(errno, "Failed to fstat mount namespace FD of systemd: %m");
817 
818         /* We can't add new mounts at runtime if the process wasn't started in a namespace */
819         if (stat_inode_same(&st, &self_mntns_st))
820                 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Failed to activate bind mount in target, not running in a mount namespace");
821 
822         /* One day, when bind mounting /proc/self/fd/n works across namespace boundaries we should rework
823          * this logic to make use of it... */
824 
825         p = strjoina(propagate_path, "/");
826         r = laccess(p, F_OK);
827         if (r < 0)
828                 return log_debug_errno(r == -ENOENT ? SYNTHETIC_ERRNO(EOPNOTSUPP) : r, "Target does not allow propagation of mount points");
829 
830         r = chase_symlinks(src, NULL, 0, &chased_src_path, &chased_src_fd);
831         if (r < 0)
832                 return log_debug_errno(r, "Failed to resolve source path of %s: %m", src);
833         log_debug("Chased source path of %s to %s", src, chased_src_path);
834 
835         if (fstat(chased_src_fd, &st) < 0)
836                 return log_debug_errno(errno, "Failed to stat() resolved source path %s: %m", src);
837         if (S_ISLNK(st.st_mode)) /* This shouldn't really happen, given that we just chased the symlinks above, but let's better be safe… */
838                 return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "Source directory %s can't be a symbolic link", src);
839 
840         /* Our goal is to install a new bind mount into the container,
841            possibly read-only. This is irritatingly complex
842            unfortunately, currently.
843 
844            First, we start by creating a private playground in /tmp,
845            that we can mount MS_SLAVE. (Which is necessary, since
846            MS_MOVE cannot be applied to mounts with MS_SHARED parent
847            mounts.) */
848 
849         if (!mkdtemp(mount_slave))
850                 return log_debug_errno(errno, "Failed to create playground %s: %m", mount_slave);
851 
852         mount_slave_created = true;
853 
854         r = mount_nofollow_verbose(LOG_DEBUG, mount_slave, mount_slave, NULL, MS_BIND, NULL);
855         if (r < 0)
856                 goto finish;
857 
858         mount_slave_mounted = true;
859 
860         r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_slave, NULL, MS_SLAVE, NULL);
861         if (r < 0)
862                 goto finish;
863 
864         /* Second, we mount the source file or directory to a directory inside of our MS_SLAVE playground. */
865         mount_tmp = strjoina(mount_slave, "/mount");
866         if (is_image)
867                 r = mkdir_p(mount_tmp, 0700);
868         else
869                 r = make_mount_point_inode_from_stat(&st, mount_tmp, 0700);
870         if (r < 0) {
871                 log_debug_errno(r, "Failed to create temporary mount point %s: %m", mount_tmp);
872                 goto finish;
873         }
874 
875         mount_tmp_created = true;
876 
877         if (is_image)
878                 r = verity_dissect_and_mount(chased_src_fd, chased_src_path, mount_tmp, options, NULL, NULL, NULL, NULL);
879         else
880                 r = mount_follow_verbose(LOG_DEBUG, FORMAT_PROC_FD_PATH(chased_src_fd), mount_tmp, NULL, MS_BIND, NULL);
881         if (r < 0)
882                 goto finish;
883 
884         mount_tmp_mounted = true;
885 
886         /* Third, we remount the new bind mount read-only if requested. */
887         if (read_only) {
888                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, mount_tmp, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY, NULL);
889                 if (r < 0)
890                         goto finish;
891         }
892 
893         /* Fourth, we move the new bind mount into the propagation directory. This way it will appear there read-only
894          * right-away. */
895 
896         mount_outside = strjoina(propagate_path, "/XXXXXX");
897         if (is_image || S_ISDIR(st.st_mode))
898                 r = mkdtemp(mount_outside) ? 0 : -errno;
899         else {
900                 r = mkostemp_safe(mount_outside);
901                 safe_close(r);
902         }
903         if (r < 0) {
904                 log_debug_errno(r, "Cannot create propagation file or directory %s: %m", mount_outside);
905                 goto finish;
906         }
907 
908         mount_outside_created = true;
909 
910         r = mount_nofollow_verbose(LOG_DEBUG, mount_tmp, mount_outside, NULL, MS_MOVE, NULL);
911         if (r < 0)
912                 goto finish;
913 
914         mount_outside_mounted = true;
915         mount_tmp_mounted = false;
916 
917         if (is_image || S_ISDIR(st.st_mode))
918                 (void) rmdir(mount_tmp);
919         else
920                 (void) unlink(mount_tmp);
921         mount_tmp_created = false;
922 
923         (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW);
924         mount_slave_mounted = false;
925 
926         (void) rmdir(mount_slave);
927         mount_slave_created = false;
928 
929         if (pipe2(errno_pipe_fd, O_CLOEXEC|O_NONBLOCK) < 0) {
930                 log_debug_errno(errno, "Failed to create pipe: %m");
931                 goto finish;
932         }
933 
934         r = namespace_fork("(sd-bindmnt)", "(sd-bindmnt-inner)", NULL, 0, FORK_RESET_SIGNALS|FORK_DEATHSIG,
935                            pidns_fd, mntns_fd, -1, -1, root_fd, &child);
936         if (r < 0)
937                 goto finish;
938         if (r == 0) {
939                 const char *mount_inside;
940 
941                 errno_pipe_fd[0] = safe_close(errno_pipe_fd[0]);
942 
943                 if (make_file_or_directory) {
944                         if (!is_image) {
945                                 (void) mkdir_parents(dest, 0755);
946                                 (void) make_mount_point_inode_from_stat(&st, dest, 0700);
947                         } else
948                                 (void) mkdir_p(dest, 0755);
949                 }
950 
951                 /* Fifth, move the mount to the right place inside */
952                 mount_inside = strjoina(incoming_path, basename(mount_outside));
953                 r = mount_nofollow_verbose(LOG_ERR, mount_inside, dest, NULL, MS_MOVE, NULL);
954                 if (r < 0)
955                         goto child_fail;
956 
957                 _exit(EXIT_SUCCESS);
958 
959         child_fail:
960                 (void) write(errno_pipe_fd[1], &r, sizeof(r));
961                 errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
962 
963                 _exit(EXIT_FAILURE);
964         }
965 
966         errno_pipe_fd[1] = safe_close(errno_pipe_fd[1]);
967 
968         r = wait_for_terminate_and_check("(sd-bindmnt)", child, 0);
969         if (r < 0) {
970                 log_debug_errno(r, "Failed to wait for child: %m");
971                 goto finish;
972         }
973         if (r != EXIT_SUCCESS) {
974                 if (read(errno_pipe_fd[0], &r, sizeof(r)) == sizeof(r))
975                         log_debug_errno(r, "Failed to mount: %m");
976                 else
977                         log_debug("Child failed.");
978                 goto finish;
979         }
980 
981 finish:
982         if (mount_outside_mounted)
983                 (void) umount_verbose(LOG_DEBUG, mount_outside, UMOUNT_NOFOLLOW);
984         if (mount_outside_created) {
985                 if (is_image || S_ISDIR(st.st_mode))
986                         (void) rmdir(mount_outside);
987                 else
988                         (void) unlink(mount_outside);
989         }
990 
991         if (mount_tmp_mounted)
992                 (void) umount_verbose(LOG_DEBUG, mount_tmp, UMOUNT_NOFOLLOW);
993         if (mount_tmp_created) {
994                 if (is_image || S_ISDIR(st.st_mode))
995                         (void) rmdir(mount_tmp);
996                 else
997                         (void) unlink(mount_tmp);
998         }
999 
1000         if (mount_slave_mounted)
1001                 (void) umount_verbose(LOG_DEBUG, mount_slave, UMOUNT_NOFOLLOW);
1002         if (mount_slave_created)
1003                 (void) rmdir(mount_slave);
1004 
1005         return r;
1006 }
1007 
/* Bind mounts 'src' to 'dest' inside the mount namespace of process 'target'. Thin front-end for
 * mount_in_namespace() that requests a plain bind mount, i.e. passes no mount options and does not
 * treat 'src' as a disk image. All other arguments and the return value are passed through as is. */
int bind_mount_in_namespace(
                pid_t target,
                const char *propagate_path,
                const char *incoming_path,
                const char *src,
                const char *dest,
                bool read_only,
                bool make_file_or_directory) {

        return mount_in_namespace(
                        target,
                        propagate_path,
                        incoming_path,
                        src,
                        dest,
                        read_only,
                        make_file_or_directory,
                        /* options= */ NULL,
                        /* is_image= */ false);
}
1019 
/* Mounts the disk image 'src' to 'dest' inside the mount namespace of process 'target'. Thin front-end
 * for mount_in_namespace() that marks 'src' as an image and forwards 'options' to it. All other
 * arguments and the return value are passed through as is. */
int mount_image_in_namespace(
                pid_t target,
                const char *propagate_path,
                const char *incoming_path,
                const char *src,
                const char *dest,
                bool read_only,
                bool make_file_or_directory,
                const MountOptions *options) {

        return mount_in_namespace(
                        target,
                        propagate_path,
                        incoming_path,
                        src,
                        dest,
                        read_only,
                        make_file_or_directory,
                        options,
                        /* is_image= */ true);
}
1032 
make_mount_point(const char * path)1033 int make_mount_point(const char *path) {
1034         int r;
1035 
1036         assert(path);
1037 
1038         /* If 'path' is already a mount point, does nothing and returns 0. If it is not it makes it one, and returns 1. */
1039 
1040         r = path_is_mount_point(path, NULL, 0);
1041         if (r < 0)
1042                 return log_debug_errno(r, "Failed to determine whether '%s' is a mount point: %m", path);
1043         if (r > 0)
1044                 return 0;
1045 
1046         r = mount_nofollow_verbose(LOG_DEBUG, path, path, NULL, MS_BIND|MS_REC, NULL);
1047         if (r < 0)
1048                 return r;
1049 
1050         return 1;
1051 }
1052 
/* Acquires a file descriptor for a new user namespace with the ID mapping needed for an ID-mapped
 * mount. According to the original author's note this is done by forking off a child process whose only
 * purpose is to provide the new user namespace, and which is killed once we have it (that happens in
 * userns_acquire(), not here).
 *
 * The mapping always contains the line "0 <uid_shift> <uid_range>". The very same text is used for both
 * the UID and the GID map.
 *
 * Returns the user namespace fd (ownership passes to the caller) on success, a negative error on
 * failure. */
static int make_userns(uid_t uid_shift, uid_t uid_range, RemountIdmapFlags flags) {
        _cleanup_free_ char *id_map = NULL;
        _cleanup_close_ int fd = -1;

        if (asprintf(&id_map, UID_FMT " " UID_FMT " " UID_FMT "\n", 0, uid_shift, uid_range) < 0)
                return log_oom_debug();

        /* If requested, add a second line "<UID_MAPPED_ROOT> 0 1" to the mapping, so that the host root
         * user can make changes to the uidmapped mount like it normally would. This is useful since
         * nspawn wants to create various missing inodes in the OS tree before booting into it, which is
         * very easy to do if it can just do so under its own regular UID. The container's runtime
         * uidmap (i.e. the one the payload processes run in) leaves this UID unmapped, hence if files
         * owned by host root are accidentally left around in the uidmapped tree they show up as owned
         * by 'nobody', which is safe. (Such inodes should of course always be chown()ed to the
         * container's own UID range, but it's good to have a safety net in case that is forgotten.) */
        if ((flags & REMOUNT_IDMAP_HOST_ROOT) &&
            strextendf(&id_map,
                       UID_FMT " " UID_FMT " " UID_FMT "\n",
                       UID_MAPPED_ROOT, 0, 1) < 0)
                return log_oom_debug();

        /* We always assign the same UID and GID ranges */
        fd = userns_acquire(id_map, id_map);
        if (fd < 0)
                return log_debug_errno(fd, "Failed to acquire new userns: %m");

        return TAKE_FD(fd);
}
1086 
/* Replaces the mount at 'p' by a clone of itself that carries an ID mapping.
 *
 * The mount tree at 'p' is cloned with open_tree(), a user namespace with the mapping described by
 * 'uid_shift', 'uid_range' and 'flags' is created via make_userns() and attached recursively to the
 * clone with mount_setattr(). Then the original mount is unmounted and the clone is moved into its
 * place. Note that the unmount happens before the clone is attached, hence if the final move_mount()
 * fails nothing is left mounted on 'p'.
 *
 * Returns 0 on success, -EINVAL if the shift/range combination is rejected by
 * userns_shift_range_valid(), or another negative error on failure. */
int remount_idmap(
                const char *p,
                uid_t uid_shift,
                uid_t uid_range,
                RemountIdmapFlags flags) {

        _cleanup_close_ int cloned_tree_fd = -1, userns_fd = -1;
        int r;

        assert(p);

        if (!userns_shift_range_valid(uid_shift, uid_range))
                return -EINVAL;

        /* Step 1: get a detached clone of the mount point */
        cloned_tree_fd = open_tree(-1, p, OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC);
        if (cloned_tree_fd < 0)
                return log_debug_errno(errno, "Failed to open tree of mounted filesystem '%s': %m", p);

        /* Step 2: allocate the user namespace that defines the mapping */
        userns_fd = make_userns(uid_shift, uid_range, flags);
        if (userns_fd < 0)
                return userns_fd;

        /* Step 3: apply the mapping to the clone, including all of its submounts */
        struct mount_attr idmap_attr = {
                .attr_set = MOUNT_ATTR_IDMAP,
                .userns_fd = userns_fd,
        };

        if (mount_setattr(cloned_tree_fd, "", AT_EMPTY_PATH | AT_RECURSIVE,
                          &idmap_attr, sizeof(struct mount_attr)) < 0)
                return log_debug_errno(errno, "Failed to change bind mount attributes for '%s': %m", p);

        /* Step 4: get rid of the original mount ... */
        r = umount_verbose(LOG_DEBUG, p, UMOUNT_NOFOLLOW);
        if (r < 0)
                return r;

        /* Step 5: ... and attach the ID-mapped clone where it used to be */
        if (move_mount(cloned_tree_fd, "", -1, p, MOVE_MOUNT_F_EMPTY_PATH) < 0)
                return log_debug_errno(errno, "Failed to attach UID mapped mount to '%s': %m", p);

        return 0;
}
1130 
make_mount_point_inode_from_stat(const struct stat * st,const char * dest,mode_t mode)1131 int make_mount_point_inode_from_stat(const struct stat *st, const char *dest, mode_t mode) {
1132         assert(st);
1133         assert(dest);
1134 
1135         if (S_ISDIR(st->st_mode))
1136                 return mkdir_label(dest, mode);
1137         else
1138                 return mknod(dest, S_IFREG|(mode & ~0111), 0);
1139 }
1140 
/* Like make_mount_point_inode_from_stat(), but determines the type of inode to create at 'dest' by
 * calling stat() on 'source' (i.e. symlinks are followed).
 *
 * Returns -errno if 'source' cannot be stat()ed, otherwise whatever
 * make_mount_point_inode_from_stat() returns. */
int make_mount_point_inode_from_path(const char *source, const char *dest, mode_t mode) {
        struct stat source_st;

        assert(source);
        assert(dest);

        if (stat(source, &source_st) >= 0)
                return make_mount_point_inode_from_stat(&source_st, dest, mode);

        return -errno;
}
1152