1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2 
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <poll.h>
6 #include <sys/eventfd.h>
7 #include <sys/ioctl.h>
8 #include <sys/mman.h>
9 #include <sys/mount.h>
10 #include <sys/personality.h>
11 #include <sys/prctl.h>
12 #include <sys/shm.h>
13 #include <sys/types.h>
14 #include <sys/un.h>
15 #include <unistd.h>
16 #include <utmpx.h>
17 
18 #if HAVE_PAM
19 #include <security/pam_appl.h>
20 #endif
21 
22 #if HAVE_SELINUX
23 #include <selinux/selinux.h>
24 #endif
25 
26 #if HAVE_SECCOMP
27 #include <seccomp.h>
28 #endif
29 
30 #if HAVE_APPARMOR
31 #include <sys/apparmor.h>
32 #endif
33 
34 #include "sd-messages.h"
35 
36 #include "acl-util.h"
37 #include "af-list.h"
38 #include "alloc-util.h"
39 #if HAVE_APPARMOR
40 #include "apparmor-util.h"
41 #endif
42 #include "async.h"
43 #include "barrier.h"
44 #include "bpf-lsm.h"
45 #include "cap-list.h"
46 #include "capability-util.h"
47 #include "cgroup-setup.h"
48 #include "chase-symlinks.h"
49 #include "chown-recursive.h"
50 #include "cpu-set-util.h"
51 #include "creds-util.h"
52 #include "data-fd-util.h"
53 #include "def.h"
54 #include "env-file.h"
55 #include "env-util.h"
56 #include "errno-list.h"
57 #include "escape.h"
58 #include "execute.h"
59 #include "exit-status.h"
60 #include "fd-util.h"
61 #include "fileio.h"
62 #include "format-util.h"
63 #include "glob-util.h"
64 #include "hexdecoct.h"
65 #include "io-util.h"
66 #include "ioprio-util.h"
67 #include "label.h"
68 #include "log.h"
69 #include "macro.h"
70 #include "manager.h"
71 #include "manager-dump.h"
72 #include "memory-util.h"
73 #include "missing_fs.h"
74 #include "missing_ioprio.h"
75 #include "mkdir-label.h"
76 #include "mount-util.h"
77 #include "mountpoint-util.h"
78 #include "namespace.h"
79 #include "parse-util.h"
80 #include "path-util.h"
81 #include "process-util.h"
82 #include "random-util.h"
83 #include "recurse-dir.h"
84 #include "rlimit-util.h"
85 #include "rm-rf.h"
86 #if HAVE_SECCOMP
87 #include "seccomp-util.h"
88 #endif
89 #include "securebits-util.h"
90 #include "selinux-util.h"
91 #include "signal-util.h"
92 #include "smack-util.h"
93 #include "socket-util.h"
94 #include "special.h"
95 #include "stat-util.h"
96 #include "string-table.h"
97 #include "string-util.h"
98 #include "strv.h"
99 #include "syslog-util.h"
100 #include "terminal-util.h"
101 #include "tmpfile-util.h"
102 #include "umask-util.h"
103 #include "unit-serialize.h"
104 #include "user-util.h"
105 #include "utmp-wtmp.h"
106 
/* Two idle timeouts, five seconds and one second, expressed via USEC_PER_SEC.
 * NOTE(review): the code consuming these is not part of this excerpt — presumably they bound how long a
 * freshly forked child waits for the manager to become idle; confirm against the users further down. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Send buffer size (8 MiB) that connect_logger_as() requests via fd_inc_sndbuf() for the stream socket it
 * connects to the journal. */
#define SNDBUF_SIZE (8*1024*1024)
111 
/* Renumbers the passed file descriptors so that fds[k] ends up being fd number k+3, i.e. the array occupies
 * the contiguous range 3…3+n_fds-1 in array order.
 *
 * Note that the array is modified in place: every entry that had to be moved is replaced by its duplicate
 * and the original descriptor is closed.
 *
 * Returns 0 on success (also if n_fds is zero), or a negative errno-style value if fcntl(F_DUPFD) fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int count, first = 0, retry_at;

        if (n_fds == 0)
                return 0;

        assert(fds);

        count = (int) n_fds;

        do {
                retry_at = -1;

                for (int idx = first; idx < count; idx++) {
                        int want = idx + 3, dup_fd;

                        /* Nothing to do if this one already sits in its slot */
                        if (fds[idx] == want)
                                continue;

                        /* F_DUPFD hands out the lowest free descriptor >= want, which is not
                         * necessarily 'want' itself if that number is still occupied. */
                        dup_fd = fcntl(fds[idx], F_DUPFD, want);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* The slot was taken? Remember the first index where that happened, so that
                         * the next pass starts over from there. */
                        if (dup_fd != want && retry_at < 0)
                                retry_at = idx;
                }

                first = retry_at;
        } while (retry_at >= 0);

        return 0;
}
151 
/* Adjusts the file descriptor flags of the fds we are about to pass on.
 *
 * The array is expected to hold n_socket_fds socket fds first, followed by n_storage_fds further fds. For
 * the first group O_NONBLOCK is set or cleared according to 'nonblock'; for all fds FD_CLOEXEC is turned
 * off, as they are supposed to survive into the child.
 *
 * Returns 0 on success, or the negative error reported by fd_nonblock()/fd_cloexec() for the first fd that
 * could not be adjusted. */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds;
        int r;

        if (total == 0)
                return 0;

        assert(fds);

        for (size_t k = 0; k < total; k++) {
                bool is_socket = k < n_socket_fds;

                /* The O_NONBLOCK setting is only relevant for socket activation */
                r = is_socket ? fd_nonblock(fds[k], nonblock) : 0;
                if (r < 0)
                        return r;

                /* FD_CLOEXEC is dropped regardless of the fd's kind */
                r = fd_cloexec(fds[k], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
184 
exec_context_tty_path(const ExecContext * context)185 static const char *exec_context_tty_path(const ExecContext *context) {
186         assert(context);
187 
188         if (context->stdio_as_fds)
189                 return NULL;
190 
191         if (context->tty_path)
192                 return context->tty_path;
193 
194         return "/dev/console";
195 }
196 
exec_context_tty_reset(const ExecContext * context,const ExecParameters * p)197 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
198         const char *path;
199 
200         assert(context);
201 
202         path = exec_context_tty_path(context);
203 
204         if (context->tty_vhangup) {
205                 if (p && p->stdin_fd >= 0)
206                         (void) terminal_vhangup_fd(p->stdin_fd);
207                 else if (path)
208                         (void) terminal_vhangup(path);
209         }
210 
211         if (context->tty_reset) {
212                 if (p && p->stdin_fd >= 0)
213                         (void) reset_terminal_fd(p->stdin_fd, true);
214                 else if (path)
215                         (void) reset_terminal(path);
216         }
217 
218         if (p && p->stdin_fd >= 0)
219                 (void) terminal_set_size_fd(p->stdin_fd, path, context->tty_rows, context->tty_cols);
220 
221         if (context->tty_vt_disallocate && path)
222                 (void) vt_disallocate(path);
223 }
224 
/* Returns true if the input mode is one of the TTY modes (plain, forced or failing). */
static bool is_terminal_input(ExecInput i) {
        switch (i) {

        case EXEC_INPUT_TTY:
        case EXEC_INPUT_TTY_FORCE:
        case EXEC_INPUT_TTY_FAIL:
                return true;

        default:
                return false;
        }
}
231 
/* Returns true if the output mode writes to a terminal, either exclusively (TTY) or in addition to kmsg or
 * the journal (the "…_AND_CONSOLE" modes). */
static bool is_terminal_output(ExecOutput o) {
        switch (o) {

        case EXEC_OUTPUT_TTY:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                return true;

        default:
                return false;
        }
}
238 
/* Returns true if the output mode is one of the two kmsg modes (with or without console). */
static bool is_kmsg_output(ExecOutput o) {
        return o == EXEC_OUTPUT_KMSG ||
                o == EXEC_OUTPUT_KMSG_AND_CONSOLE;
}
244 
exec_context_needs_term(const ExecContext * c)245 static bool exec_context_needs_term(const ExecContext *c) {
246         assert(c);
247 
248         /* Return true if the execution context suggests we should set $TERM to something useful. */
249 
250         if (is_terminal_input(c->std_input))
251                 return true;
252 
253         if (is_terminal_output(c->std_output))
254                 return true;
255 
256         if (is_terminal_output(c->std_error))
257                 return true;
258 
259         return !!c->tty_path;
260 }
261 
/* Opens /dev/null with the specified access flags (O_NOCTTY is always added) and hands the resulting fd to
 * move_fd() so that it is installed as fd number 'nfd'.
 *
 * Returns whatever move_fd() returns, or a negative errno-style value if /dev/null cannot be opened. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags | O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
273 
connect_journal_socket(int fd,const char * log_namespace,uid_t uid,gid_t gid)274 static int connect_journal_socket(
275                 int fd,
276                 const char *log_namespace,
277                 uid_t uid,
278                 gid_t gid) {
279 
280         union sockaddr_union sa;
281         socklen_t sa_len;
282         uid_t olduid = UID_INVALID;
283         gid_t oldgid = GID_INVALID;
284         const char *j;
285         int r;
286 
287         j = log_namespace ?
288                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
289                 "/run/systemd/journal/stdout";
290         r = sockaddr_un_set_path(&sa.un, j);
291         if (r < 0)
292                 return r;
293         sa_len = r;
294 
295         if (gid_is_valid(gid)) {
296                 oldgid = getgid();
297 
298                 if (setegid(gid) < 0)
299                         return -errno;
300         }
301 
302         if (uid_is_valid(uid)) {
303                 olduid = getuid();
304 
305                 if (seteuid(uid) < 0) {
306                         r = -errno;
307                         goto restore_gid;
308                 }
309         }
310 
311         r = RET_NERRNO(connect(fd, &sa.sa, sa_len));
312 
313         /* If we fail to restore the uid or gid, things will likely
314            fail later on. This should only happen if an LSM interferes. */
315 
316         if (uid_is_valid(uid))
317                 (void) seteuid(olduid);
318 
319  restore_gid:
320         if (gid_is_valid(gid))
321                 (void) setegid(oldgid);
322 
323         return r;
324 }
325 
/* Creates a stream connection to the journal and installs it as fd number 'nfd'.
 *
 * A new AF_UNIX stream socket is connected via connect_journal_socket() (using 'uid'/'gid' as credentials if
 * valid), its reading side is shut down, a larger send buffer is requested, and then a header of seven
 * newline-terminated fields is written: the identifier (the context's syslog identifier if set, otherwise
 * 'ident'), the unit ID (or an empty string unless EXEC_PASS_LOG_UNIT is set), the syslog priority, the
 * level-prefix flag, a constant 0, and whether 'output' is a kmsg resp. a terminal output mode.
 *
 * Returns the result of move_fd() on success, a negative errno-style value otherwise. */
static int connect_logger_as(
                const Unit *unit,
                const ExecContext *context,
                const ExecParameters *params,
                ExecOutput output,
                const char *ident,
                int nfd,
                uid_t uid,
                gid_t gid) {

        _cleanup_close_ int stream_fd = -1;
        const char *identifier, *unit_id;
        int r;

        assert(context);
        assert(params);
        assert(output < _EXEC_OUTPUT_MAX);
        assert(ident);
        assert(nfd >= 0);

        stream_fd = socket(AF_UNIX, SOCK_STREAM, 0);
        if (stream_fd < 0)
                return -errno;

        r = connect_journal_socket(stream_fd, context->log_namespace, uid, gid);
        if (r < 0)
                return r;

        /* We only ever write to this socket */
        if (shutdown(stream_fd, SHUT_RD) < 0)
                return -errno;

        (void) fd_inc_sndbuf(stream_fd, SNDBUF_SIZE);

        identifier = context->syslog_identifier ?: ident;
        unit_id = params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "";

        r = dprintf(stream_fd,
                    "%s\n"
                    "%s\n"
                    "%i\n"
                    "%i\n"
                    "%i\n"
                    "%i\n"
                    "%i\n",
                    identifier,
                    unit_id,
                    context->syslog_priority,
                    !!context->syslog_level_prefix,
                    false,
                    is_kmsg_output(output),
                    is_terminal_output(output));
        if (r < 0)
                return -errno;

        return move_fd(TAKE_FD(stream_fd), nfd, false);
}
377 
/* Opens the terminal at 'path' with the specified flags (O_NOCTTY is always added) and hands the resulting
 * fd to move_fd() so that it is installed as fd number 'nfd'.
 *
 * Returns the result of move_fd(), or the negative error from open_terminal(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
390 
acquire_path(const char * path,int flags,mode_t mode)391 static int acquire_path(const char *path, int flags, mode_t mode) {
392         union sockaddr_union sa;
393         socklen_t sa_len;
394         _cleanup_close_ int fd = -1;
395         int r;
396 
397         assert(path);
398 
399         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
400                 flags |= O_CREAT;
401 
402         fd = open(path, flags|O_NOCTTY, mode);
403         if (fd >= 0)
404                 return TAKE_FD(fd);
405 
406         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
407                 return -errno;
408 
409         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
410 
411         r = sockaddr_un_set_path(&sa.un, path);
412         if (r < 0)
413                 return r == -EINVAL ? -ENXIO : r;
414         sa_len = r;
415 
416         fd = socket(AF_UNIX, SOCK_STREAM, 0);
417         if (fd < 0)
418                 return -errno;
419 
420         if (connect(fd, &sa.sa, sa_len) < 0)
421                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
422                                                            * indication that this wasn't an AF_UNIX socket after all */
423 
424         if ((flags & O_ACCMODE) == O_RDONLY)
425                 r = shutdown(fd, SHUT_WR);
426         else if ((flags & O_ACCMODE) == O_WRONLY)
427                 r = shutdown(fd, SHUT_RD);
428         else
429                 r = 0;
430         if (r < 0)
431                 return -errno;
432 
433         return TAKE_FD(fd);
434 }
435 
fixup_input(const ExecContext * context,int socket_fd,bool apply_tty_stdin)436 static int fixup_input(
437                 const ExecContext *context,
438                 int socket_fd,
439                 bool apply_tty_stdin) {
440 
441         ExecInput std_input;
442 
443         assert(context);
444 
445         std_input = context->std_input;
446 
447         if (is_terminal_input(std_input) && !apply_tty_stdin)
448                 return EXEC_INPUT_NULL;
449 
450         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
451                 return EXEC_INPUT_NULL;
452 
453         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
454                 return EXEC_INPUT_NULL;
455 
456         return std_input;
457 }
458 
/* Returns the effective output mode: socket output without an available socket fd is turned into
 * EXEC_OUTPUT_INHERIT, everything else is returned unmodified. */
static int fixup_output(ExecOutput output, int socket_fd) {
        bool socket_missing = output == EXEC_OUTPUT_SOCKET && socket_fd < 0;

        return socket_missing ? EXEC_OUTPUT_INHERIT : output;
}
466 
setup_input(const ExecContext * context,const ExecParameters * params,int socket_fd,const int named_iofds[static3])467 static int setup_input(
468                 const ExecContext *context,
469                 const ExecParameters *params,
470                 int socket_fd,
471                 const int named_iofds[static 3]) {
472 
473         ExecInput i;
474         int r;
475 
476         assert(context);
477         assert(params);
478         assert(named_iofds);
479 
480         if (params->stdin_fd >= 0) {
481                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
482                         return -errno;
483 
484                 /* Try to make this the controlling tty, if it is a tty, and reset it */
485                 if (isatty(STDIN_FILENO)) {
486                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
487                         (void) reset_terminal_fd(STDIN_FILENO, true);
488                         (void) terminal_set_size_fd(STDIN_FILENO, NULL, context->tty_rows, context->tty_cols);
489                 }
490 
491                 return STDIN_FILENO;
492         }
493 
494         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
495 
496         switch (i) {
497 
498         case EXEC_INPUT_NULL:
499                 return open_null_as(O_RDONLY, STDIN_FILENO);
500 
501         case EXEC_INPUT_TTY:
502         case EXEC_INPUT_TTY_FORCE:
503         case EXEC_INPUT_TTY_FAIL: {
504                 int fd;
505 
506                 fd = acquire_terminal(exec_context_tty_path(context),
507                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
508                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
509                                                                   ACQUIRE_TERMINAL_WAIT,
510                                       USEC_INFINITY);
511                 if (fd < 0)
512                         return fd;
513 
514                 r = terminal_set_size_fd(fd, exec_context_tty_path(context), context->tty_rows, context->tty_cols);
515                 if (r < 0)
516                         return r;
517 
518                 return move_fd(fd, STDIN_FILENO, false);
519         }
520 
521         case EXEC_INPUT_SOCKET:
522                 assert(socket_fd >= 0);
523 
524                 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
525 
526         case EXEC_INPUT_NAMED_FD:
527                 assert(named_iofds[STDIN_FILENO] >= 0);
528 
529                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
530                 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
531 
532         case EXEC_INPUT_DATA: {
533                 int fd;
534 
535                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
536                 if (fd < 0)
537                         return fd;
538 
539                 return move_fd(fd, STDIN_FILENO, false);
540         }
541 
542         case EXEC_INPUT_FILE: {
543                 bool rw;
544                 int fd;
545 
546                 assert(context->stdio_file[STDIN_FILENO]);
547 
548                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
549                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
550 
551                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
552                 if (fd < 0)
553                         return fd;
554 
555                 return move_fd(fd, STDIN_FILENO, false);
556         }
557 
558         default:
559                 assert_not_reached();
560         }
561 }
562 
/* Checks whether, for the given (already fixed up) stdout mode 'o' and stderr mode 'e', stderr can simply be
 * set up as a duplicate of the stdout fd. That is the case if stderr is set to inherit, or if both use the
 * same mode — for named fds and files additionally only if they refer to the same name resp. path. */
static bool can_inherit_stderr_from_stdout(
                const ExecContext *context,
                ExecOutput o,
                ExecOutput e) {

        assert(context);

        if (e == EXEC_OUTPUT_INHERIT)
                return true;
        if (e != o)
                return false;

        switch (e) {

        case EXEC_OUTPUT_NAMED_FD:
                return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);

        case EXEC_OUTPUT_FILE:
        case EXEC_OUTPUT_FILE_APPEND:
        case EXEC_OUTPUT_FILE_TRUNCATE:
                return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);

        default:
                return true;
        }
}
586 
setup_output(const Unit * unit,const ExecContext * context,const ExecParameters * params,int fileno,int socket_fd,const int named_iofds[static3],const char * ident,uid_t uid,gid_t gid,dev_t * journal_stream_dev,ino_t * journal_stream_ino)587 static int setup_output(
588                 const Unit *unit,
589                 const ExecContext *context,
590                 const ExecParameters *params,
591                 int fileno,
592                 int socket_fd,
593                 const int named_iofds[static 3],
594                 const char *ident,
595                 uid_t uid,
596                 gid_t gid,
597                 dev_t *journal_stream_dev,
598                 ino_t *journal_stream_ino) {
599 
600         ExecOutput o;
601         ExecInput i;
602         int r;
603 
604         assert(unit);
605         assert(context);
606         assert(params);
607         assert(ident);
608         assert(journal_stream_dev);
609         assert(journal_stream_ino);
610 
611         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
612 
613                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
614                         return -errno;
615 
616                 return STDOUT_FILENO;
617         }
618 
619         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
620                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
621                         return -errno;
622 
623                 return STDERR_FILENO;
624         }
625 
626         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
627         o = fixup_output(context->std_output, socket_fd);
628 
629         if (fileno == STDERR_FILENO) {
630                 ExecOutput e;
631                 e = fixup_output(context->std_error, socket_fd);
632 
633                 /* This expects the input and output are already set up */
634 
635                 /* Don't change the stderr file descriptor if we inherit all
636                  * the way and are not on a tty */
637                 if (e == EXEC_OUTPUT_INHERIT &&
638                     o == EXEC_OUTPUT_INHERIT &&
639                     i == EXEC_INPUT_NULL &&
640                     !is_terminal_input(context->std_input) &&
641                     getppid() != 1)
642                         return fileno;
643 
644                 /* Duplicate from stdout if possible */
645                 if (can_inherit_stderr_from_stdout(context, o, e))
646                         return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
647 
648                 o = e;
649 
650         } else if (o == EXEC_OUTPUT_INHERIT) {
651                 /* If input got downgraded, inherit the original value */
652                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
653                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
654 
655                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
656                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
657                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
658 
659                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
660                 if (getppid() != 1)
661                         return fileno;
662 
663                 /* We need to open /dev/null here anew, to get the right access mode. */
664                 return open_null_as(O_WRONLY, fileno);
665         }
666 
667         switch (o) {
668 
669         case EXEC_OUTPUT_NULL:
670                 return open_null_as(O_WRONLY, fileno);
671 
672         case EXEC_OUTPUT_TTY:
673                 if (is_terminal_input(i))
674                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
675 
676                 /* We don't reset the terminal if this is just about output */
677                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
678 
679         case EXEC_OUTPUT_KMSG:
680         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
681         case EXEC_OUTPUT_JOURNAL:
682         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
683                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
684                 if (r < 0) {
685                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
686                                                fileno == STDOUT_FILENO ? "stdout" : "stderr");
687                         r = open_null_as(O_WRONLY, fileno);
688                 } else {
689                         struct stat st;
690 
691                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
692                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
693                          * services to detect whether they are connected to the journal or not.
694                          *
695                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
696                          * about STDERR as that's usually the best way to do logging. */
697 
698                         if (fstat(fileno, &st) >= 0 &&
699                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
700                                 *journal_stream_dev = st.st_dev;
701                                 *journal_stream_ino = st.st_ino;
702                         }
703                 }
704                 return r;
705 
706         case EXEC_OUTPUT_SOCKET:
707                 assert(socket_fd >= 0);
708 
709                 return RET_NERRNO(dup2(socket_fd, fileno));
710 
711         case EXEC_OUTPUT_NAMED_FD:
712                 assert(named_iofds[fileno] >= 0);
713 
714                 (void) fd_nonblock(named_iofds[fileno], false);
715                 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
716 
717         case EXEC_OUTPUT_FILE:
718         case EXEC_OUTPUT_FILE_APPEND:
719         case EXEC_OUTPUT_FILE_TRUNCATE: {
720                 bool rw;
721                 int fd, flags;
722 
723                 assert(context->stdio_file[fileno]);
724 
725                 rw = context->std_input == EXEC_INPUT_FILE &&
726                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
727 
728                 if (rw)
729                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
730 
731                 flags = O_WRONLY;
732                 if (o == EXEC_OUTPUT_FILE_APPEND)
733                         flags |= O_APPEND;
734                 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
735                         flags |= O_TRUNC;
736 
737                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
738                 if (fd < 0)
739                         return fd;
740 
741                 return move_fd(fd, fileno, 0);
742         }
743 
744         default:
745                 assert_not_reached();
746         }
747 }
748 
chown_terminal(int fd,uid_t uid)749 static int chown_terminal(int fd, uid_t uid) {
750         int r;
751 
752         assert(fd >= 0);
753 
754         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
755         if (isatty(fd) < 1) {
756                 if (IN_SET(errno, EINVAL, ENOTTY))
757                         return 0; /* not a tty */
758 
759                 return -errno;
760         }
761 
762         /* This might fail. What matters are the results. */
763         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
764         if (r < 0)
765                 return r;
766 
767         return 1;
768 }
769 
setup_confirm_stdio(const ExecContext * context,const char * vc,int * ret_saved_stdin,int * ret_saved_stdout)770 static int setup_confirm_stdio(
771                 const ExecContext *context,
772                 const char *vc,
773                 int *ret_saved_stdin,
774                 int *ret_saved_stdout) {
775 
776         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
777         int r;
778 
779         assert(ret_saved_stdin);
780         assert(ret_saved_stdout);
781 
782         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
783         if (saved_stdin < 0)
784                 return -errno;
785 
786         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
787         if (saved_stdout < 0)
788                 return -errno;
789 
790         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
791         if (fd < 0)
792                 return fd;
793 
794         r = chown_terminal(fd, getuid());
795         if (r < 0)
796                 return r;
797 
798         r = reset_terminal_fd(fd, true);
799         if (r < 0)
800                 return r;
801 
802         r = terminal_set_size_fd(fd, vc, context->tty_rows, context->tty_cols);
803         if (r < 0)
804                 return r;
805 
806         r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
807         TAKE_FD(fd);
808         if (r < 0)
809                 return r;
810 
811         *ret_saved_stdin = TAKE_FD(saved_stdin);
812         *ret_saved_stdout = TAKE_FD(saved_stdout);
813         return 0;
814 }
815 
write_confirm_error_fd(int err,int fd,const Unit * u)816 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
817         assert(err < 0);
818 
819         if (err == -ETIMEDOUT)
820                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
821         else {
822                 errno = -err;
823                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
824         }
825 }
826 
write_confirm_error(int err,const char * vc,const Unit * u)827 static void write_confirm_error(int err, const char *vc, const Unit *u) {
828         _cleanup_close_ int fd = -1;
829 
830         assert(vc);
831 
832         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
833         if (fd < 0)
834                 return;
835 
836         write_confirm_error_fd(err, fd, u);
837 }
838 
/* Counterpart of setup_confirm_stdio(): releases the terminal and puts the saved stdin/stdout fds back in
 * place (each only if valid). Both saved fds are closed and the variables they are stored in invalidated,
 * regardless of whether restoring worked.
 *
 * Returns 0 on success; if restoring fails, the negative errno-style value of the last dup2() that
 * failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
860 
/* Verdicts produced by ask_for_confirmation(). */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* Chosen via 'f': don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* Going by the name and the help text for 's': don't execute, pretend success */
        CONFIRM_EXECUTE = 1,          /* Execute the command; also what internal errors while asking result in */
};
866 
/* Interactively asks on the console 'vc' whether 'cmdline' shall be executed on behalf of unit 'u'.
 *
 * Returns CONFIRM_EXECUTE, CONFIRM_PRETEND_SUCCESS or CONFIRM_PRETEND_FAILURE. Internal errors (console
 * setup, out of memory, failure to read the reply) are reported where possible and then treated as a
 * positive answer, i.e. CONFIRM_EXECUTE is returned. */
static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
        int saved_stdout = -1, saved_stdin = -1, r;
        _cleanup_free_ char *e = NULL;
        char c;

        /* For any internal errors, assume a positive response. */
        r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
        if (r < 0) {
                write_confirm_error(r, vc, u);
                return CONFIRM_EXECUTE;
        }

        /* confirm_spawn might have been disabled while we were sleeping. */
        if (manager_is_confirm_spawn_disabled(u->manager)) {
                r = CONFIRM_EXECUTE;
                goto restore_stdio;
        }

        /* Shorten the command line so that the prompt stays readable. */
        e = ellipsize(cmdline, 60, 100);
        if (!e) {
                log_oom();
                r = CONFIRM_EXECUTE;
                goto restore_stdio;
        }

        for (;;) {
                r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
                if (r < 0) {
                        write_confirm_error_fd(r, STDOUT_FILENO, u);
                        r = CONFIRM_EXECUTE;
                        goto restore_stdio;
                }

                switch (c) {
                case 'c':
                        printf("Resuming normal execution.\n");
                        manager_disable_confirm_spawn();
                        r = CONFIRM_EXECUTE;
                        break;
                case 'D':
                        unit_dump(u, stdout, "  ");
                        continue; /* ask again */
                case 'f':
                        printf("Failing execution.\n");
                        r = CONFIRM_PRETEND_FAILURE;
                        break;
                case 'h':
                        printf("  c - continue, proceed without asking anymore\n"
                               "  D - dump, show the state of the unit\n"
                               "  f - fail, don't execute the command and pretend it failed\n"
                               "  h - help\n"
                               "  i - info, show a short summary of the unit\n"
                               "  j - jobs, show jobs that are in progress\n"
                               "  s - skip, don't execute the command and pretend it succeeded\n"
                               "  y - yes, execute the command\n");
                        continue; /* ask again */
                case 'i':
                        /* The arguments must follow the order of the labels: description first, then the
                         * unit id. Fall back to the id if no description is set, so that we never pass
                         * NULL to %s. */
                        printf("  Description: %s\n"
                               "  Unit:        %s\n"
                               "  Command:     %s\n",
                               u->description ? u->description : u->id, u->id, cmdline);
                        continue; /* ask again */
                case 'j':
                        manager_dump_jobs(u->manager, stdout, "  ");
                        continue; /* ask again */
                case 'n':
                        /* 'n' was removed in favor of 'f'. */
                        printf("Didn't understand 'n', did you mean 'f'?\n");
                        continue; /* ask again */
                case 's':
                        printf("Skipping execution.\n");
                        r = CONFIRM_PRETEND_SUCCESS;
                        break;
                case 'y':
                        r = CONFIRM_EXECUTE;
                        break;
                default:
                        assert_not_reached();
                }
                break;
        }

restore_stdio:
        restore_confirm_stdio(&saved_stdin, &saved_stdout);
        return r;
}
953 
get_fixed_user(const ExecContext * c,const char ** user,uid_t * uid,gid_t * gid,const char ** home,const char ** shell)954 static int get_fixed_user(const ExecContext *c, const char **user,
955                           uid_t *uid, gid_t *gid,
956                           const char **home, const char **shell) {
957         int r;
958         const char *name;
959 
960         assert(c);
961 
962         if (!c->user)
963                 return 0;
964 
965         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
966          * (i.e. are "/" or "/bin/nologin"). */
967 
968         name = c->user;
969         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
970         if (r < 0)
971                 return r;
972 
973         *user = name;
974         return 0;
975 }
976 
get_fixed_group(const ExecContext * c,const char ** group,gid_t * gid)977 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
978         int r;
979         const char *name;
980 
981         assert(c);
982 
983         if (!c->group)
984                 return 0;
985 
986         name = c->group;
987         r = get_group_creds(&name, gid, 0);
988         if (r < 0)
989                 return r;
990 
991         *group = name;
992         return 0;
993 }
994 
get_supplementary_groups(const ExecContext * c,const char * user,const char * group,gid_t gid,gid_t ** supplementary_gids,int * ngids)995 static int get_supplementary_groups(const ExecContext *c, const char *user,
996                                     const char *group, gid_t gid,
997                                     gid_t **supplementary_gids, int *ngids) {
998         int r, k = 0;
999         int ngroups_max;
1000         bool keep_groups = false;
1001         gid_t *groups = NULL;
1002         _cleanup_free_ gid_t *l_gids = NULL;
1003 
1004         assert(c);
1005 
1006         /*
1007          * If user is given, then lookup GID and supplementary groups list.
1008          * We avoid NSS lookups for gid=0. Also we have to initialize groups
1009          * here and as early as possible so we keep the list of supplementary
1010          * groups of the caller.
1011          */
1012         if (user && gid_is_valid(gid) && gid != 0) {
1013                 /* First step, initialize groups from /etc/groups */
1014                 if (initgroups(user, gid) < 0)
1015                         return -errno;
1016 
1017                 keep_groups = true;
1018         }
1019 
1020         if (strv_isempty(c->supplementary_groups))
1021                 return 0;
1022 
1023         /*
1024          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1025          * be positive, otherwise fail.
1026          */
1027         errno = 0;
1028         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1029         if (ngroups_max <= 0)
1030                 return errno_or_else(EOPNOTSUPP);
1031 
1032         l_gids = new(gid_t, ngroups_max);
1033         if (!l_gids)
1034                 return -ENOMEM;
1035 
1036         if (keep_groups) {
1037                 /*
1038                  * Lookup the list of groups that the user belongs to, we
1039                  * avoid NSS lookups here too for gid=0.
1040                  */
1041                 k = ngroups_max;
1042                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1043                         return -EINVAL;
1044         } else
1045                 k = 0;
1046 
1047         STRV_FOREACH(i, c->supplementary_groups) {
1048                 const char *g;
1049 
1050                 if (k >= ngroups_max)
1051                         return -E2BIG;
1052 
1053                 g = *i;
1054                 r = get_group_creds(&g, l_gids+k, 0);
1055                 if (r < 0)
1056                         return r;
1057 
1058                 k++;
1059         }
1060 
1061         /*
1062          * Sets ngids to zero to drop all supplementary groups, happens
1063          * when we are under root and SupplementaryGroups= is empty.
1064          */
1065         if (k == 0) {
1066                 *ngids = 0;
1067                 return 0;
1068         }
1069 
1070         /* Otherwise get the final list of supplementary groups */
1071         groups = memdup(l_gids, sizeof(gid_t) * k);
1072         if (!groups)
1073                 return -ENOMEM;
1074 
1075         *supplementary_gids = groups;
1076         *ngids = k;
1077 
1078         groups = NULL;
1079 
1080         return 0;
1081 }
1082 
/* Applies the group credentials to the calling process: first the supplementary groups (only if ngids is
 * positive), then real, effective and saved GID (only if gid is valid).
 *
 * Returns 0 on success, the negative error of maybe_setgroups(), or -errno if setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1101 
/* Updates the securebits of the calling process: all bits in 'mask' are cleared, then 'bits' are set.
 *
 * Returns 0 if the securebits already had the requested value (nothing is written in that case), 1 if
 * they were changed, or -errno if reading or writing them via prctl() failed. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        after = (before & ~mask) | bits;
        if (after == before)
                return 0;

        return prctl(PR_SET_SECUREBITS, after) < 0 ? -errno : 1;
}
1115 
enforce_user(const ExecContext * context,uid_t uid)1116 static int enforce_user(const ExecContext *context, uid_t uid) {
1117         assert(context);
1118         int r;
1119 
1120         if (!uid_is_valid(uid))
1121                 return 0;
1122 
1123         /* Sets (but doesn't look up) the uid and make sure we keep the
1124          * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1125          * required, so we also need keep-caps in this case.
1126          */
1127 
1128         if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
1129 
1130                 /* First step: If we need to keep capabilities but
1131                  * drop privileges we need to make sure we keep our
1132                  * caps, while we drop privileges. */
1133                 if (uid != 0) {
1134                         /* Add KEEP_CAPS to the securebits */
1135                         r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1136                         if (r < 0)
1137                                 return r;
1138                 }
1139         }
1140 
1141         /* Second step: actually set the uids */
1142         if (setresuid(uid, uid, uid) < 0)
1143                 return -errno;
1144 
1145         /* At this point we should have all necessary capabilities but
1146            are otherwise a normal user. However, the caps might got
1147            corrupted due to the setresuid() so we need clean them up
1148            later. This is done outside of this call. */
1149 
1150         return 0;
1151 }
1152 
1153 #if HAVE_PAM
1154 
null_conv(int num_msg,const struct pam_message ** msg,struct pam_response ** resp,void * appdata_ptr)1155 static int null_conv(
1156                 int num_msg,
1157                 const struct pam_message **msg,
1158                 struct pam_response **resp,
1159                 void *appdata_ptr) {
1160 
1161         /* We don't support conversations */
1162 
1163         return PAM_CONV_ERR;
1164 }
1165 
1166 #endif
1167 
/* Opens a PAM session for service 'name' and user 'user' and forks off a "(sd-pam)" helper process that
 * closes the session again later on.
 *
 * 'tty' may be NULL, in which case the TTY name is read off stdin if stdin is a TTY. The entries of *env
 * are passed into PAM; on success *env is replaced by the environment list PAM returns. 'fds'/'n_fds' are
 * the descriptors that get closed in the helper process. 'uid'/'gid' are the credentials the helper
 * process drops to.
 *
 * Returns the result of strv_free_and_replace() on success, a negative errno-style value on failure
 * (-EPERM for all PAM-level errors), and always 0 when built without PAM support. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless debug logging is enabled */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                /* Make sure the failure path neither closes nor ends a transaction that never started */
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Pass the environment prepared so far into the PAM transaction */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is deliberately non-fatal */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        /* NOTE(review): if the fork fails we jump to 'fail' without restoring old_ss, i.e. SIGTERM stays
         * blocked on that path. Presumably harmless if the caller gives up on failure; confirm. */
        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* NOTE(review): POSIX specifies that sigwait() returns a positive error number on
                         * failure rather than -1 with errno set, so the '< 0' branch below looks
                         * unreachable; verify against sigwait(3) before relying on it. */
                        for (;;) {
                                if (sigwait(&ss, &sig) < 0) {
                                        if (errno == EINTR)
                                                continue;

                                        goto child_finish;
                                }

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment obtained from PAM over to the caller */
        return strv_free_and_replace(*env, e);

fail:
        /* Either a PAM call failed (pam_code set) or a non-PAM step failed (r set) */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1384 
/* Renames the current process to "(NAME)", where NAME is made of at most the last 8 characters of the
 * basename of 'path'. If the basename is empty, "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        /* '(' + up to 8 characters + ')' + NUL. The result must fit in 10 chars (i.e. the length of
         * "/sbin/init") to look pretty in /bin/ps */
        char process_name[11];
        const char *tail;
        size_t n;

        tail = basename(path);
        if (isempty(tail)) {
                rename_process("(...)");
                return;
        }

        n = strlen(tail);
        if (n > 8) {
                /* The end of the process name is usually more interesting, since the first bit might just
                 * be "systemd-" */
                tail += n - 8;
                n = 8;
        }

        process_name[0] = '(';
        memcpy(process_name + 1, tail, n);
        process_name[n + 1] = ')';
        process_name[n + 2] = '\0';

        rename_process(process_name);
}
1415 
context_has_address_families(const ExecContext * c)1416 static bool context_has_address_families(const ExecContext *c) {
1417         assert(c);
1418 
1419         return c->address_families_allow_list ||
1420                 !set_isempty(c->address_families);
1421 }
1422 
context_has_syscall_filters(const ExecContext * c)1423 static bool context_has_syscall_filters(const ExecContext *c) {
1424         assert(c);
1425 
1426         return c->syscall_allow_list ||
1427                 !hashmap_isempty(c->syscall_filter);
1428 }
1429 
context_has_syscall_logs(const ExecContext * c)1430 static bool context_has_syscall_logs(const ExecContext *c) {
1431         assert(c);
1432 
1433         return c->syscall_log_allow_list ||
1434                 !hashmap_isempty(c->syscall_log);
1435 }
1436 
context_has_no_new_privileges(const ExecContext * c)1437 static bool context_has_no_new_privileges(const ExecContext *c) {
1438         assert(c);
1439 
1440         if (c->no_new_privileges)
1441                 return true;
1442 
1443         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1444                 return false;
1445 
1446         /* We need NNP if we have any form of seccomp and are unprivileged */
1447         return c->lock_personality ||
1448                 c->memory_deny_write_execute ||
1449                 c->private_devices ||
1450                 c->protect_clock ||
1451                 c->protect_hostname ||
1452                 c->protect_kernel_tunables ||
1453                 c->protect_kernel_modules ||
1454                 c->protect_kernel_logs ||
1455                 context_has_address_families(c) ||
1456                 exec_context_restrict_namespaces_set(c) ||
1457                 c->restrict_realtime ||
1458                 c->restrict_suid_sgid ||
1459                 !set_isempty(c->syscall_archs) ||
1460                 context_has_syscall_filters(c) ||
1461                 context_has_syscall_logs(c);
1462 }
1463 
exec_context_has_credentials(const ExecContext * context)1464 static bool exec_context_has_credentials(const ExecContext *context) {
1465 
1466         assert(context);
1467 
1468         return !hashmap_isempty(context->set_credentials) ||
1469                 !hashmap_isempty(context->load_credentials);
1470 }
1471 
1472 #if HAVE_SECCOMP
1473 
/* Returns true if seccomp is not available, in which case a debug message naming the skipped setting
 * 'msg' is logged for unit 'u'. Returns false, without logging, if seccomp is available. */
static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
        bool unavailable = !is_seccomp_available();

        if (unavailable)
                log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);

        return unavailable;
}
1482 
apply_syscall_filter(const Unit * u,const ExecContext * c,bool needs_ambient_hack)1483 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1484         uint32_t negative_action, default_action, action;
1485         int r;
1486 
1487         assert(u);
1488         assert(c);
1489 
1490         if (!context_has_syscall_filters(c))
1491                 return 0;
1492 
1493         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1494                 return 0;
1495 
1496         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1497 
1498         if (c->syscall_allow_list) {
1499                 default_action = negative_action;
1500                 action = SCMP_ACT_ALLOW;
1501         } else {
1502                 default_action = SCMP_ACT_ALLOW;
1503                 action = negative_action;
1504         }
1505 
1506         if (needs_ambient_hack) {
1507                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1508                 if (r < 0)
1509                         return r;
1510         }
1511 
1512         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1513 }
1514 
/* Installs the seccomp filter configured through SystemCallLog=.
 *
 * Returns 0 without doing anything if no log setting is configured, if seccomp is unavailable, or if the
 * build lacks SCMP_ACT_LOG. Otherwise returns the result of loading the filter. */
static int apply_syscall_log(const Unit* u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!context_has_syscall_logs(c))
                return 0;

#ifdef SCMP_ACT_LOG
        if (skip_seccomp_unavailable(u, "SystemCallLog="))
                return 0;

        /* Allow-list: log nothing but the ones listed. Otherwise: log everything but the ones listed. */
        uint32_t fallback = c->syscall_log_allow_list ? SCMP_ACT_ALLOW : SCMP_ACT_LOG;
        uint32_t listed = c->syscall_log_allow_list ? SCMP_ACT_LOG : SCMP_ACT_ALLOW;

        return seccomp_load_syscall_filter_set_raw(fallback, c->syscall_log, listed, false);
#else
        /* old libseccomp */
        log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
        return 0;
#endif
}
1547 
apply_syscall_archs(const Unit * u,const ExecContext * c)1548 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1549         assert(u);
1550         assert(c);
1551 
1552         if (set_isempty(c->syscall_archs))
1553                 return 0;
1554 
1555         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1556                 return 0;
1557 
1558         return seccomp_restrict_archs(c->syscall_archs);
1559 }
1560 
apply_address_families(const Unit * u,const ExecContext * c)1561 static int apply_address_families(const Unit* u, const ExecContext *c) {
1562         assert(u);
1563         assert(c);
1564 
1565         if (!context_has_address_families(c))
1566                 return 0;
1567 
1568         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1569                 return 0;
1570 
1571         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1572 }
1573 
apply_memory_deny_write_execute(const Unit * u,const ExecContext * c)1574 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1575         assert(u);
1576         assert(c);
1577 
1578         if (!c->memory_deny_write_execute)
1579                 return 0;
1580 
1581         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1582                 return 0;
1583 
1584         return seccomp_memory_deny_write_execute();
1585 }
1586 
apply_restrict_realtime(const Unit * u,const ExecContext * c)1587 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1588         assert(u);
1589         assert(c);
1590 
1591         if (!c->restrict_realtime)
1592                 return 0;
1593 
1594         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1595                 return 0;
1596 
1597         return seccomp_restrict_realtime();
1598 }
1599 
apply_restrict_suid_sgid(const Unit * u,const ExecContext * c)1600 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1601         assert(u);
1602         assert(c);
1603 
1604         if (!c->restrict_suid_sgid)
1605                 return 0;
1606 
1607         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1608                 return 0;
1609 
1610         return seccomp_restrict_suid_sgid();
1611 }
1612 
apply_protect_sysctl(const Unit * u,const ExecContext * c)1613 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1614         assert(u);
1615         assert(c);
1616 
1617         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1618          * let's protect even those systems where this is left on in the kernel. */
1619 
1620         if (!c->protect_kernel_tunables)
1621                 return 0;
1622 
1623         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1624                 return 0;
1625 
1626         return seccomp_protect_sysctl();
1627 }
1628 
apply_protect_kernel_modules(const Unit * u,const ExecContext * c)1629 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1630         assert(u);
1631         assert(c);
1632 
1633         /* Turn off module syscalls on ProtectKernelModules=yes */
1634 
1635         if (!c->protect_kernel_modules)
1636                 return 0;
1637 
1638         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1639                 return 0;
1640 
1641         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1642 }
1643 
apply_protect_kernel_logs(const Unit * u,const ExecContext * c)1644 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1645         assert(u);
1646         assert(c);
1647 
1648         if (!c->protect_kernel_logs)
1649                 return 0;
1650 
1651         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1652                 return 0;
1653 
1654         return seccomp_protect_syslog();
1655 }
1656 
apply_protect_clock(const Unit * u,const ExecContext * c)1657 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1658         assert(u);
1659         assert(c);
1660 
1661         if (!c->protect_clock)
1662                 return 0;
1663 
1664         if (skip_seccomp_unavailable(u, "ProtectClock="))
1665                 return 0;
1666 
1667         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1668 }
1669 
apply_private_devices(const Unit * u,const ExecContext * c)1670 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1671         assert(u);
1672         assert(c);
1673 
1674         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1675 
1676         if (!c->private_devices)
1677                 return 0;
1678 
1679         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1680                 return 0;
1681 
1682         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1683 }
1684 
apply_restrict_namespaces(const Unit * u,const ExecContext * c)1685 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1686         assert(u);
1687         assert(c);
1688 
1689         if (!exec_context_restrict_namespaces_set(c))
1690                 return 0;
1691 
1692         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1693                 return 0;
1694 
1695         return seccomp_restrict_namespaces(c->restrict_namespaces);
1696 }
1697 
apply_lock_personality(const Unit * u,const ExecContext * c)1698 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1699         unsigned long personality;
1700         int r;
1701 
1702         assert(u);
1703         assert(c);
1704 
1705         if (!c->lock_personality)
1706                 return 0;
1707 
1708         if (skip_seccomp_unavailable(u, "LockPersonality="))
1709                 return 0;
1710 
1711         personality = c->personality;
1712 
1713         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1714         if (personality == PERSONALITY_INVALID) {
1715 
1716                 r = opinionated_personality(&personality);
1717                 if (r < 0)
1718                         return r;
1719         }
1720 
1721         return seccomp_lock_personality(personality);
1722 }
1723 
1724 #endif
1725 
1726 #if HAVE_LIBBPF
/* Applies RestrictFileSystems=: if a restriction is configured and the manager has its restrict_fs
 * object, passes the configured file system set and allow-list flag on to
 * lsm_bpf_unit_restrict_filesystems(). If the object is missing the setting is skipped with a debug
 * message. Returns 0 when nothing was done, otherwise the result of the LSM BPF call. */
static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!exec_context_restrict_filesystems_set(c))
                return 0;

        if (u->manager->restrict_fs)
                return lsm_bpf_unit_restrict_filesystems(
                                u,
                                c->restrict_filesystems,
                                c->restrict_filesystems_allow_list);

        /* LSM BPF is unsupported or lsm_bpf_setup failed */
        log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
        return 0;
}
1742 #endif
1743 
apply_protect_hostname(const Unit * u,const ExecContext * c,int * ret_exit_status)1744 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1745         assert(u);
1746         assert(c);
1747 
1748         if (!c->protect_hostname)
1749                 return 0;
1750 
1751         if (ns_type_supported(NAMESPACE_UTS)) {
1752                 if (unshare(CLONE_NEWUTS) < 0) {
1753                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1754                                 *ret_exit_status = EXIT_NAMESPACE;
1755                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1756                         }
1757 
1758                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1759                 }
1760         } else
1761                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1762 
1763 #if HAVE_SECCOMP
1764         int r;
1765 
1766         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1767                 return 0;
1768 
1769         r = seccomp_protect_hostname();
1770         if (r < 0) {
1771                 *ret_exit_status = EXIT_SECCOMP;
1772                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1773         }
1774 #endif
1775 
1776         return 0;
1777 }
1778 
do_idle_pipe_dance(int idle_pipe[static4])1779 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1780         assert(idle_pipe);
1781 
1782         idle_pipe[1] = safe_close(idle_pipe[1]);
1783         idle_pipe[2] = safe_close(idle_pipe[2]);
1784 
1785         if (idle_pipe[0] >= 0) {
1786                 int r;
1787 
1788                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1789 
1790                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1791                         ssize_t n;
1792 
1793                         /* Signal systemd that we are bored and want to continue. */
1794                         n = write(idle_pipe[3], "x", 1);
1795                         if (n > 0)
1796                                 /* Wait for systemd to react to the signal above. */
1797                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1798                 }
1799 
1800                 idle_pipe[0] = safe_close(idle_pipe[0]);
1801 
1802         }
1803 
1804         idle_pipe[3] = safe_close(idle_pipe[3]);
1805 }
1806 
1807 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1808 
/* Builds the environment variables that are generated for a process about to be executed (file
 * descriptor passing, watchdog, user identity, terminal, journal stream, per-unit directories,
 * credentials, ...). Each variable is only added when the corresponding input is set.
 *
 * On success returns 0 and stores a newly allocated, NULL-terminated string array in *ret, which the
 * caller then owns. Returns -ENOMEM if any allocation fails; *ret is not written in that case. */
static int build_environment(
                const Unit *u,
                const ExecContext *c,
                const ExecParameters *p,
                size_t n_fds,
                const char *home,
                const char *username,
                const char *shell,
                dev_t journal_stream_dev,
                ino_t journal_stream_ino,
                char ***ret) {

        _cleanup_strv_free_ char **our_env = NULL;
        size_t n_env = 0;
        char *x;

        assert(u);
        assert(c);
        assert(p);
        assert(ret);

        /* Upper bound for the entries written below outside of the per-directory-type loop: 16 variables
         * plus the terminating NULL. The directory loop adds at most one entry per ExecDirectoryType.
         * Keep this in sync when adding a new variable. */
#define N_ENV_VARS 17
        our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
        if (!our_env)
                return -ENOMEM;

        /* File descriptors are passed to the process: export our PID, their number and their names. */
        if (n_fds > 0) {
                _cleanup_free_ char *joined = NULL;

                if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                joined = strv_join(p->fd_names, ":");
                if (!joined)
                        return -ENOMEM;

                x = strjoin("LISTEN_FDNAMES=", joined);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* Watchdog variables are only exported if requested via the flag and a non-zero timeout is set. */
        if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
                if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
         * Varlink calls back to us to look up dynamic users in PID 1. Break the deadlock between D-Bus and
         * PID 1 by disabling use of PID 1's NSS interface for looking up dynamic users. */
        if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
                x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if (home) {
                x = strjoin("HOME=", home);
                if (!x)
                        return -ENOMEM;

                /* Simplify only the value, i.e. skip the 5 characters of "HOME=" */
                path_simplify(x + 5);
                our_env[n_env++] = x;
        }

        /* The user name is exported under both conventional variable names. */
        if (username) {
                x = strjoin("LOGNAME=", username);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;

                x = strjoin("USER=", username);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if (shell) {
                x = strjoin("SHELL=", shell);
                if (!x)
                        return -ENOMEM;

                /* Simplify only the value, i.e. skip the 6 characters of "SHELL=" */
                path_simplify(x + 6);
                our_env[n_env++] = x;
        }

        if (!sd_id128_is_null(u->invocation_id)) {
                if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        if (exec_context_needs_term(c)) {
                const char *tty_path, *term = NULL;

                tty_path = exec_context_tty_path(c);

                /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
                 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
                 * container manager passes to PID 1 ends up all the way in the console login shown. */

                if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
                        term = getenv("TERM");

                /* Otherwise (or if PID 1 has no $TERM) fall back to the default for this TTY. */
                if (!term)
                        term = default_term_for_tty(tty_path);

                x = strjoin("TERM=", term);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* Only exported if both the device and the inode number of the journal stream are known. */
        if (journal_stream_dev != 0 && journal_stream_ino != 0) {
                if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        if (c->log_namespace) {
                x = strjoin("LOG_NAMESPACE=", c->log_namespace);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* One variable per directory type that has a prefix, at least one configured item and an
         * environment variable name: its value is the ":"-separated list of the prefixed item paths. */
        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
                _cleanup_free_ char *joined = NULL;
                const char *n;

                if (!p->prefix[t])
                        continue;

                if (c->directories[t].n_items == 0)
                        continue;

                n = exec_directory_env_name_to_string(t);
                if (!n)
                        continue;

                for (size_t i = 0; i < c->directories[t].n_items; i++) {
                        _cleanup_free_ char *prefixed = NULL;

                        prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
                        if (!prefixed)
                                return -ENOMEM;

                        if (!strextend_with_separator(&joined, ":", prefixed))
                                return -ENOMEM;
                }

                x = strjoin(n, "=", joined);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* The credentials directory is located below the runtime directory prefix, named after the unit. */
        if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
                x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* Always exported, regardless of configuration. */
        if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
                return -ENOMEM;

        our_env[n_env++] = x;

        /* Terminate the array and verify that the size computed above was sufficient. */
        our_env[n_env++] = NULL;
        assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
#undef N_ENV_VARS

        *ret = TAKE_PTR(our_env);

        return 0;
}
2002 
build_pass_environment(const ExecContext * c,char *** ret)2003 static int build_pass_environment(const ExecContext *c, char ***ret) {
2004         _cleanup_strv_free_ char **pass_env = NULL;
2005         size_t n_env = 0;
2006 
2007         STRV_FOREACH(i, c->pass_environment) {
2008                 _cleanup_free_ char *x = NULL;
2009                 char *v;
2010 
2011                 v = getenv(*i);
2012                 if (!v)
2013                         continue;
2014                 x = strjoin(*i, "=", v);
2015                 if (!x)
2016                         return -ENOMEM;
2017 
2018                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2019                         return -ENOMEM;
2020 
2021                 pass_env[n_env++] = TAKE_PTR(x);
2022                 pass_env[n_env] = NULL;
2023         }
2024 
2025         *ret = TAKE_PTR(pass_env);
2026 
2027         return 0;
2028 }
2029 
exec_needs_mount_namespace(const ExecContext * context,const ExecParameters * params,const ExecRuntime * runtime)2030 bool exec_needs_mount_namespace(
2031                 const ExecContext *context,
2032                 const ExecParameters *params,
2033                 const ExecRuntime *runtime) {
2034 
2035         assert(context);
2036 
2037         if (context->root_image)
2038                 return true;
2039 
2040         if (!strv_isempty(context->read_write_paths) ||
2041             !strv_isempty(context->read_only_paths) ||
2042             !strv_isempty(context->inaccessible_paths) ||
2043             !strv_isempty(context->exec_paths) ||
2044             !strv_isempty(context->no_exec_paths))
2045                 return true;
2046 
2047         if (context->n_bind_mounts > 0)
2048                 return true;
2049 
2050         if (context->n_temporary_filesystems > 0)
2051                 return true;
2052 
2053         if (context->n_mount_images > 0)
2054                 return true;
2055 
2056         if (context->n_extension_images > 0)
2057                 return true;
2058 
2059         if (!strv_isempty(context->extension_directories))
2060                 return true;
2061 
2062         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
2063                 return true;
2064 
2065         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2066                 return true;
2067 
2068         if (context->private_devices ||
2069             context->private_mounts ||
2070             context->protect_system != PROTECT_SYSTEM_NO ||
2071             context->protect_home != PROTECT_HOME_NO ||
2072             context->protect_kernel_tunables ||
2073             context->protect_kernel_modules ||
2074             context->protect_kernel_logs ||
2075             context->protect_control_groups ||
2076             context->protect_proc != PROTECT_PROC_DEFAULT ||
2077             context->proc_subset != PROC_SUBSET_ALL ||
2078             context->private_ipc ||
2079             context->ipc_namespace_path)
2080                 return true;
2081 
2082         if (context->root_directory) {
2083                 if (exec_context_get_effective_mount_apivfs(context))
2084                         return true;
2085 
2086                 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2087                         if (params && !params->prefix[t])
2088                                 continue;
2089 
2090                         if (context->directories[t].n_items > 0)
2091                                 return true;
2092                 }
2093         }
2094 
2095         if (context->dynamic_user &&
2096             (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2097              context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2098              context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2099                 return true;
2100 
2101         if (context->log_namespace)
2102                 return true;
2103 
2104         return false;
2105 }
2106 
/* Moves the calling process into a new user namespace and has its UID/GID maps written by a
 * short-lived helper child that stays in the original namespace (see the comment below for the
 * rationale).
 *
 * ouid/ogid are always mapped to themselves. uid/gid are additionally mapped to themselves if they are
 * valid, differ from ouid/ogid, and CAP_SETUID/CAP_SETGID respectively is in our effective set.
 *
 * Returns 0 on success and a negative errno-style value on failure: either a local error, the error
 * code forwarded by the helper, or -EIO if the helper behaved unexpectedly. */
static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
        _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
        _cleanup_close_ int unshare_ready_fd = -1;
        _cleanup_(sigkill_waitp) pid_t pid = 0;
        uint64_t c = 1;
        ssize_t n;
        int r;

        /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
         * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
         * which waits for the parent to create the new user namespace while staying in the original namespace. The
         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
         * continues execution normally.
         * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
         * does not need CAP_SETUID to write the single line mapping to itself. */

        /* Can only set up multiple mappings with CAP_SETUID. */
        if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
                             ouid, ouid, uid, uid);
        else
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
                             ouid, ouid);

        if (r < 0)
                return -ENOMEM;

        /* Can only set up multiple mappings with CAP_SETGID. */
        if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
                             ogid, ogid, gid, gid);
        else
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n",    /* Map $OGID → $OGID */
                             ogid, ogid);

        if (r < 0)
                return -ENOMEM;

        /* Create a communication channel so that the parent can tell the child when it finished creating the user
         * namespace. */
        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
        if (unshare_ready_fd < 0)
                return -errno;

        /* Create a communication channel so that the child can tell the parent a proper error code in case it
         * failed. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return -errno;

        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
        if (r < 0)
                return r;
        if (r == 0) {
                _cleanup_close_ int fd = -1;
                const char *a;
                pid_t ppid;

                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
                 * here, after the parent opened its own user namespace. */

                ppid = getppid();
                errno_pipe[0] = safe_close(errno_pipe[0]);

                /* Wait until the parent unshared the user namespace */
                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Disable the setgroups() system call in the child user namespace, for good. */
                a = procfs_file_alloca(ppid, "setgroups");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        if (errno != ENOENT) {
                                r = -errno;
                                goto child_fail;
                        }

                        /* If the file is missing the kernel is too old, let's continue anyway. */
                } else {
                        if (write(fd, "deny\n", 5) < 0) {
                                r = -errno;
                                goto child_fail;
                        }

                        fd = safe_close(fd);
                }

                /* First write the GID map */
                a = procfs_file_alloca(ppid, "gid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, gid_map, strlen(gid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }
                fd = safe_close(fd);

                /* Then write the UID map */
                a = procfs_file_alloca(ppid, "uid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, uid_map, strlen(uid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Success: exit without writing anything to the error pipe, so that the parent reads EOF. */
                _exit(EXIT_SUCCESS);

        child_fail:
                /* Report the negative error code to the parent; nothing more we can do if this fails. */
                (void) write(errno_pipe[1], &r, sizeof(r));
                _exit(EXIT_FAILURE);
        }

        /* Parent: close our copy of the write end, so that the read below sees EOF once the child is gone. */
        errno_pipe[1] = safe_close(errno_pipe[1]);

        if (unshare(CLONE_NEWUSER) < 0)
                return -errno;

        /* Let the child know that the namespace is ready now */
        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
                return -errno;

        /* Try to read an error code from the child */
        n = read(errno_pipe[0], &r, sizeof(r));
        if (n < 0)
                return -errno;
        if (n == sizeof(r)) { /* an error code was sent to us */
                if (r < 0)
                        return r;
                return -EIO; /* the child only ever sends negative values, anything else is bogus */
        }
        if (n != 0) /* on success we should have read 0 bytes */
                return -EIO;

        /* TAKE_PID() resets pid, so that the sigkill_waitp cleanup does not act on the child anymore. */
        r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
        if (r < 0)
                return r;
        if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
                return -EIO;

        return 0;
}
2265 
/* Returns true if directories of the given type are to be treated as "private" for this context.
 * That is only the case with dynamic_user set, never for EXEC_DIRECTORY_CONFIGURATION, and for
 * EXEC_DIRECTORY_RUNTIME only if the runtime directory preserve mode is not EXEC_PRESERVE_NO. */
static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
        return context->dynamic_user &&
                type != EXEC_DIRECTORY_CONFIGURATION &&
                !(type == EXEC_DIRECTORY_RUNTIME &&
                  context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO);
}
2278 
create_many_symlinks(const char * root,const char * source,char ** symlinks)2279 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2280         _cleanup_free_ char *src_abs = NULL;
2281         int r;
2282 
2283         assert(source);
2284 
2285         src_abs = path_join(root, source);
2286         if (!src_abs)
2287                 return -ENOMEM;
2288 
2289         STRV_FOREACH(dst, symlinks) {
2290                 _cleanup_free_ char *dst_abs = NULL;
2291 
2292                 dst_abs = path_join(root, *dst);
2293                 if (!dst_abs)
2294                         return -ENOMEM;
2295 
2296                 r = mkdir_parents_label(dst_abs, 0755);
2297                 if (r < 0)
2298                         return r;
2299 
2300                 r = symlink_idempotent(src_abs, dst_abs, true);
2301                 if (r < 0)
2302                         return r;
2303         }
2304 
2305         return 0;
2306 }
2307 
setup_exec_directory(const ExecContext * context,const ExecParameters * params,uid_t uid,gid_t gid,ExecDirectoryType type,bool needs_mount_namespace,int * exit_status)2308 static int setup_exec_directory(
2309                 const ExecContext *context,
2310                 const ExecParameters *params,
2311                 uid_t uid,
2312                 gid_t gid,
2313                 ExecDirectoryType type,
2314                 bool needs_mount_namespace,
2315                 int *exit_status) {
2316 
2317         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2318                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2319                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2320                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2321                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2322                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2323         };
2324         int r;
2325 
2326         assert(context);
2327         assert(params);
2328         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2329         assert(exit_status);
2330 
2331         if (!params->prefix[type])
2332                 return 0;
2333 
2334         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2335                 if (!uid_is_valid(uid))
2336                         uid = 0;
2337                 if (!gid_is_valid(gid))
2338                         gid = 0;
2339         }
2340 
2341         for (size_t i = 0; i < context->directories[type].n_items; i++) {
2342                 _cleanup_free_ char *p = NULL, *pp = NULL;
2343 
2344                 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2345                 if (!p) {
2346                         r = -ENOMEM;
2347                         goto fail;
2348                 }
2349 
2350                 r = mkdir_parents_label(p, 0755);
2351                 if (r < 0)
2352                         goto fail;
2353 
2354                 if (exec_directory_is_private(context, type)) {
2355                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2356                          * case we want to avoid leaving a directory around fully accessible that is owned by
2357                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2358                          * trick used by container managers to prohibit host users to get access to files of
2359                          * the same UID in containers: we place everything inside a directory that has an
2360                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2361                          * for unprivileged host code. We then use fs namespacing to make this directory
2362                          * permeable for the service itself.
2363                          *
2364                          * Specifically: for a service which wants a special directory "foo/" we first create
2365                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2366                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2367                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2368                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2369                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2370                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2371                          * for the service and making sure it only gets access to the dirs it needs but no
2372                          * others. Tricky? Yes, absolutely, but it works!
2373                          *
2374                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2375                          * to be owned by the service itself.
2376                          *
2377                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2378                          * for sharing files or sockets with other services. */
2379 
2380                         pp = path_join(params->prefix[type], "private");
2381                         if (!pp) {
2382                                 r = -ENOMEM;
2383                                 goto fail;
2384                         }
2385 
2386                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2387                         r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2388                         if (r < 0)
2389                                 goto fail;
2390 
2391                         if (!path_extend(&pp, context->directories[type].items[i].path)) {
2392                                 r = -ENOMEM;
2393                                 goto fail;
2394                         }
2395 
2396                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2397                         r = mkdir_parents_label(pp, 0755);
2398                         if (r < 0)
2399                                 goto fail;
2400 
2401                         if (is_dir(p, false) > 0 &&
2402                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2403 
2404                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2405                                  * it over. Most likely the service has been upgraded from one that didn't use
2406                                  * DynamicUser=1, to one that does. */
2407 
2408                                 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2409                                          "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2410                                          exec_directory_type_to_string(type), p, pp);
2411 
2412                                 if (rename(p, pp) < 0) {
2413                                         r = -errno;
2414                                         goto fail;
2415                                 }
2416                         } else {
2417                                 /* Otherwise, create the actual directory for the service */
2418 
2419                                 r = mkdir_label(pp, context->directories[type].mode);
2420                                 if (r < 0 && r != -EEXIST)
2421                                         goto fail;
2422                         }
2423 
2424                         /* And link it up from the original place. Note that if a mount namespace is going to be
2425                          * used, then this symlink remains on the host, and a new one for the child namespace will
2426                          * be created later. */
2427                         r = symlink_idempotent(pp, p, true);
2428                         if (r < 0)
2429                                 goto fail;
2430 
2431                 } else {
2432                         _cleanup_free_ char *target = NULL;
2433 
2434                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2435                             readlink_and_make_absolute(p, &target) >= 0) {
2436                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2437 
2438                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2439                                  * by DynamicUser=1 (see above)?
2440                                  *
2441                                  * We do this for all directory types except for ConfigurationDirectory=,
2442                                  * since they all support the private/ symlink logic at least in some
2443                                  * configurations, see above. */
2444 
2445                                 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2446                                 if (r < 0)
2447                                         goto fail;
2448 
2449                                 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2450                                 if (!q) {
2451                                         r = -ENOMEM;
2452                                         goto fail;
2453                                 }
2454 
2455                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2456                                 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2457                                 if (r < 0)
2458                                         goto fail;
2459 
2460                                 if (path_equal(q_resolved, target_resolved)) {
2461 
2462                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2463                                          * but is no longer. Let's move the directory back up. */
2464 
2465                                         log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2466                                                  "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2467                                                  exec_directory_type_to_string(type), q, p);
2468 
2469                                         if (unlink(p) < 0) {
2470                                                 r = -errno;
2471                                                 goto fail;
2472                                         }
2473 
2474                                         if (rename(q, p) < 0) {
2475                                                 r = -errno;
2476                                                 goto fail;
2477                                         }
2478                                 }
2479                         }
2480 
2481                         r = mkdir_label(p, context->directories[type].mode);
2482                         if (r < 0) {
2483                                 if (r != -EEXIST)
2484                                         goto fail;
2485 
2486                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2487                                         struct stat st;
2488 
2489                                         /* Don't change the owner/access mode of the configuration directory,
2490                                          * as in the common case it is not written to by a service, and shall
2491                                          * not be writable. */
2492 
2493                                         if (stat(p, &st) < 0) {
2494                                                 r = -errno;
2495                                                 goto fail;
2496                                         }
2497 
2498                                         /* Still complain if the access mode doesn't match */
2499                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2500                                                 log_warning("%s \'%s\' already exists but the mode is different. "
2501                                                             "(File system: %o %sMode: %o)",
2502                                                             exec_directory_type_to_string(type), context->directories[type].items[i].path,
2503                                                             st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2504 
2505                                         continue;
2506                                 }
2507                         }
2508                 }
2509 
2510                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2511                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2512                  * current UID/GID ownership.) */
2513                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2514                 if (r < 0)
2515                         goto fail;
2516 
2517                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2518                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2519                  * assignments to exist. */
2520                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2521                 if (r < 0)
2522                         goto fail;
2523         }
2524 
2525         /* If we are not going to run in a namespace, set up the symlinks - otherwise
2526          * they are set up later, to allow configuring empty var/run/etc. */
2527         if (!needs_mount_namespace)
2528                 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2529                         r = create_many_symlinks(params->prefix[type],
2530                                                  context->directories[type].items[i].path,
2531                                                  context->directories[type].items[i].symlinks);
2532                         if (r < 0)
2533                                 goto fail;
2534                 }
2535 
2536         return 0;
2537 
2538 fail:
2539         *exit_status = exit_status_table[type];
2540         return r;
2541 }
2542 
write_credential(int dfd,const char * id,const void * data,size_t size,uid_t uid,bool ownership_ok)2543 static int write_credential(
2544                 int dfd,
2545                 const char *id,
2546                 const void *data,
2547                 size_t size,
2548                 uid_t uid,
2549                 bool ownership_ok) {
2550 
2551         _cleanup_(unlink_and_freep) char *tmp = NULL;
2552         _cleanup_close_ int fd = -1;
2553         int r;
2554 
2555         r = tempfn_random_child("", "cred", &tmp);
2556         if (r < 0)
2557                 return r;
2558 
2559         fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2560         if (fd < 0) {
2561                 tmp = mfree(tmp);
2562                 return -errno;
2563         }
2564 
2565         r = loop_write(fd, data, size, /* do_poll = */ false);
2566         if (r < 0)
2567                 return r;
2568 
2569         if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2570                 return -errno;
2571 
2572         if (uid_is_valid(uid) && uid != getuid()) {
2573                 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2574                 if (r < 0) {
2575                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2576                                 return r;
2577 
2578                         if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2579                                             * to express: that the user gets read access and nothing
2580                                             * else. But if the backing fs can't support that (e.g. ramfs)
2581                                             * then we can use file ownership instead. But that's only safe if
2582                                             * we can then re-mount the whole thing read-only, so that the
2583                                             * user can no longer chmod() the file to gain write access. */
2584                                 return r;
2585 
2586                         if (fchown(fd, uid, GID_INVALID) < 0)
2587                                 return -errno;
2588                 }
2589         }
2590 
2591         if (renameat(dfd, tmp, dfd, id) < 0)
2592                 return -errno;
2593 
2594         tmp = mfree(tmp);
2595         return 0;
2596 }
2597 
credential_search_path(const ExecParameters * params,bool encrypted)2598 static char **credential_search_path(
2599                 const ExecParameters *params,
2600                 bool encrypted) {
2601 
2602         _cleanup_strv_free_ char **l = NULL;
2603 
2604         assert(params);
2605 
2606         /* Assemble a search path to find credentials in. We'll look in /etc/credstore/ (and similar
2607          * directories in /usr/lib/ + /run/) for all types of credentials. If we are looking for encrypted
2608          * credentials, also look in /etc/credstore.encrypted/ (and similar dirs). */
2609 
2610         if (encrypted) {
2611                 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2612                         return NULL;
2613 
2614                 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2615                         return NULL;
2616         }
2617 
2618         if (params->received_credentials_directory)
2619                 if (strv_extend(&l, params->received_credentials_directory) < 0)
2620                         return NULL;
2621 
2622         if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2623                 return NULL;
2624 
2625         if (DEBUG_LOGGING) {
2626                 _cleanup_free_ char *t = strv_join(l, ":");
2627 
2628                 log_debug("Credential search path is: %s", t);
2629         }
2630 
2631         return TAKE_PTR(l);
2632 }
2633 
load_credential(const ExecContext * context,const ExecParameters * params,const char * id,const char * path,bool encrypted,const char * unit,int read_dfd,int write_dfd,uid_t uid,bool ownership_ok,uint64_t * left)2634 static int load_credential(
2635                 const ExecContext *context,
2636                 const ExecParameters *params,
2637                 const char *id,
2638                 const char *path,
2639                 bool encrypted,
2640                 const char *unit,
2641                 int read_dfd,
2642                 int write_dfd,
2643                 uid_t uid,
2644                 bool ownership_ok,
2645                 uint64_t *left) {
2646 
2647         ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2648         _cleanup_strv_free_ char **search_path = NULL;
2649         _cleanup_(erase_and_freep) char *data = NULL;
2650         _cleanup_free_ char *bindname = NULL;
2651         const char *source = NULL;
2652         bool missing_ok = true;
2653         size_t size, add, maxsz;
2654         int r;
2655 
2656         assert(context);
2657         assert(params);
2658         assert(id);
2659         assert(path);
2660         assert(unit);
2661         assert(write_dfd >= 0);
2662         assert(left);
2663 
2664         if (read_dfd >= 0) {
2665                 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2666                  * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2667                  * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2668                  * open it. */
2669 
2670                 if (!filename_is_valid(path)) /* safety check */
2671                         return -EINVAL;
2672 
2673                 missing_ok = true;
2674                 source = path;
2675 
2676         } else if (path_is_absolute(path)) {
2677                 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2678                  * sockets */
2679 
2680                 if (!path_is_valid(path)) /* safety check */
2681                         return -EINVAL;
2682 
2683                 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2684 
2685                 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2686                  * via the source socket address in case we read off an AF_UNIX socket. */
2687                 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
2688                         return -ENOMEM;
2689 
2690                 missing_ok = false;
2691                 source = path;
2692 
2693         } else if (credential_name_valid(path)) {
2694                 /* If this is a relative path, take it as credential name relative to the credentials
2695                  * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
2696                  * are operating on a credential store, i.e. this is guaranteed to be regular files. */
2697 
2698                 search_path = credential_search_path(params, encrypted);
2699                 if (!search_path)
2700                         return -ENOMEM;
2701 
2702                 missing_ok = true;
2703         } else
2704                 source = NULL;
2705 
2706         if (encrypted)
2707                 flags |= READ_FULL_FILE_UNBASE64;
2708 
2709         maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
2710 
2711         if (search_path) {
2712                 STRV_FOREACH(d, search_path) {
2713                         _cleanup_free_ char *j = NULL;
2714 
2715                         j = path_join(*d, path);
2716                         if (!j)
2717                                 return -ENOMEM;
2718 
2719                         r = read_full_file_full(
2720                                         AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2721                                         UINT64_MAX,
2722                                         maxsz,
2723                                         flags,
2724                                         NULL,
2725                                         &data, &size);
2726                         if (r != -ENOENT)
2727                                 break;
2728                 }
2729         } else if (source)
2730                 r = read_full_file_full(
2731                                 read_dfd, source,
2732                                 UINT64_MAX,
2733                                 maxsz,
2734                                 flags,
2735                                 bindname,
2736                                 &data, &size);
2737         else
2738                 r = -ENOENT;
2739 
2740         if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
2741                 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2742                  * will get clear errors if we don't pass such a missing credential on as they
2743                  * themselves will get ENOENT when trying to read them, which should not be much
2744                  * worse than when we handle the error here and make it fatal.
2745                  *
2746                  * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2747                  * we are fine, too. */
2748                 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
2749                 return 0;
2750         }
2751         if (r < 0)
2752                 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
2753 
2754         if (encrypted) {
2755                 _cleanup_free_ void *plaintext = NULL;
2756                 size_t plaintext_size = 0;
2757 
2758                 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, data, size, &plaintext, &plaintext_size);
2759                 if (r < 0)
2760                         return r;
2761 
2762                 free_and_replace(data, plaintext);
2763                 size = plaintext_size;
2764         }
2765 
2766         add = strlen(id) + size;
2767         if (add > *left)
2768                 return -E2BIG;
2769 
2770         r = write_credential(write_dfd, id, data, size, uid, ownership_ok);
2771         if (r < 0)
2772                 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
2773 
2774         *left -= add;
2775         return 0;
2776 }
2777 
2778 struct load_cred_args {
2779         const ExecContext *context;
2780         const ExecParameters *params;
2781         bool encrypted;
2782         const char *unit;
2783         int dfd;
2784         uid_t uid;
2785         bool ownership_ok;
2786         uint64_t *left;
2787 };
2788 
/* recurse_dir() callback that imports one directory entry as a credential.
 *
 * Only RECURSE_DIR_ENTRY events for regular files and sockets are acted upon. The credential ID is
 * derived from 'path' by replacing every "/" with "_". If a credential of that ID already exists in the
 * target directory the entry is skipped; otherwise it is loaded via load_credential(), reading the
 * entry by its name relative to 'dir_fd'.
 *
 * 'userdata' must point to a struct load_cred_args. Returns RECURSE_DIR_CONTINUE to go on with the
 * iteration, or a negative errno-style error code. */
static int load_cred_recurse_dir_cb(
                RecurseDirEvent event,
                const char *path,
                int dir_fd,
                int inode_fd,
                const struct dirent *de,
                const struct statx *sx,
                void *userdata) {

        struct load_cred_args *a = ASSERT_PTR(userdata);
        _cleanup_free_ char *cred_id = NULL;
        int r;

        /* Not an entry event, or neither a regular file nor a socket? Then there's nothing to do. */
        if (event != RECURSE_DIR_ENTRY || !IN_SET(de->d_type, DT_REG, DT_SOCK))
                return RECURSE_DIR_CONTINUE;

        /* Flatten the path into a credential ID */
        cred_id = strreplace(path, "/", "_");
        if (!cred_id)
                return -ENOMEM;

        if (!credential_name_valid(cred_id))
                return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", cred_id);

        if (faccessat(a->dfd, cred_id, F_OK, AT_SYMLINK_NOFOLLOW) < 0) {
                /* "Does not exist" is the one outcome that lets us proceed */
                if (errno != ENOENT)
                        return log_debug_errno(errno, "Failed to test if credential %s exists: %m", cred_id);
        } else {
                log_debug("Skipping credential with duplicated ID %s at %s", cred_id, path);
                return RECURSE_DIR_CONTINUE;
        }

        r = load_credential(
                        a->context,
                        a->params,
                        cred_id,
                        de->d_name,
                        a->encrypted,
                        a->unit,
                        dir_fd,
                        a->dfd,
                        a->uid,
                        a->ownership_ok,
                        a->left);

        return r < 0 ? r : RECURSE_DIR_CONTINUE;
}
2839 
acquire_credentials(const ExecContext * context,const ExecParameters * params,const char * unit,const char * p,uid_t uid,bool ownership_ok)2840 static int acquire_credentials(
2841                 const ExecContext *context,
2842                 const ExecParameters *params,
2843                 const char *unit,
2844                 const char *p,
2845                 uid_t uid,
2846                 bool ownership_ok) {
2847 
2848         uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
2849         _cleanup_close_ int dfd = -1;
2850         ExecLoadCredential *lc;
2851         ExecSetCredential *sc;
2852         int r;
2853 
2854         assert(context);
2855         assert(p);
2856 
2857         dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2858         if (dfd < 0)
2859                 return -errno;
2860 
2861         /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2862         HASHMAP_FOREACH(lc, context->load_credentials) {
2863                 _cleanup_close_ int sub_fd = -1;
2864 
2865                 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
2866                  * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
2867                  * a regular file. Finally, if it's a relative path we will use it as a credential name to
2868                  * propagate a credential passed to us from further up. */
2869 
2870                 if (path_is_absolute(lc->path)) {
2871                         sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
2872                         if (sub_fd < 0 && !IN_SET(errno,
2873                                                   ENOTDIR,  /* Not a directory */
2874                                                   ENOENT))  /* Doesn't exist? */
2875                                 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
2876                 }
2877 
2878                 if (sub_fd < 0)
2879                         /* Regular file (incl. a credential passed in from higher up) */
2880                         r = load_credential(
2881                                         context,
2882                                         params,
2883                                         lc->id,
2884                                         lc->path,
2885                                         lc->encrypted,
2886                                         unit,
2887                                         -1,
2888                                         dfd,
2889                                         uid,
2890                                         ownership_ok,
2891                                         &left);
2892                 else
2893                         /* Directory */
2894                         r = recurse_dir(
2895                                         sub_fd,
2896                                         /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
2897                                         /* statx_mask= */ 0,
2898                                         /* n_depth_max= */ UINT_MAX,
2899                                         RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
2900                                         load_cred_recurse_dir_cb,
2901                                         &(struct load_cred_args) {
2902                                                 .context = context,
2903                                                 .params = params,
2904                                                 .encrypted = lc->encrypted,
2905                                                 .unit = unit,
2906                                                 .dfd = dfd,
2907                                                 .uid = uid,
2908                                                 .ownership_ok = ownership_ok,
2909                                                 .left = &left,
2910                                         });
2911                 if (r < 0)
2912                         return r;
2913         }
2914 
2915         /* Second, we add in literally specified credentials. If the credentials already exist, we'll not add
2916          * them, so that they can act as a "default" if the same credential is specified multiple times. */
2917         HASHMAP_FOREACH(sc, context->set_credentials) {
2918                 _cleanup_(erase_and_freep) void *plaintext = NULL;
2919                 const char *data;
2920                 size_t size, add;
2921 
2922                 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
2923                  * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
2924                  * slow and involved, hence it's nice to be able to skip that if the credential already
2925                  * exists anyway. */
2926                 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2927                         continue;
2928                 if (errno != ENOENT)
2929                         return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2930 
2931                 if (sc->encrypted) {
2932                         r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, sc->data, sc->size, &plaintext, &size);
2933                         if (r < 0)
2934                                 return r;
2935 
2936                         data = plaintext;
2937                 } else {
2938                         data = sc->data;
2939                         size = sc->size;
2940                 }
2941 
2942                 add = strlen(sc->id) + size;
2943                 if (add > left)
2944                         return -E2BIG;
2945 
2946                 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
2947                 if (r < 0)
2948                         return r;
2949 
2950                 left -= add;
2951         }
2952 
2953         if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2954                 return -errno;
2955 
2956         /* After we created all keys with the right perms, also make sure the credential store as a whole is
2957          * accessible */
2958 
2959         if (uid_is_valid(uid) && uid != getuid()) {
2960                 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
2961                 if (r < 0) {
2962                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2963                                 return r;
2964 
2965                         if (!ownership_ok)
2966                                 return r;
2967 
2968                         if (fchown(dfd, uid, GID_INVALID) < 0)
2969                                 return -errno;
2970                 }
2971         }
2972 
2973         return 0;
2974 }
2975 
setup_credentials_internal(const ExecContext * context,const ExecParameters * params,const char * unit,const char * final,const char * workspace,bool reuse_workspace,bool must_mount,uid_t uid)2976 static int setup_credentials_internal(
2977                 const ExecContext *context,
2978                 const ExecParameters *params,
2979                 const char *unit,
2980                 const char *final,        /* This is where the credential store shall eventually end up at */
2981                 const char *workspace,    /* This is where we can prepare it before moving it to the final place */
2982                 bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
2983                 bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2984                 uid_t uid) {
2985 
2986         int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2987                                    * if we mounted something; false if we definitely can't mount anything */
2988         bool final_mounted;
2989         const char *where;
2990 
2991         assert(context);
2992         assert(final);
2993         assert(workspace);
2994 
2995         if (reuse_workspace) {
2996                 r = path_is_mount_point(workspace, NULL, 0);
2997                 if (r < 0)
2998                         return r;
2999                 if (r > 0)
3000                         workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
3001                 else
3002                         workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
3003         } else
3004                 workspace_mounted = -1; /* ditto */
3005 
3006         r = path_is_mount_point(final, NULL, 0);
3007         if (r < 0)
3008                 return r;
3009         if (r > 0) {
3010                 /* If the final place already has something mounted, we use that. If the workspace also has
3011                  * something mounted we assume it's actually the same mount (but with MS_RDONLY
3012                  * different). */
3013                 final_mounted = true;
3014 
3015                 if (workspace_mounted < 0) {
                        /* If the final place is mounted, but the workspace isn't, then let's bind mount
                         * the final version to the workspace, and make it writable, so that we can make
                         * changes */
3019 
3020                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3021                         if (r < 0)
3022                                 return r;
3023 
3024                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3025                         if (r < 0)
3026                                 return r;
3027 
3028                         workspace_mounted = true;
3029                 }
3030         } else
3031                 final_mounted = false;
3032 
3033         if (workspace_mounted < 0) {
3034                 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3035                 for (int try = 0;; try++) {
3036 
3037                         if (try == 0) {
3038                                 /* Try "ramfs" first, since it's not swap backed */
3039                                 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
3040                                 if (r >= 0) {
3041                                         workspace_mounted = true;
3042                                         break;
3043                                 }
3044 
3045                         } else if (try == 1) {
3046                                 _cleanup_free_ char *opts = NULL;
3047 
3048                                 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
3049                                         return -ENOMEM;
3050 
3051                                 /* Fall back to "tmpfs" otherwise */
3052                                 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
3053                                 if (r >= 0) {
3054                                         workspace_mounted = true;
3055                                         break;
3056                                 }
3057 
3058                         } else {
3059                                 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
3060                                 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3061                                 if (r < 0) {
3062                                         if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
3063                                                 return r;
3064 
3065                                         if (must_mount) /* If we it's not OK to use the plain directory
3066                                                          * fallback, propagate all errors too */
3067                                                 return r;
3068 
3069                                         /* If we lack privileges to bind mount stuff, then let's gracefully
3070                                          * proceed for compat with container envs, and just use the final dir
3071                                          * as is. */
3072 
3073                                         workspace_mounted = false;
3074                                         break;
3075                                 }
3076 
3077                                 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
3078                                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3079                                 if (r < 0)
3080                                         return r;
3081 
3082                                 workspace_mounted = true;
3083                                 break;
3084                         }
3085                 }
3086         }
3087 
3088         assert(!must_mount || workspace_mounted > 0);
3089         where = workspace_mounted ? workspace : final;
3090 
3091         (void) label_fix_container(where, final, 0);
3092 
3093         r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
3094         if (r < 0)
3095                 return r;
3096 
3097         if (workspace_mounted) {
3098                 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
3099                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3100                 if (r < 0)
3101                         return r;
3102 
3103                 /* And mount it to the final place, read-only */
3104                 if (final_mounted)
3105                         r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3106                 else
3107                         r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3108                 if (r < 0)
3109                         return r;
3110         } else {
3111                 _cleanup_free_ char *parent = NULL;
3112 
3113                 /* If we do not have our own mount put used the plain directory fallback, then we need to
3114                  * open access to the top-level credential directory and the per-service directory now */
3115 
3116                 parent = dirname_malloc(final);
3117                 if (!parent)
3118                         return -ENOMEM;
3119                 if (chmod(parent, 0755) < 0)
3120                         return -errno;
3121         }
3122 
3123         return 0;
3124 }
3125 
setup_credentials(const ExecContext * context,const ExecParameters * params,const char * unit,uid_t uid)3126 static int setup_credentials(
3127                 const ExecContext *context,
3128                 const ExecParameters *params,
3129                 const char *unit,
3130                 uid_t uid) {
3131 
3132         _cleanup_free_ char *p = NULL, *q = NULL;
3133         int r;
3134 
3135         assert(context);
3136         assert(params);
3137 
3138         if (!exec_context_has_credentials(context))
3139                 return 0;
3140 
3141         if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3142                 return -EINVAL;
3143 
3144         /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3145          * and the subdir we mount over with a read-only file system readable by the service's user */
3146         q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3147         if (!q)
3148                 return -ENOMEM;
3149 
3150         r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3151         if (r < 0 && r != -EEXIST)
3152                 return r;
3153 
3154         p = path_join(q, unit);
3155         if (!p)
3156                 return -ENOMEM;
3157 
3158         r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3159         if (r < 0 && r != -EEXIST)
3160                 return r;
3161 
3162         r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3163         if (r < 0) {
3164                 _cleanup_free_ char *t = NULL, *u = NULL;
3165 
3166                 /* If this is not a privilege or support issue then propagate the error */
3167                 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3168                         return r;
3169 
3170                 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3171                  * it into place, so that users can't access half-initialized credential stores. */
3172                 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3173                 if (!t)
3174                         return -ENOMEM;
3175 
3176                 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3177                  * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3178                  * after it is fully set up */
3179                 u = path_join(t, unit);
3180                 if (!u)
3181                         return -ENOMEM;
3182 
3183                 FOREACH_STRING(i, t, u) {
3184                         r = mkdir_label(i, 0700);
3185                         if (r < 0 && r != -EEXIST)
3186                                 return r;
3187                 }
3188 
3189                 r = setup_credentials_internal(
3190                                 context,
3191                                 params,
3192                                 unit,
3193                                 p,       /* final mount point */
3194                                 u,       /* temporary workspace to overmount */
3195                                 true,    /* reuse the workspace if it is already a mount */
3196                                 false,   /* it's OK to fall back to a plain directory if we can't mount anything */
3197                                 uid);
3198 
3199                 (void) rmdir(u); /* remove the workspace again if we can. */
3200 
3201                 if (r < 0)
3202                         return r;
3203 
3204         } else if (r == 0) {
3205 
3206                 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3207                  * we can use the same directory for all cases, after turning off propagation. Question
3208                  * though is: where do we turn off propagation exactly, and where do we place the workspace
3209                  * directory? We need some place that is guaranteed to be a mount point in the host, and
3210                  * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3211                  * since we ultimately want to move the resulting file system there, i.e. we need propagation
3212                  * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3213                  * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3214                  * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3215                  * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3216                  * propagation on the former, and then overmount the latter.
3217                  *
3218                  * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3219                  * for this purpose, but there are few other candidates that work equally well for us, and
3220                  * given that the we do this in a privately namespaced short-lived single-threaded process
3221                  * that no one else sees this should be OK to do. */
3222 
3223                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3224                 if (r < 0)
3225                         goto child_fail;
3226 
3227                 r = setup_credentials_internal(
3228                                 context,
3229                                 params,
3230                                 unit,
3231                                 p,           /* final mount point */
3232                                 "/dev/shm",  /* temporary workspace to overmount */
3233                                 false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3234                                 true,        /* insist that something is mounted, do not allow fallback to plain directory */
3235                                 uid);
3236                 if (r < 0)
3237                         goto child_fail;
3238 
3239                 _exit(EXIT_SUCCESS);
3240 
3241         child_fail:
3242                 _exit(EXIT_FAILURE);
3243         }
3244 
3245         return 0;
3246 }
3247 
3248 #if ENABLE_SMACK
setup_smack(const ExecContext * context,int executable_fd)3249 static int setup_smack(
3250                 const ExecContext *context,
3251                 int executable_fd) {
3252         int r;
3253 
3254         assert(context);
3255         assert(executable_fd >= 0);
3256 
3257         if (context->smack_process_label) {
3258                 r = mac_smack_apply_pid(0, context->smack_process_label);
3259                 if (r < 0)
3260                         return r;
3261         }
3262 #ifdef SMACK_DEFAULT_PROCESS_LABEL
3263         else {
3264                 _cleanup_free_ char *exec_label = NULL;
3265 
3266                 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
3267                 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
3268                         return r;
3269 
3270                 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
3271                 if (r < 0)
3272                         return r;
3273         }
3274 #endif
3275 
3276         return 0;
3277 }
3278 #endif
3279 
/* Builds the list of bind mounts to establish for a unit.
 *
 * The list consists of copies of the bind mounts configured in the context, followed by one entry per
 * configured exec directory item, for each directory type that has a prefix set in params. For private
 * exec directories used without a root file system, the "<prefix>/private" directory is additionally
 * collected in the list of directories returned via ret_empty_directories.
 *
 * On success the newly allocated array, its size and the directory list are returned via the ret_*
 * parameters, and the number of bind mounts is returned (0 if there are none, in which case NULL/0/NULL
 * is returned via the parameters). On failure a negative errno-style error is returned and the output
 * parameters are left untouched. */
static int compile_bind_mounts(
                const ExecContext *context,
                const ExecParameters *params,
                BindMount **ret_bind_mounts,
                size_t *ret_n_bind_mounts,
                char ***ret_empty_directories) {

        _cleanup_strv_free_ char **overmount_dirs = NULL;
        BindMount *mounts;
        size_t n_total, n_filled = 0;
        int r;

        assert(context);
        assert(params);
        assert(ret_bind_mounts);
        assert(ret_n_bind_mounts);
        assert(ret_empty_directories);

        /* First determine how many entries we are going to need: the explicitly configured bind mounts
         * plus all exec directory items of those types we have a prefix for. */
        n_total = context->n_bind_mounts;
        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
                if (params->prefix[t])
                        n_total += context->directories[t].n_items;

        if (n_total == 0) {
                *ret_bind_mounts = NULL;
                *ret_n_bind_mounts = 0;
                *ret_empty_directories = NULL;
                return 0;
        }

        mounts = new(BindMount, n_total);
        if (!mounts)
                return -ENOMEM;

        /* Copy over the explicitly configured bind mounts. */
        for (size_t i = 0; i < context->n_bind_mounts; i++) {
                const BindMount *configured = context->bind_mounts + i;
                _cleanup_free_ char *src = NULL, *dst = NULL;

                src = strdup(configured->source);
                if (!src)
                        goto oom;

                dst = strdup(configured->destination);
                if (!dst)
                        goto oom;

                mounts[n_filled++] = (BindMount) {
                        .source = TAKE_PTR(src),
                        .destination = TAKE_PTR(dst),
                        .read_only = configured->read_only,
                        .recursive = configured->recursive,
                        .ignore_enoent = configured->ignore_enoent,
                };
        }

        /* Then add one bind mount per exec directory item. */
        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
                const char *prefix = params->prefix[t];
                bool is_private, private_with_rootfs;

                if (!prefix || context->directories[t].n_items == 0)
                        continue;

                is_private = exec_directory_is_private(context, t);
                private_with_rootfs = is_private && exec_context_with_rootfs(context);

                if (is_private && !private_with_rootfs) {
                        char *private_root;

                        /* This is the dynamic user case, where we have to make sure the process can
                         * reach its own directory. To achieve that the usually inaccessible "private"
                         * subdirectory is overmounted with a tmpfs that is accessible, and that is empty
                         * apart from the submounts we establish here. */

                        private_root = path_join(prefix, "private");
                        if (!private_root)
                                goto oom;

                        /* Ownership of private_root is handed over to strv_consume(). */
                        r = strv_consume(&overmount_dirs, private_root);
                        if (r < 0)
                                goto finish;
                }

                for (size_t i = 0; i < context->directories[t].n_items; i++) {
                        const char *item_path = context->directories[t].items[i].path;
                        _cleanup_free_ char *src = NULL, *dst = NULL;

                        src = is_private ?
                                path_join(prefix, "private", item_path) :
                                path_join(prefix, item_path);
                        if (!src)
                                goto oom;

                        /* With RootDirectory= or RootImage= in effect the symlink to the private
                         * directory is not created in the root directory, hence bind mount the directory
                         * to the 'non-private' place in that case. Otherwise the directory is simply
                         * mounted onto itself. */
                        dst = private_with_rootfs ?
                                path_join(prefix, item_path) :
                                strdup(src);
                        if (!dst)
                                goto oom;

                        mounts[n_filled++] = (BindMount) {
                                .source = TAKE_PTR(src),
                                .destination = TAKE_PTR(dst),
                                .read_only = false,
                                .nosuid = context->dynamic_user, /* no suid/sgid if DynamicUser= is on */
                                .recursive = true,
                                .ignore_enoent = false,
                        };
                }
        }

        assert(n_filled == n_total);

        *ret_bind_mounts = mounts;
        *ret_n_bind_mounts = n_total;
        *ret_empty_directories = TAKE_PTR(overmount_dirs);

        return (int) n_total;

oom:
        r = -ENOMEM;
finish:
        /* Only the first n_filled entries have been initialized so far. */
        bind_mount_free_many(mounts, n_filled);
        return r;
}
3418 
3419 /* ret_symlinks will contain a list of pairs src:dest that describes
3420  * the symlinks to create later on. For example, the symlinks needed
3421  * to safely give private directories to DynamicUser=1 users. */
compile_symlinks(const ExecContext * context,const ExecParameters * params,char *** ret_symlinks)3422 static int compile_symlinks(
3423                 const ExecContext *context,
3424                 const ExecParameters *params,
3425                 char ***ret_symlinks) {
3426 
3427         _cleanup_strv_free_ char **symlinks = NULL;
3428         int r;
3429 
3430         assert(context);
3431         assert(params);
3432         assert(ret_symlinks);
3433 
3434         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3435                 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3436                         _cleanup_free_ char *private_path = NULL, *path = NULL;
3437 
3438                         STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3439                                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
3440 
3441                                 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3442                                 dst_abs = path_join(params->prefix[dt], *symlink);
3443                                 if (!src_abs || !dst_abs)
3444                                         return -ENOMEM;
3445 
3446                                 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3447                                 if (r < 0)
3448                                         return r;
3449                         }
3450 
3451                         if (!exec_directory_is_private(context, dt) || exec_context_with_rootfs(context))
3452                                 continue;
3453 
3454                         private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
3455                         if (!private_path)
3456                                 return -ENOMEM;
3457 
3458                         path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3459                         if (!path)
3460                                 return -ENOMEM;
3461 
3462                         r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3463                         if (r < 0)
3464                                 return r;
3465                 }
3466         }
3467 
3468         *ret_symlinks = TAKE_PTR(symlinks);
3469 
3470         return 0;
3471 }
3472 
insist_on_sandboxing(const ExecContext * context,const char * root_dir,const char * root_image,const BindMount * bind_mounts,size_t n_bind_mounts)3473 static bool insist_on_sandboxing(
3474                 const ExecContext *context,
3475                 const char *root_dir,
3476                 const char *root_image,
3477                 const BindMount *bind_mounts,
3478                 size_t n_bind_mounts) {
3479 
3480         assert(context);
3481         assert(n_bind_mounts == 0 || bind_mounts);
3482 
3483         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3484          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3485          * rearrange stuff in a way we cannot ignore gracefully. */
3486 
3487         if (context->n_temporary_filesystems > 0)
3488                 return true;
3489 
3490         if (root_dir || root_image)
3491                 return true;
3492 
3493         if (context->n_mount_images > 0)
3494                 return true;
3495 
3496         if (context->dynamic_user)
3497                 return true;
3498 
3499         if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3500                 return true;
3501 
3502         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3503          * essential. */
3504         for (size_t i = 0; i < n_bind_mounts; i++)
3505                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3506                         return true;
3507 
3508         if (context->log_namespace)
3509                 return true;
3510 
3511         return false;
3512 }
3513 
/* Sets up the mount namespace for a process of unit u, according to the execution context.
 *
 * Compiles the bind mounts and symlinks needed for the configured exec directories, derives the
 * sandboxing parameters (which are only applied if EXEC_APPLY_SANDBOXING is set in params->flags and
 * the command is not marked EXEC_COMMAND_FULLY_PRIVILEGED), and then calls setup_namespace().
 *
 * If setup_namespace() fails with -ENOANO the failure is ignored (0 is returned), unless
 * insist_on_sandboxing() says the configured options change the mount environment non-trivially, in
 * which case -EOPNOTSUPP is returned.
 *
 * error_path is passed through to setup_namespace() unmodified.
 *
 * Returns 0 (or whatever non-negative value setup_namespace() returned) on success, a negative
 * errno-style error otherwise. */
static int apply_mount_namespace(
                const Unit *u,
                ExecCommandFlags command_flags,
                const ExecContext *context,
                const ExecParameters *params,
                const ExecRuntime *runtime,
                char **error_path) {

        _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
        const char *tmp_dir = NULL, *var_tmp_dir = NULL;
        const char *root_dir = NULL, *root_image = NULL;
        _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
                        *extension_dir = NULL;
        NamespaceInfo ns_info;
        bool needs_sandboxing;
        BindMount *bind_mounts = NULL;
        size_t n_bind_mounts = 0;
        int r;

        assert(u);
        assert(context);
        assert(params);

        /* A root image takes precedence over a root directory; both are only honoured if we are
         * supposed to apply the chroot at all. */
        if (params->flags & EXEC_APPLY_CHROOT) {
                root_image = context->root_image;

                if (!root_image)
                        root_dir = context->root_directory;
        }

        r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
        if (r < 0)
                return r;

        /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
        r = compile_symlinks(context, params, &symlinks);
        if (r < 0)
                goto finalize; /* bind_mounts is already allocated at this point, it must be released */

        needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
        if (needs_sandboxing) {
                /* The runtime struct only contains the parent of the private /tmp,
                 * which is non-accessible to world users. Inside of it there's a /tmp
                 * that is sticky, and that's the one we want to use here.
                 * This does not apply when we are using /run/systemd/empty as fallback. */

                if (context->private_tmp && runtime) {
                        if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
                                tmp_dir = runtime->tmp_dir;
                        else if (runtime->tmp_dir)
                                tmp_dir = strjoina(runtime->tmp_dir, "/tmp");

                        if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
                                var_tmp_dir = runtime->var_tmp_dir;
                        else if (runtime->var_tmp_dir)
                                var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
                }

                ns_info = (NamespaceInfo) {
                        .ignore_protect_paths = false,
                        .private_dev = context->private_devices,
                        .protect_control_groups = context->protect_control_groups,
                        .protect_kernel_tunables = context->protect_kernel_tunables,
                        .protect_kernel_modules = context->protect_kernel_modules,
                        .protect_kernel_logs = context->protect_kernel_logs,
                        .protect_hostname = context->protect_hostname,
                        .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
                        .private_mounts = context->private_mounts,
                        .protect_home = context->protect_home,
                        .protect_system = context->protect_system,
                        .protect_proc = context->protect_proc,
                        .proc_subset = context->proc_subset,
                        .private_ipc = context->private_ipc || context->ipc_namespace_path,
                        /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
                        .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
                };
        } else if (!context->dynamic_user && root_dir)
                /*
                 * If DynamicUser=no and RootDirectory= is set then let's pass a relaxed
                 * sandbox info, otherwise enforce it, don't ignore protected paths and
                 * fail if we are unable to apply the sandbox inside the mount namespace.
                 */
                ns_info = (NamespaceInfo) {
                        .ignore_protect_paths = true,
                };
        else
                ns_info = (NamespaceInfo) {};

        if (context->mount_flags == MS_SHARED)
                log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");

        /* The per-unit credentials directory is only passed on if credentials are configured, a runtime
         * directory prefix is known and we were asked to write credentials. */
        if (exec_context_has_credentials(context) &&
            params->prefix[EXEC_DIRECTORY_RUNTIME] &&
            FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
                creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
                if (!creds_path) {
                        r = -ENOMEM;
                        goto finalize;
                }
        }

        /* The propagate and incoming directories are only set for the system manager; the extension
         * directory lives below /run/user/<euid>/ otherwise. */
        if (MANAGER_IS_SYSTEM(u->manager)) {
                propagate_dir = path_join("/run/systemd/propagate/", u->id);
                if (!propagate_dir) {
                        r = -ENOMEM;
                        goto finalize;
                }

                incoming_dir = strdup("/run/systemd/incoming");
                if (!incoming_dir) {
                        r = -ENOMEM;
                        goto finalize;
                }

                extension_dir = strdup("/run/systemd/unit-extensions");
                if (!extension_dir) {
                        r = -ENOMEM;
                        goto finalize;
                }
        } else
                if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0) {
                        r = -ENOMEM;
                        goto finalize;
                }

        r = setup_namespace(root_dir, root_image, context->root_image_options,
                            &ns_info, context->read_write_paths,
                            needs_sandboxing ? context->read_only_paths : NULL,
                            needs_sandboxing ? context->inaccessible_paths : NULL,
                            needs_sandboxing ? context->exec_paths : NULL,
                            needs_sandboxing ? context->no_exec_paths : NULL,
                            empty_directories,
                            symlinks,
                            bind_mounts,
                            n_bind_mounts,
                            context->temporary_filesystems,
                            context->n_temporary_filesystems,
                            context->mount_images,
                            context->n_mount_images,
                            tmp_dir,
                            var_tmp_dir,
                            creds_path,
                            context->log_namespace,
                            context->mount_flags,
                            context->root_hash, context->root_hash_size, context->root_hash_path,
                            context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
                            context->root_verity,
                            context->extension_images,
                            context->n_extension_images,
                            context->extension_directories,
                            propagate_dir,
                            incoming_dir,
                            extension_dir,
                            root_dir || root_image ? params->notify_socket : NULL,
                            error_path);

        /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
         * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
         * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
         * completely different execution environment. */
        if (r == -ENOANO) {
                if (insist_on_sandboxing(
                                    context,
                                    root_dir, root_image,
                                    bind_mounts,
                                    n_bind_mounts)) {
                        log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
                                       "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
                                       n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));

                        r = -EOPNOTSUPP;
                } else {
                        log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
                        r = 0;
                }
        }

finalize:
        /* bind_mounts is not covered by a cleanup handler, hence every exit path taken after
         * compile_bind_mounts() succeeded has to pass through here. */
        bind_mount_free_many(bind_mounts, n_bind_mounts);
        return r;
}
3693 
apply_working_directory(const ExecContext * context,const ExecParameters * params,const char * home,int * exit_status)3694 static int apply_working_directory(
3695                 const ExecContext *context,
3696                 const ExecParameters *params,
3697                 const char *home,
3698                 int *exit_status) {
3699 
3700         const char *d, *wd;
3701 
3702         assert(context);
3703         assert(exit_status);
3704 
3705         if (context->working_directory_home) {
3706 
3707                 if (!home) {
3708                         *exit_status = EXIT_CHDIR;
3709                         return -ENXIO;
3710                 }
3711 
3712                 wd = home;
3713 
3714         } else
3715                 wd = empty_to_root(context->working_directory);
3716 
3717         if (params->flags & EXEC_APPLY_CHROOT)
3718                 d = wd;
3719         else
3720                 d = prefix_roota(context->root_directory, wd);
3721 
3722         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3723                 *exit_status = EXIT_CHDIR;
3724                 return -errno;
3725         }
3726 
3727         return 0;
3728 }
3729 
apply_root_directory(const ExecContext * context,const ExecParameters * params,const bool needs_mount_ns,int * exit_status)3730 static int apply_root_directory(
3731                 const ExecContext *context,
3732                 const ExecParameters *params,
3733                 const bool needs_mount_ns,
3734                 int *exit_status) {
3735 
3736         assert(context);
3737         assert(exit_status);
3738 
3739         if (params->flags & EXEC_APPLY_CHROOT)
3740                 if (!needs_mount_ns && context->root_directory)
3741                         if (chroot(context->root_directory) < 0) {
3742                                 *exit_status = EXIT_CHROOT;
3743                                 return -errno;
3744                         }
3745 
3746         return 0;
3747 }
3748 
/* Sets up the kernel session keyring for the service, according to context->keyring_mode.
 *
 * While doing so the real UID/GID of the process are temporarily switched to 'uid'/'gid' (if these are
 * valid and differ from the current ones), and switched back before returning.
 *
 * Returns 0 if the keyring was set up, or if the step was skipped or its failure was deliberately
 * ignored. Otherwise returns what log_unit_error_errno() returned for the first failure (presumably a
 * negative errno-style value — confirm against the logging helpers). */
static int setup_keyring(
                const Unit *u,
                const ExecContext *context,
                const ExecParameters *p,
                uid_t uid, gid_t gid) {

        key_serial_t keyring;
        int r = 0;
        uid_t saved_uid;
        gid_t saved_gid;

        assert(u);
        assert(context);
        assert(p);

        /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
         * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
         * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
         * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
         * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
         * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */

        /* In "inherit" mode we leave the keyring alone entirely. */
        if (context->keyring_mode == EXEC_KEYRING_INHERIT)
                return 0;

        /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
         * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
         * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
         * & group is just as nasty as acquiring a reference to the user keyring. */

        saved_uid = getuid();
        saved_gid = getgid();

        /* Switch the GID first, then the UID (presumably because changing the GID might no longer be permitted
         * once the UID has been changed). Nothing has been modified yet if this fails, hence we can return
         * right away without going through the restore logic at "out". */
        if (gid_is_valid(gid) && gid != saved_gid) {
                if (setregid(gid, -1) < 0)
                        return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
        }

        if (uid_is_valid(uid) && uid != saved_uid) {
                if (setreuid(uid, -1) < 0) {
                        r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
                        goto out;
                }
        }

        keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
        if (keyring == -1) {
                /* Missing kernel support, lack of privileges and exhausted quota are only logged at debug
                 * level and leave r at 0, i.e. are not treated as failure. Everything else is. */
                if (errno == ENOSYS)
                        log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
                else if (ERRNO_IS_PRIVILEGE(errno))
                        log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
                else if (errno == EDQUOT)
                        log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
                else
                        r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");

                goto out;
        }

        /* When requested link the user keyring into the session keyring. */
        if (context->keyring_mode == EXEC_KEYRING_SHARED) {

                if (keyctl(KEYCTL_LINK,
                           KEY_SPEC_USER_KEYRING,
                           KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
                        r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
                        goto out;
                }
        }

        /* Restore uid/gid back, in the reverse order of how we changed them above: UID first, then GID. */
        if (uid_is_valid(uid) && uid != saved_uid) {
                if (setreuid(saved_uid, -1) < 0) {
                        r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
                        goto out;
                }
        }

        if (gid_is_valid(gid) && gid != saved_gid) {
                if (setregid(saved_gid, -1) < 0)
                        return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
        }

        /* Populate the keyring with the invocation ID by default, now running as the original saved_uid again. */
        if (!sd_id128_is_null(u->invocation_id)) {
                key_serial_t key;

                key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
                if (key == -1)
                        /* Not being able to add the key is not considered fatal, r stays 0. */
                        log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
                else {
                        /* Grant possessor and owning user only view/read/search access to the key; no other
                         * permission bits are set. Failing to do so is treated as error. */
                        if (keyctl(KEYCTL_SETPERM, key,
                                   KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
                                   KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
                                r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
                }
        }

out:
        /* Revert back uid & gid for the last time, and exit */
        /* no extra logging, as only the first already reported error matters */
        if (getuid() != saved_uid)
                (void) setreuid(saved_uid, -1);

        if (getgid() != saved_gid)
                (void) setregid(saved_gid, -1);

        return r;
}
3858 
/* Appends each valid (i.e. non-negative) fd of the specified pair to 'array', starting at index *n, and
 * advances *n by the number of fds added. The caller has to make sure the array is large enough. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue;

                array[*n] = pair[i];
                *n += 1;
        }
}
3869 
close_remaining_fds(const ExecParameters * params,const ExecRuntime * runtime,const DynamicCreds * dcreds,int user_lookup_fd,int socket_fd,const int * fds,size_t n_fds)3870 static int close_remaining_fds(
3871                 const ExecParameters *params,
3872                 const ExecRuntime *runtime,
3873                 const DynamicCreds *dcreds,
3874                 int user_lookup_fd,
3875                 int socket_fd,
3876                 const int *fds, size_t n_fds) {
3877 
3878         size_t n_dont_close = 0;
3879         int dont_close[n_fds + 12];
3880 
3881         assert(params);
3882 
3883         if (params->stdin_fd >= 0)
3884                 dont_close[n_dont_close++] = params->stdin_fd;
3885         if (params->stdout_fd >= 0)
3886                 dont_close[n_dont_close++] = params->stdout_fd;
3887         if (params->stderr_fd >= 0)
3888                 dont_close[n_dont_close++] = params->stderr_fd;
3889 
3890         if (socket_fd >= 0)
3891                 dont_close[n_dont_close++] = socket_fd;
3892         if (n_fds > 0) {
3893                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3894                 n_dont_close += n_fds;
3895         }
3896 
3897         if (runtime) {
3898                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
3899                 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3900         }
3901 
3902         if (dcreds) {
3903                 if (dcreds->user)
3904                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3905                 if (dcreds->group)
3906                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
3907         }
3908 
3909         if (user_lookup_fd >= 0)
3910                 dont_close[n_dont_close++] = user_lookup_fd;
3911 
3912         return close_all_fds(dont_close, n_dont_close);
3913 }
3914 
/* Reports the resolved UID/GID to the other end of 'user_lookup_fd' (PID 1). A single writev() call is
 * issued, carrying the UID, the GID and the unit name, in this order. Nothing is sent if there's no fd to
 * send on, or if neither UID nor GID is valid, i.e. no user/group was resolved. Returns 0 on success (or
 * when nothing was sent), a negative errno if writev() failed. */
static int send_user_lookup(
                Unit *unit,
                int user_lookup_fd,
                uid_t uid,
                gid_t gid) {

        assert(unit);

        if (user_lookup_fd < 0 || (!uid_is_valid(uid) && !gid_is_valid(gid)))
                return 0;

        struct iovec iov[] = {
                IOVEC_INIT(&uid, sizeof(uid)),
                IOVEC_INIT(&gid, sizeof(gid)),
                IOVEC_INIT_STRING(unit->id),
        };

        if (writev(user_lookup_fd, iov, (int) (sizeof(iov) / sizeof(iov[0]))) < 0)
                return -errno;

        return 0;
}
3942 
acquire_home(const ExecContext * c,uid_t uid,const char ** home,char ** buf)3943 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3944         int r;
3945 
3946         assert(c);
3947         assert(home);
3948         assert(buf);
3949 
3950         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3951 
3952         if (*home)
3953                 return 0;
3954 
3955         if (!c->working_directory_home)
3956                 return 0;
3957 
3958         r = get_home_dir(buf);
3959         if (r < 0)
3960                 return r;
3961 
3962         *home = *buf;
3963         return 1;
3964 }
3965 
compile_suggested_paths(const ExecContext * c,const ExecParameters * p,char *** ret)3966 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3967         _cleanup_strv_free_ char ** list = NULL;
3968         int r;
3969 
3970         assert(c);
3971         assert(p);
3972         assert(ret);
3973 
3974         assert(c->dynamic_user);
3975 
3976         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3977          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3978          * directories. */
3979 
3980         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3981                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3982                         continue;
3983 
3984                 if (!p->prefix[t])
3985                         continue;
3986 
3987                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3988                         char *e;
3989 
3990                         if (exec_directory_is_private(c, t))
3991                                 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3992                         else
3993                                 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3994                         if (!e)
3995                                 return -ENOMEM;
3996 
3997                         r = strv_consume(&list, e);
3998                         if (r < 0)
3999                                 return r;
4000                 }
4001         }
4002 
4003         *ret = TAKE_PTR(list);
4004 
4005         return 0;
4006 }
4007 
exec_parameters_get_cgroup_path(const ExecParameters * params,char ** ret)4008 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
4009         bool using_subcgroup;
4010         char *p;
4011 
4012         assert(params);
4013         assert(ret);
4014 
4015         if (!params->cgroup_path)
4016                 return -EINVAL;
4017 
4018         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4019          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4020          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4021          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4022          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4023          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4024          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4025          * flag, which is only passed for the former statements, not for the latter. */
4026 
4027         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
4028         if (using_subcgroup)
4029                 p = path_join(params->cgroup_path, ".control");
4030         else
4031                 p = strdup(params->cgroup_path);
4032         if (!p)
4033                 return -ENOMEM;
4034 
4035         *ret = p;
4036         return using_subcgroup;
4037 }
4038 
/* Derives a CPU affinity mask from the NUMA policy of the context. If no NUMA node mask is set this is
 * logged at debug level, 'ret' is left untouched and 0 is returned. Otherwise the NUMA policy is
 * converted into a CPU set, 'ret' is reset and then filled with it. Returns a negative value if the
 * conversion fails, and otherwise whatever cpu_set_add_all() returns. */
static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
        _cleanup_(cpu_set_reset) CPUSet derived = {};
        int r;

        assert(c);
        assert(ret);

        if (c->numa_policy.nodes.set) {
                r = numa_to_cpu_set(&c->numa_policy, &derived);
                if (r >= 0) {
                        /* Only drop the previous contents once we know we have something to replace them with. */
                        cpu_set_reset(ret);
                        r = cpu_set_add_all(ret, &derived);
                }

                return r;
        }

        log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
        return 0;
}
4059 
exec_context_get_cpu_affinity_from_numa(const ExecContext * c)4060 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4061         assert(c);
4062 
4063         return c->cpu_affinity_from_numa;
4064 }
4065 
add_shifted_fd(int * fds,size_t fds_size,size_t * n_fds,int fd,int * ret_fd)4066 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4067         int r;
4068 
4069         assert(fds);
4070         assert(n_fds);
4071         assert(*n_fds < fds_size);
4072         assert(ret_fd);
4073 
4074         if (fd < 0) {
4075                 *ret_fd = -1;
4076                 return 0;
4077         }
4078 
4079         if (fd < 3 + (int) *n_fds) {
4080                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4081                  * the fds we pass to the process (or which are closed only during execve). */
4082 
4083                 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4084                 if (r < 0)
4085                         return -errno;
4086 
4087                 CLOSE_AND_REPLACE(fd, r);
4088         }
4089 
4090         *ret_fd = fds[*n_fds] = fd;
4091         (*n_fds) ++;
4092         return 1;
4093 }
4094 
exec_child(Unit * unit,const ExecCommand * command,const ExecContext * context,const ExecParameters * params,ExecRuntime * runtime,DynamicCreds * dcreds,int socket_fd,const int named_iofds[static3],int * fds,size_t n_socket_fds,size_t n_storage_fds,char ** files_env,int user_lookup_fd,int * exit_status)4095 static int exec_child(
4096                 Unit *unit,
4097                 const ExecCommand *command,
4098                 const ExecContext *context,
4099                 const ExecParameters *params,
4100                 ExecRuntime *runtime,
4101                 DynamicCreds *dcreds,
4102                 int socket_fd,
4103                 const int named_iofds[static 3],
4104                 int *fds,
4105                 size_t n_socket_fds,
4106                 size_t n_storage_fds,
4107                 char **files_env,
4108                 int user_lookup_fd,
4109                 int *exit_status) {
4110 
4111         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
4112         int r, ngids = 0, exec_fd;
4113         _cleanup_free_ gid_t *supplementary_gids = NULL;
4114         const char *username = NULL, *groupname = NULL;
4115         _cleanup_free_ char *home_buffer = NULL;
4116         const char *home = NULL, *shell = NULL;
4117         char **final_argv = NULL;
4118         dev_t journal_stream_dev = 0;
4119         ino_t journal_stream_ino = 0;
4120         bool userns_set_up = false;
4121         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4122                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
4123                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
4124                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
4125 #if HAVE_SELINUX
4126         _cleanup_free_ char *mac_selinux_context_net = NULL;
4127         bool use_selinux = false;
4128 #endif
4129 #if ENABLE_SMACK
4130         bool use_smack = false;
4131 #endif
4132 #if HAVE_APPARMOR
4133         bool use_apparmor = false;
4134 #endif
4135         uid_t saved_uid = getuid();
4136         gid_t saved_gid = getgid();
4137         uid_t uid = UID_INVALID;
4138         gid_t gid = GID_INVALID;
4139         size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4140                n_keep_fds; /* total number of fds not to close */
4141         int secure_bits;
4142         _cleanup_free_ gid_t *gids_after_pam = NULL;
4143         int ngids_after_pam = 0;
4144 
4145         assert(unit);
4146         assert(command);
4147         assert(context);
4148         assert(params);
4149         assert(exit_status);
4150 
4151         /* Explicitly test for CVE-2021-4034 inspired invocations */
4152         assert(command->path);
4153         assert(!strv_isempty(command->argv));
4154 
4155         rename_process_from_path(command->path);
4156 
4157         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4158          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4159          * both of which will be demoted to SIG_DFL. */
4160         (void) default_signals(SIGNALS_CRASH_HANDLER,
4161                                SIGNALS_IGNORE);
4162 
4163         if (context->ignore_sigpipe)
4164                 (void) ignore_signals(SIGPIPE);
4165 
4166         r = reset_signal_mask();
4167         if (r < 0) {
4168                 *exit_status = EXIT_SIGNAL_MASK;
4169                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4170         }
4171 
4172         if (params->idle_pipe)
4173                 do_idle_pipe_dance(params->idle_pipe);
4174 
4175         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4176          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4177          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4178          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4179 
4180         log_forget_fds();
4181         log_set_open_when_needed(true);
4182 
4183         /* In case anything used libc syslog(), close this here, too */
4184         closelog();
4185 
4186         int keep_fds[n_fds + 3];
4187         memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4188         n_keep_fds = n_fds;
4189 
4190         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4191         if (r < 0) {
4192                 *exit_status = EXIT_FDS;
4193                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4194         }
4195 
4196 #if HAVE_LIBBPF
4197         if (unit->manager->restrict_fs) {
4198                 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4199                 if (bpf_map_fd < 0) {
4200                         *exit_status = EXIT_FDS;
4201                         return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4202                 }
4203 
4204                 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4205                 if (r < 0) {
4206                         *exit_status = EXIT_FDS;
4207                         return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4208                 }
4209         }
4210 #endif
4211 
4212         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4213         if (r < 0) {
4214                 *exit_status = EXIT_FDS;
4215                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4216         }
4217 
4218         if (!context->same_pgrp &&
4219             setsid() < 0) {
4220                 *exit_status = EXIT_SETSID;
4221                 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4222         }
4223 
4224         exec_context_tty_reset(context, params);
4225 
4226         if (unit_shall_confirm_spawn(unit)) {
4227                 _cleanup_free_ char *cmdline = NULL;
4228 
4229                 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4230                 if (!cmdline) {
4231                         *exit_status = EXIT_MEMORY;
4232                         return log_oom();
4233                 }
4234 
4235                 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4236                 if (r != CONFIRM_EXECUTE) {
4237                         if (r == CONFIRM_PRETEND_SUCCESS) {
4238                                 *exit_status = EXIT_SUCCESS;
4239                                 return 0;
4240                         }
4241                         *exit_status = EXIT_CONFIRM;
4242                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4243                                                     "Execution cancelled by the user");
4244                 }
4245         }
4246 
4247         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4248          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4249          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4250          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4251          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4252         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4253             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
4254                 *exit_status = EXIT_MEMORY;
4255                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4256         }
4257 
4258         if (context->dynamic_user && dcreds) {
4259                 _cleanup_strv_free_ char **suggested_paths = NULL;
4260 
4261                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4262                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4263                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4264                         *exit_status = EXIT_USER;
4265                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4266                 }
4267 
4268                 r = compile_suggested_paths(context, params, &suggested_paths);
4269                 if (r < 0) {
4270                         *exit_status = EXIT_MEMORY;
4271                         return log_oom();
4272                 }
4273 
4274                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
4275                 if (r < 0) {
4276                         *exit_status = EXIT_USER;
4277                         if (r == -EILSEQ)
4278                                 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4279                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
4280                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4281                 }
4282 
4283                 if (!uid_is_valid(uid)) {
4284                         *exit_status = EXIT_USER;
4285                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4286                 }
4287 
4288                 if (!gid_is_valid(gid)) {
4289                         *exit_status = EXIT_USER;
4290                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4291                 }
4292 
4293                 if (dcreds->user)
4294                         username = dcreds->user->name;
4295 
4296         } else {
4297                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4298                 if (r < 0) {
4299                         *exit_status = EXIT_USER;
4300                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
4301                 }
4302 
4303                 r = get_fixed_group(context, &groupname, &gid);
4304                 if (r < 0) {
4305                         *exit_status = EXIT_GROUP;
4306                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4307                 }
4308         }
4309 
4310         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4311         r = get_supplementary_groups(context, username, groupname, gid,
4312                                      &supplementary_gids, &ngids);
4313         if (r < 0) {
4314                 *exit_status = EXIT_GROUP;
4315                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
4316         }
4317 
4318         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4319         if (r < 0) {
4320                 *exit_status = EXIT_USER;
4321                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
4322         }
4323 
4324         user_lookup_fd = safe_close(user_lookup_fd);
4325 
4326         r = acquire_home(context, uid, &home, &home_buffer);
4327         if (r < 0) {
4328                 *exit_status = EXIT_CHDIR;
4329                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
4330         }
4331 
4332         /* If a socket is connected to STDIN/STDOUT/STDERR, we
4333          * must sure to drop O_NONBLOCK */
4334         if (socket_fd >= 0)
4335                 (void) fd_nonblock(socket_fd, false);
4336 
4337         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4338          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4339         if (params->cgroup_path) {
4340                 _cleanup_free_ char *p = NULL;
4341 
4342                 r = exec_parameters_get_cgroup_path(params, &p);
4343                 if (r < 0) {
4344                         *exit_status = EXIT_CGROUP;
4345                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4346                 }
4347 
4348                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4349                 if (r == -EUCLEAN) {
4350                         *exit_status = EXIT_CGROUP;
4351                         return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4352                                                     "because the cgroup or one of its parents or "
4353                                                     "siblings is in the threaded mode: %m", p);
4354                 }
4355                 if (r < 0) {
4356                         *exit_status = EXIT_CGROUP;
4357                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4358                 }
4359         }
4360 
4361         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
4362                 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4363                 if (r < 0) {
4364                         *exit_status = EXIT_NETWORK;
4365                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4366                 }
4367         }
4368 
4369         if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4370                 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4371                 if (r < 0) {
4372                         *exit_status = EXIT_NAMESPACE;
4373                         return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4374                 }
4375         }
4376 
4377         r = setup_input(context, params, socket_fd, named_iofds);
4378         if (r < 0) {
4379                 *exit_status = EXIT_STDIN;
4380                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
4381         }
4382 
4383         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4384         if (r < 0) {
4385                 *exit_status = EXIT_STDOUT;
4386                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
4387         }
4388 
4389         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4390         if (r < 0) {
4391                 *exit_status = EXIT_STDERR;
4392                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
4393         }
4394 
4395         if (context->oom_score_adjust_set) {
4396                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4397                  * prohibit write access to this file, and we shouldn't trip up over that. */
4398                 r = set_oom_score_adjust(context->oom_score_adjust);
4399                 if (ERRNO_IS_PRIVILEGE(r))
4400                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4401                 else if (r < 0) {
4402                         *exit_status = EXIT_OOM_ADJUST;
4403                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
4404                 }
4405         }
4406 
4407         if (context->coredump_filter_set) {
4408                 r = set_coredump_filter(context->coredump_filter);
4409                 if (ERRNO_IS_PRIVILEGE(r))
4410                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4411                 else if (r < 0)
4412                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4413         }
4414 
4415         if (context->nice_set) {
4416                 r = setpriority_closest(context->nice);
4417                 if (r < 0)
4418                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4419         }
4420 
4421         if (context->cpu_sched_set) {
4422                 struct sched_param param = {
4423                         .sched_priority = context->cpu_sched_priority,
4424                 };
4425 
4426                 r = sched_setscheduler(0,
4427                                        context->cpu_sched_policy |
4428                                        (context->cpu_sched_reset_on_fork ?
4429                                         SCHED_RESET_ON_FORK : 0),
4430                                        &param);
4431                 if (r < 0) {
4432                         *exit_status = EXIT_SETSCHEDULER;
4433                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
4434                 }
4435         }
4436 
4437         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4438                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4439                 const CPUSet *cpu_set;
4440 
4441                 if (context->cpu_affinity_from_numa) {
4442                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4443                         if (r < 0) {
4444                                 *exit_status = EXIT_CPUAFFINITY;
4445                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4446                         }
4447 
4448                         cpu_set = &converted_cpu_set;
4449                 } else
4450                         cpu_set = &context->cpu_set;
4451 
4452                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4453                         *exit_status = EXIT_CPUAFFINITY;
4454                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
4455                 }
4456         }
4457 
4458         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4459                 r = apply_numa_policy(&context->numa_policy);
4460                 if (r == -EOPNOTSUPP)
4461                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4462                 else if (r < 0) {
4463                         *exit_status = EXIT_NUMA_POLICY;
4464                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4465                 }
4466         }
4467 
4468         if (context->ioprio_set)
4469                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4470                         *exit_status = EXIT_IOPRIO;
4471                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4472                 }
4473 
4474         if (context->timer_slack_nsec != NSEC_INFINITY)
4475                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4476                         *exit_status = EXIT_TIMERSLACK;
4477                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4478                 }
4479 
4480         if (context->personality != PERSONALITY_INVALID) {
4481                 r = safe_personality(context->personality);
4482                 if (r < 0) {
4483                         *exit_status = EXIT_PERSONALITY;
4484                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4485                 }
4486         }
4487 
4488         if (context->utmp_id) {
4489                 const char *line = context->tty_path ?
4490                         (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4491                         NULL;
4492                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4493                                       line,
4494                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
4495                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4496                                       USER_PROCESS,
4497                                       username);
4498         }
4499 
4500         if (uid_is_valid(uid)) {
4501                 r = chown_terminal(STDIN_FILENO, uid);
4502                 if (r < 0) {
4503                         *exit_status = EXIT_STDIN;
4504                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4505                 }
4506         }
4507 
4508         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4509          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4510          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4511          * touch a single hierarchy too. */
4512         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
4513                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4514                 if (r < 0) {
4515                         *exit_status = EXIT_CGROUP;
4516                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4517                 }
4518         }
4519 
4520         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4521 
4522         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4523                 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4524                 if (r < 0)
4525                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4526         }
4527 
4528         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4529                 r = setup_credentials(context, params, unit->id, uid);
4530                 if (r < 0) {
4531                         *exit_status = EXIT_CREDENTIALS;
4532                         return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4533                 }
4534         }
4535 
4536         r = build_environment(
4537                         unit,
4538                         context,
4539                         params,
4540                         n_fds,
4541                         home,
4542                         username,
4543                         shell,
4544                         journal_stream_dev,
4545                         journal_stream_ino,
4546                         &our_env);
4547         if (r < 0) {
4548                 *exit_status = EXIT_MEMORY;
4549                 return log_oom();
4550         }
4551 
4552         r = build_pass_environment(context, &pass_env);
4553         if (r < 0) {
4554                 *exit_status = EXIT_MEMORY;
4555                 return log_oom();
4556         }
4557 
4558         /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4559          * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4560          * not specify PATH but the unit has ExecSearchPath. */
4561         if (!strv_isempty(context->exec_search_path)) {
4562                 _cleanup_free_ char *joined = NULL;
4563 
4564                 joined = strv_join(context->exec_search_path, ":");
4565                 if (!joined) {
4566                         *exit_status = EXIT_MEMORY;
4567                         return log_oom();
4568                 }
4569 
4570                 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4571                 if (r < 0) {
4572                         *exit_status = EXIT_MEMORY;
4573                         return log_oom();
4574                 }
4575         }
4576 
4577         accum_env = strv_env_merge(params->environment,
4578                                    our_env,
4579                                    joined_exec_search_path,
4580                                    pass_env,
4581                                    context->environment,
4582                                    files_env);
4583         if (!accum_env) {
4584                 *exit_status = EXIT_MEMORY;
4585                 return log_oom();
4586         }
4587         accum_env = strv_env_clean(accum_env);
4588 
4589         (void) umask(context->umask);
4590 
4591         r = setup_keyring(unit, context, params, uid, gid);
4592         if (r < 0) {
4593                 *exit_status = EXIT_KEYRING;
4594                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4595         }
4596 
4597         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4598          * from it. */
4599         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4600 
4601         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4602          * for it, and the kernel doesn't actually support ambient caps. */
4603         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4604 
4605         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4606          * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4607          * desired. */
4608         if (needs_ambient_hack)
4609                 needs_setuid = false;
4610         else
4611                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4612 
4613         if (needs_sandboxing) {
4614                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4615                  * /sys being present. The actual MAC context application will happen later, as late as
4616                  * possible, to avoid impacting our own code paths. */
4617 
4618 #if HAVE_SELINUX
4619                 use_selinux = mac_selinux_use();
4620 #endif
4621 #if ENABLE_SMACK
4622                 use_smack = mac_smack_use();
4623 #endif
4624 #if HAVE_APPARMOR
4625                 use_apparmor = mac_apparmor_use();
4626 #endif
4627         }
4628 
4629         if (needs_sandboxing) {
4630                 int which_failed;
4631 
4632                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4633                  * is set here. (See below.) */
4634 
4635                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4636                 if (r < 0) {
4637                         *exit_status = EXIT_LIMITS;
4638                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4639                 }
4640         }
4641 
4642         if (needs_setuid && context->pam_name && username) {
4643                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4644                  * wins here. (See above.) */
4645 
4646                 /* All fds passed in the fds array will be closed in the pam child process. */
4647                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4648                 if (r < 0) {
4649                         *exit_status = EXIT_PAM;
4650                         return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4651                 }
4652 
4653                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4654                 if (ngids_after_pam < 0) {
4655                         *exit_status = EXIT_MEMORY;
4656                         return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4657                 }
4658         }
4659 
4660         if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
4661                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4662                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4663                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4664 
4665                 userns_set_up = true;
4666                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4667                 if (r < 0) {
4668                         *exit_status = EXIT_USER;
4669                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4670                 }
4671         }
4672 
4673         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4674 
4675                 if (ns_type_supported(NAMESPACE_NET)) {
4676                         r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
4677                         if (r == -EPERM)
4678                                 log_unit_warning_errno(unit, r,
4679                                                        "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4680                         else if (r < 0) {
4681                                 *exit_status = EXIT_NETWORK;
4682                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4683                         }
4684                 } else if (context->network_namespace_path) {
4685                         *exit_status = EXIT_NETWORK;
4686                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4687                                                     "NetworkNamespacePath= is not supported, refusing.");
4688                 } else
4689                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
4690         }
4691 
4692         if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4693 
4694                 if (ns_type_supported(NAMESPACE_IPC)) {
4695                         r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4696                         if (r == -EPERM)
4697                                 log_unit_warning_errno(unit, r,
4698                                                        "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4699                         else if (r < 0) {
4700                                 *exit_status = EXIT_NAMESPACE;
4701                                 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4702                         }
4703                 } else if (context->ipc_namespace_path) {
4704                         *exit_status = EXIT_NAMESPACE;
4705                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4706                                                     "IPCNamespacePath= is not supported, refusing.");
4707                 } else
4708                         log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4709         }
4710 
4711         if (needs_mount_namespace) {
4712                 _cleanup_free_ char *error_path = NULL;
4713 
4714                 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
4715                 if (r < 0) {
4716                         *exit_status = EXIT_NAMESPACE;
4717                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4718                                                     error_path ? ": " : "", strempty(error_path));
4719                 }
4720         }
4721 
4722         if (needs_sandboxing) {
4723                 r = apply_protect_hostname(unit, context, exit_status);
4724                 if (r < 0)
4725                         return r;
4726         }
4727 
4728         /* Drop groups as early as possible.
4729          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4730          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4731         if (needs_setuid) {
4732                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4733                 int ngids_to_enforce = 0;
4734 
4735                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4736                                                    ngids,
4737                                                    gids_after_pam,
4738                                                    ngids_after_pam,
4739                                                    &gids_to_enforce);
4740                 if (ngids_to_enforce < 0) {
4741                         *exit_status = EXIT_MEMORY;
4742                         return log_unit_error_errno(unit,
4743                                                     ngids_to_enforce,
4744                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
4745                 }
4746 
4747                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4748                 if (r < 0) {
4749                         *exit_status = EXIT_GROUP;
4750                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4751                 }
4752         }
4753 
4754         /* If the user namespace was not set up above, try to do it now.
4755          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4756          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4757          * case of mount namespaces being less privileged when the mount point list is copied from a
4758          * different user namespace). */
4759 
4760         if (needs_sandboxing && context->private_users && !userns_set_up) {
4761                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4762                 if (r < 0) {
4763                         *exit_status = EXIT_USER;
4764                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4765                 }
4766         }
4767 
4768         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4769          * shall execute. */
4770 
4771         _cleanup_free_ char *executable = NULL;
4772         _cleanup_close_ int executable_fd = -1;
4773         r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4774         if (r < 0) {
4775                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4776                         log_unit_struct_errno(unit, LOG_INFO, r,
4777                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4778                                               LOG_UNIT_INVOCATION_ID(unit),
4779                                               LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4780                                                                command->path),
4781                                               "EXECUTABLE=%s", command->path);
4782                         return 0;
4783                 }
4784 
4785                 *exit_status = EXIT_EXEC;
4786 
4787                 return log_unit_struct_errno(unit, LOG_INFO, r,
4788                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4789                                              LOG_UNIT_INVOCATION_ID(unit),
4790                                              LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4791                                                               command->path),
4792                                              "EXECUTABLE=%s", command->path);
4793         }
4794 
4795         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4796         if (r < 0) {
4797                 *exit_status = EXIT_FDS;
4798                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4799         }
4800 
4801 #if HAVE_SELINUX
4802         if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4803                 int fd = -1;
4804 
4805                 if (socket_fd >= 0)
4806                         fd = socket_fd;
4807                 else if (params->n_socket_fds == 1)
4808                         /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4809                          * use context from that fd to compute the label. */
4810                         fd = params->fds[0];
4811 
4812                 if (fd >= 0) {
4813                         r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4814                         if (r < 0) {
4815                                 if (!context->selinux_context_ignore) {
4816                                         *exit_status = EXIT_SELINUX_CONTEXT;
4817                                         return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4818                                 }
4819                                 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
4820                         }
4821                 }
4822         }
4823 #endif
4824 
4825         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
4826          * more aggressive this time, since we don't need socket_fd and the netns and ipcns fds anymore. We do keep the exec_fd
4827          * however if we have it as we want to keep it open until the final execve(). */
4828 
4829         r = close_all_fds(keep_fds, n_keep_fds);
4830         if (r >= 0)
4831                 r = shift_fds(fds, n_fds);
4832         if (r >= 0)
4833                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
4834         if (r < 0) {
4835                 *exit_status = EXIT_FDS;
4836                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4837         }
4838 
4839         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4840          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4841          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4842          * came this far. */
4843 
4844         secure_bits = context->secure_bits;
4845 
4846         if (needs_sandboxing) {
4847                 uint64_t bset;
4848 
4849                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4850                  * requested. (Note this is placed after the general resource limit initialization, see
4851                  * above, in order to take precedence.) */
4852                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4853                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4854                                 *exit_status = EXIT_LIMITS;
4855                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4856                         }
4857                 }
4858 
4859 #if ENABLE_SMACK
4860                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4861                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4862                 if (use_smack) {
4863                         r = setup_smack(context, executable_fd);
4864                         if (r < 0 && !context->smack_process_label_ignore) {
4865                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4866                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4867                         }
4868                 }
4869 #endif
4870 
4871                 bset = context->capability_bounding_set;
4872                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4873                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4874                  * instead of us doing that */
4875                 if (needs_ambient_hack)
4876                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
4877                                 (UINT64_C(1) << CAP_SETUID) |
4878                                 (UINT64_C(1) << CAP_SETGID);
4879 
4880                 if (!cap_test_all(bset)) {
4881                         r = capability_bounding_set_drop(bset, false);
4882                         if (r < 0) {
4883                                 *exit_status = EXIT_CAPABILITIES;
4884                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
4885                         }
4886                 }
4887 
4888                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4889                  * keep-caps set.
4890                  * To be able to raise the ambient capabilities after setresuid() they have to be
4891                  * added to the inherited set and keep caps has to be set (done in enforce_user()).
4892                  * After setresuid() the ambient capabilities can be raised as they are present in
4893                  * the permitted and inheritable set. However it is possible that someone wants to
4894                  * set ambient capabilities without changing the user, so we also set the ambient
4895                  * capabilities here.
4896                  * The requested ambient capabilities are raised in the inheritable set if the
4897                  * second argument is true. */
4898                 if (!needs_ambient_hack) {
4899                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
4900                         if (r < 0) {
4901                                 *exit_status = EXIT_CAPABILITIES;
4902                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
4903                         }
4904                 }
4905         }
4906 
4907         /* chroot to root directory first, before we lose the ability to chroot */
4908         r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4909         if (r < 0)
4910                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4911 
4912         if (needs_setuid) {
4913                 if (uid_is_valid(uid)) {
4914                         r = enforce_user(context, uid);
4915                         if (r < 0) {
4916                                 *exit_status = EXIT_USER;
4917                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
4918                         }
4919 
4920                         if (!needs_ambient_hack &&
4921                             context->capability_ambient_set != 0) {
4922 
4923                                 /* Raise the ambient capabilities after user change. */
4924                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4925                                 if (r < 0) {
4926                                         *exit_status = EXIT_CAPABILITIES;
4927                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
4928                                 }
4929                         }
4930                 }
4931         }
4932 
4933         /* Apply working directory here, because the working directory might be on NFS and only the user running
4934          * this service might have the correct privilege to change to the working directory */
4935         r = apply_working_directory(context, params, home, exit_status);
4936         if (r < 0)
4937                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4938 
4939         if (needs_sandboxing) {
4940                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4941                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4942                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4943                  * are restricted. */
4944 
4945 #if HAVE_SELINUX
4946                 if (use_selinux) {
4947                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4948 
4949                         if (exec_context) {
4950                                 r = setexeccon(exec_context);
4951                                 if (r < 0) {
4952                                         if (!context->selinux_context_ignore) {
4953                                                 *exit_status = EXIT_SELINUX_CONTEXT;
4954                                                 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4955                                         }
4956                                         log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
4957                                 }
4958                         }
4959                 }
4960 #endif
4961 
4962 #if HAVE_APPARMOR
4963                 if (use_apparmor && context->apparmor_profile) {
4964                         r = aa_change_onexec(context->apparmor_profile);
4965                         if (r < 0 && !context->apparmor_profile_ignore) {
4966                                 *exit_status = EXIT_APPARMOR_PROFILE;
4967                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
4968                         }
4969                 }
4970 #endif
4971 
4972                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
4973                  * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4974                  * CAP_SETPCAP. */
4975                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4976                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4977                          * effective set here.
4978                          * The effective set is overwritten during execve  with the following  values:
4979                          * - ambient set (for non-root processes)
4980                          * - (inheritable | bounding) set for root processes)
4981                          *
4982                          * Hence there is no security impact to raise it in the effective set before execve
4983                          */
4984                         r = capability_gain_cap_setpcap(NULL);
4985                         if (r < 0) {
4986                                 *exit_status = EXIT_CAPABILITIES;
4987                                 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4988                         }
4989                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
4990                                 *exit_status = EXIT_SECUREBITS;
4991                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
4992                         }
4993                 }
4994 
4995                 if (context_has_no_new_privileges(context))
4996                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
4997                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
4998                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
4999                         }
5000 
5001 #if HAVE_SECCOMP
5002                 r = apply_address_families(unit, context);
5003                 if (r < 0) {
5004                         *exit_status = EXIT_ADDRESS_FAMILIES;
5005                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
5006                 }
5007 
5008                 r = apply_memory_deny_write_execute(unit, context);
5009                 if (r < 0) {
5010                         *exit_status = EXIT_SECCOMP;
5011                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
5012                 }
5013 
5014                 r = apply_restrict_realtime(unit, context);
5015                 if (r < 0) {
5016                         *exit_status = EXIT_SECCOMP;
5017                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
5018                 }
5019 
5020                 r = apply_restrict_suid_sgid(unit, context);
5021                 if (r < 0) {
5022                         *exit_status = EXIT_SECCOMP;
5023                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5024                 }
5025 
5026                 r = apply_restrict_namespaces(unit, context);
5027                 if (r < 0) {
5028                         *exit_status = EXIT_SECCOMP;
5029                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
5030                 }
5031 
5032                 r = apply_protect_sysctl(unit, context);
5033                 if (r < 0) {
5034                         *exit_status = EXIT_SECCOMP;
5035                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
5036                 }
5037 
5038                 r = apply_protect_kernel_modules(unit, context);
5039                 if (r < 0) {
5040                         *exit_status = EXIT_SECCOMP;
5041                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
5042                 }
5043 
5044                 r = apply_protect_kernel_logs(unit, context);
5045                 if (r < 0) {
5046                         *exit_status = EXIT_SECCOMP;
5047                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5048                 }
5049 
5050                 r = apply_protect_clock(unit, context);
5051                 if (r < 0) {
5052                         *exit_status = EXIT_SECCOMP;
5053                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5054                 }
5055 
5056                 r = apply_private_devices(unit, context);
5057                 if (r < 0) {
5058                         *exit_status = EXIT_SECCOMP;
5059                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5060                 }
5061 
5062                 r = apply_syscall_archs(unit, context);
5063                 if (r < 0) {
5064                         *exit_status = EXIT_SECCOMP;
5065                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5066                 }
5067 
5068                 r = apply_lock_personality(unit, context);
5069                 if (r < 0) {
5070                         *exit_status = EXIT_SECCOMP;
5071                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5072                 }
5073 
5074                 r = apply_syscall_log(unit, context);
5075                 if (r < 0) {
5076                         *exit_status = EXIT_SECCOMP;
5077                         return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5078                 }
5079 
5080                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5081                  * by the filter as little as possible. */
5082                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5083                 if (r < 0) {
5084                         *exit_status = EXIT_SECCOMP;
5085                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5086                 }
5087 #endif
5088 
5089 #if HAVE_LIBBPF
5090                 r = apply_restrict_filesystems(unit, context);
5091                 if (r < 0) {
5092                         *exit_status = EXIT_BPF;
5093                         return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5094                 }
5095 #endif
5096 
5097         }
5098 
5099         if (!strv_isempty(context->unset_environment)) {
5100                 char **ee = NULL;
5101 
5102                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5103                 if (!ee) {
5104                         *exit_status = EXIT_MEMORY;
5105                         return log_oom();
5106                 }
5107 
5108                 strv_free_and_replace(accum_env, ee);
5109         }
5110 
5111         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5112                 replaced_argv = replace_env_argv(command->argv, accum_env);
5113                 if (!replaced_argv) {
5114                         *exit_status = EXIT_MEMORY;
5115                         return log_oom();
5116                 }
5117                 final_argv = replaced_argv;
5118         } else
5119                 final_argv = command->argv;
5120 
5121         if (DEBUG_LOGGING) {
5122                 _cleanup_free_ char *line = NULL;
5123 
5124                 line = quote_command_line(final_argv, SHELL_ESCAPE_EMPTY);
5125                 if (!line) {
5126                         *exit_status = EXIT_MEMORY;
5127                         return log_oom();
5128                 }
5129 
5130                 log_unit_struct(unit, LOG_DEBUG,
5131                                 "EXECUTABLE=%s", executable,
5132                                 LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
5133         }
5134 
5135         if (exec_fd >= 0) {
5136                 uint8_t hot = 1;
5137 
5138                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5139                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5140 
5141                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5142                         *exit_status = EXIT_EXEC;
5143                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5144                 }
5145         }
5146 
5147         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5148 
5149         if (exec_fd >= 0) {
5150                 uint8_t hot = 0;
5151 
5152                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5153                  * that POLLHUP on it no longer means execve() succeeded. */
5154 
5155                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5156                         *exit_status = EXIT_EXEC;
5157                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5158                 }
5159         }
5160 
5161         *exit_status = EXIT_EXEC;
5162         return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5163 }
5164 
5165 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
5166 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
5167 
/* Forks off a child process for 'command' and runs exec_child() in it.
 *
 * Parent side, in order: checks the socket configuration, resolves the named stdio fds, loads the
 * environment files, logs the command line, creates the sub-cgroup (if one is in use), forks, attaches
 * the child to the sub-cgroup and records the start in command->exec_status.
 *
 * On success 0 is returned and the child's PID is stored in *ret. On failure the value produced by the
 * respective logging helper (log_unit_error_errno() or log_oom()) is returned and *ret is left untouched.
 *
 * The forked child never returns from this function: it terminates via _exit() with the exit status
 * that exec_child() filled in. */
int exec_spawn(Unit *unit,
               ExecCommand *command,
               const ExecContext *context,
               const ExecParameters *params,
               ExecRuntime *runtime,
               DynamicCreds *dcreds,
               pid_t *ret) {

        _cleanup_free_ char *subcgroup_path = NULL, *cmdline = NULL;
        _cleanup_strv_free_ char **files_env = NULL;
        size_t n_socket_fds = 0, n_storage_fds = 0;
        int named_iofds[3] = { -1, -1, -1 };
        int *fds = NULL, socket_fd = -1, r;
        bool stdio_on_socket;
        pid_t child;

        assert(unit);
        assert(command);
        assert(context);
        assert(ret);
        assert(params);
        assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));

        stdio_on_socket =
                context->std_input == EXEC_INPUT_SOCKET ||
                context->std_output == EXEC_OUTPUT_SOCKET ||
                context->std_error == EXEC_OUTPUT_SOCKET;

        if (!stdio_on_socket) {
                /* No stdio stream is connected to a socket: hand the whole fd array over to the child. */
                fds = params->fds;
                n_socket_fds = params->n_socket_fds;
                n_storage_fds = params->n_storage_fds;
        } else if (params->n_socket_fds > 1)
                return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
        else if (params->n_socket_fds == 0)
                return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
        else
                /* Exactly one socket: it is passed separately as socket_fd, not as part of fds[]. */
                socket_fd = params->fds[0];

        r = exec_context_named_iofds(context, params, named_iofds);
        if (r < 0)
                return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");

        r = exec_context_load_environment(unit, context, &files_env);
        if (r < 0)
                return log_unit_error_errno(unit, r, "Failed to load environment files: %m");

        cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
        if (!cmdline)
                return log_oom();

        /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
         * and, until the next SELinux policy changes, we save further reloads in future children. */
        mac_selinux_maybe_reload();

        /* We won't know the real executable path until we create the mount namespace in the child, but
         * we want to log from the parent, so the (possibly inaccurate) command->path is used for
         * EXECUTABLE= here. */
        log_unit_struct(unit, LOG_DEBUG,
                        LOG_UNIT_MESSAGE(unit, "About to execute %s", cmdline),
                        "EXECUTABLE=%s", command->path,
                        LOG_UNIT_INVOCATION_ID(unit));

        if (params->cgroup_path) {
                r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
                if (r < 0)
                        return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");

                if (r > 0) {
                        /* A positive return value means we are using a child cgroup, which has to be
                         * created before the fork. */
                        r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
                        if (r < 0)
                                return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);

                        /* Normally we would not propagate the oomd xattrs to children but since we
                         * created this sub-cgroup internally we should do it. */
                        cgroup_oomd_xattr_apply(unit, subcgroup_path);
                }
        }

        child = fork();
        if (child < 0)
                return log_unit_error_errno(unit, errno, "Failed to fork: %m");

        if (child == 0) {
                /* Child side. exec_child() reports the step it stopped at through exit_status. */
                int exit_status = EXIT_SUCCESS;

                r = exec_child(unit,
                               command,
                               context,
                               params,
                               runtime,
                               dcreds,
                               socket_fd,
                               named_iofds,
                               fds,
                               n_socket_fds,
                               n_storage_fds,
                               files_env,
                               unit->manager->user_lookup_fds[1],
                               &exit_status);

                if (r < 0) {
                        const char *step =
                                exit_status_to_string(exit_status,
                                                      EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);

                        log_unit_struct_errno(unit, LOG_ERR, r,
                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
                                              LOG_UNIT_INVOCATION_ID(unit),
                                              LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
                                                               step, command->path),
                                              "EXECUTABLE=%s", command->path);
                }

                _exit(exit_status);
        }

        /* Parent side from here on. */
        log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, child);

        /* We add the new process to the cgroup both in the child (so that we can be sure that no user
         * code is ever executed outside of the cgroup) and in the parent (so that we can be sure that
         * when we kill the cgroup the process will be killed too). */
        if (subcgroup_path)
                (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, child);

        exec_status_start(&command->exec_status, child);

        *ret = child;
        return 0;
}
5298 
/* Fills in the non-zero default settings of an ExecContext. Only the fields listed here are written;
 * everything else in *c is left as the caller provided it. */
void exec_context_init(ExecContext *c) {
        assert(c);

        /* Process attributes */
        c->umask = 0022;
        c->personality = PERSONALITY_INVALID;
        c->ignore_sigpipe = true;
        c->capability_bounding_set = CAP_ALL;

        /* Scheduling */
        c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
        c->cpu_sched_policy = SCHED_OTHER;
        c->timer_slack_nsec = NSEC_INFINITY;
        numa_policy_reset(&c->numa_policy);

        /* Logging */
        c->syslog_priority = LOG_DAEMON|LOG_INFO;
        c->syslog_level_prefix = true;
        c->log_level_max = -1;

        /* Managed directories: every directory type starts out with mode 0755 */
        for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
                c->directories[dt].mode = 0755;
        }

        c->timeout_clean_usec = USEC_INFINITY;

        /* Sandboxing */
        assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
        c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
#if HAVE_SECCOMP
        c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
#endif

        /* Terminal size: UINT_MAX for both dimensions */
        c->tty_rows = UINT_MAX;
        c->tty_cols = UINT_MAX;
}
5324 
/* Releases everything owned by an ExecContext and resets the affected fields (pointers to the value
 * returned by the respective free helper, sizes and counters to 0, log_level_max to -1). The
 * ExecContext object itself is not freed. */
void exec_context_done(ExecContext *c) {
        assert(c);

        /* Environment */
        c->environment = strv_free(c->environment);
        c->environment_files = strv_free(c->environment_files);
        c->pass_environment = strv_free(c->pass_environment);
        c->unset_environment = strv_free(c->unset_environment);

        /* Resource limits */
        rlimit_free_all(c->rlimit);

        /* Standard input/output/error, one slot per stream */
        for (size_t stream = 0; stream < 3; stream++) {
                c->stdio_file[stream] = mfree(c->stdio_file[stream]);
                c->stdio_fdname[stream] = mfree(c->stdio_fdname[stream]);
        }

        c->stdin_data = mfree(c->stdin_data);
        c->stdin_data_size = 0;

        c->tty_path = mfree(c->tty_path);

        /* Working directory, root directory/image and its verity data */
        c->working_directory = mfree(c->working_directory);
        c->root_directory = mfree(c->root_directory);
        c->root_image = mfree(c->root_image);
        c->root_image_options = mount_options_free_all(c->root_image_options);
        c->root_verity = mfree(c->root_verity);

        c->root_hash = mfree(c->root_hash);
        c->root_hash_size = 0;
        c->root_hash_path = mfree(c->root_hash_path);

        c->root_hash_sig = mfree(c->root_hash_sig);
        c->root_hash_sig_size = 0;
        c->root_hash_sig_path = mfree(c->root_hash_sig_path);

        /* Extensions */
        c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
        c->extension_directories = strv_free(c->extension_directories);

        /* User, group and session data */
        c->user = mfree(c->user);
        c->group = mfree(c->group);
        c->supplementary_groups = strv_free(c->supplementary_groups);
        c->pam_name = mfree(c->pam_name);
        c->utmp_id = mfree(c->utmp_id);

        /* Path lists */
        c->read_only_paths = strv_free(c->read_only_paths);
        c->read_write_paths = strv_free(c->read_write_paths);
        c->inaccessible_paths = strv_free(c->inaccessible_paths);
        c->exec_paths = strv_free(c->exec_paths);
        c->no_exec_paths = strv_free(c->no_exec_paths);
        c->exec_search_path = strv_free(c->exec_search_path);

        /* Mounts: these helpers do not hand back a replacement value, so pointer and counter are
         * reset explicitly afterwards. */
        bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
        c->bind_mounts = NULL;
        c->n_bind_mounts = 0;

        temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
        c->temporary_filesystems = NULL;
        c->n_temporary_filesystems = 0;

        c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);

        /* CPU and NUMA placement */
        cpu_set_reset(&c->cpu_set);
        numa_policy_reset(&c->numa_policy);

        /* MAC labels */
        c->selinux_context = mfree(c->selinux_context);
        c->apparmor_profile = mfree(c->apparmor_profile);
        c->smack_process_label = mfree(c->smack_process_label);

        /* Filter sets */
        c->restrict_filesystems = set_free(c->restrict_filesystems);
        c->syscall_filter = hashmap_free(c->syscall_filter);
        c->syscall_archs = set_free(c->syscall_archs);
        c->address_families = set_free(c->address_families);

        /* Managed directories */
        for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
                exec_directory_done(&c->directories[dt]);
        }

        /* Logging */
        c->syslog_identifier = mfree(c->syslog_identifier);
        c->log_level_max = -1;
        exec_context_free_log_extra_fields(c);
        c->log_ratelimit_interval_usec = 0;
        c->log_ratelimit_burst = 0;
        c->log_namespace = mfree(c->log_namespace);

        /* Namespace paths */
        c->network_namespace_path = mfree(c->network_namespace_path);
        c->ipc_namespace_path = mfree(c->ipc_namespace_path);

        /* Credentials */
        c->load_credentials = hashmap_free(c->load_credentials);
        c->set_credentials = hashmap_free(c->set_credentials);
}
5412 
exec_context_destroy_runtime_directory(const ExecContext * c,const char * runtime_prefix)5413 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
5414         assert(c);
5415 
5416         if (!runtime_prefix)
5417                 return 0;
5418 
5419         for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
5420                 _cleanup_free_ char *p = NULL;
5421 
5422                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5423                         p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5424                 else
5425                         p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5426                 if (!p)
5427                         return -ENOMEM;
5428 
5429                 /* We execute this synchronously, since we need to be sure this is gone when we start the
5430                  * service next. */
5431                 (void) rm_rf(p, REMOVE_ROOT);
5432 
5433                 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5434                         _cleanup_free_ char *symlink_abs = NULL;
5435 
5436                         if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5437                                 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5438                         else
5439                                 symlink_abs = path_join(runtime_prefix, *symlink);
5440                         if (!symlink_abs)
5441                                 return -ENOMEM;
5442 
5443                         (void) unlink(symlink_abs);
5444                 }
5445 
5446         }
5447 
5448         return 0;
5449 }
5450 
exec_context_destroy_credentials(const ExecContext * c,const char * runtime_prefix,const char * unit)5451 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
5452         _cleanup_free_ char *p = NULL;
5453 
5454         assert(c);
5455 
5456         if (!runtime_prefix || !unit)
5457                 return 0;
5458 
5459         p = path_join(runtime_prefix, "credentials", unit);
5460         if (!p)
5461                 return -ENOMEM;
5462 
5463         /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5464          * unmount it, and afterwards remove the mount point */
5465         (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5466         (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5467 
5468         return 0;
5469 }
5470 
/* Releases the path and argument vector of a single ExecCommand and resets both fields. The
 * ExecCommand object itself is not freed. */
static void exec_command_done(ExecCommand *c) {
        assert(c);

        c->argv = strv_free(c->argv);
        c->path = mfree(c->path);
}
5477 
/* Calls exec_command_done() on each of the first 'n' entries of the array 'c', in order. */
void exec_command_done_array(ExecCommand *c, size_t n) {
        size_t remaining = n;

        for (ExecCommand *cmd = c; remaining > 0; cmd++, remaining--)
                exec_command_done(cmd);
}
5482 
/* Frees a whole linked list of ExecCommand objects, head first: each entry is unlinked, its contents
 * are released and the entry itself is freed. Always returns NULL, so that the result can be assigned
 * back to the list head. */
ExecCommand* exec_command_free_list(ExecCommand *c) {
        while (c) {
                ExecCommand *head = c;

                /* Unlinking the head entry is what advances 'c' to the next one. */
                LIST_REMOVE(command, c, head);
                exec_command_done(head);
                free(head);
        }

        return NULL;
}
5494 
/* Frees each of the first 'n' command lists in 'c' and stores the value returned by
 * exec_command_free_list() back into the respective slot. */
void exec_command_free_array(ExecCommand **c, size_t n) {
        size_t remaining = n;

        for (ExecCommand **slot = c; remaining > 0; slot++, remaining--)
                *slot = exec_command_free_list(*slot);
}
5499 
/* Resets the exec_status of each of the first 'n' entries of the array 'c', in order. */
void exec_command_reset_status_array(ExecCommand *c, size_t n) {
        size_t remaining = n;

        for (ExecCommand *cmd = c; remaining > 0; cmd++, remaining--)
                exec_status_reset(&cmd->exec_status);
}
5504 
/* Treats 'c' as an array of 'n' command lists and resets the exec_status of every command on every
 * list. */
void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
        for (size_t idx = 0; idx < n; idx++) {
                LIST_FOREACH(command, cmd, c[idx]) {
                        exec_status_reset(&cmd->exec_status);
                }
        }
}
5510 
5511 typedef struct InvalidEnvInfo {
5512         const Unit *unit;
5513         const char *path;
5514 } InvalidEnvInfo;
5515 
invalid_env(const char * p,void * userdata)5516 static void invalid_env(const char *p, void *userdata) {
5517         InvalidEnvInfo *info = userdata;
5518 
5519         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
5520 }
5521 
exec_context_fdname(const ExecContext * c,int fd_index)5522 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5523         assert(c);
5524 
5525         switch (fd_index) {
5526 
5527         case STDIN_FILENO:
5528                 if (c->std_input != EXEC_INPUT_NAMED_FD)
5529                         return NULL;
5530 
5531                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5532 
5533         case STDOUT_FILENO:
5534                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5535                         return NULL;
5536 
5537                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5538 
5539         case STDERR_FILENO:
5540                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5541                         return NULL;
5542 
5543                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5544 
5545         default:
5546                 return NULL;
5547         }
5548 }
5549 
exec_context_named_iofds(const ExecContext * c,const ExecParameters * p,int named_iofds[static3])5550 static int exec_context_named_iofds(
5551                 const ExecContext *c,
5552                 const ExecParameters *p,
5553                 int named_iofds[static 3]) {
5554 
5555         size_t targets;
5556         const char* stdio_fdname[3];
5557         size_t n_fds;
5558 
5559         assert(c);
5560         assert(p);
5561         assert(named_iofds);
5562 
5563         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5564                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5565                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
5566 
5567         for (size_t i = 0; i < 3; i++)
5568                 stdio_fdname[i] = exec_context_fdname(c, i);
5569 
5570         n_fds = p->n_storage_fds + p->n_socket_fds;
5571 
5572         for (size_t i = 0; i < n_fds  && targets > 0; i++)
5573                 if (named_iofds[STDIN_FILENO] < 0 &&
5574                     c->std_input == EXEC_INPUT_NAMED_FD &&
5575                     stdio_fdname[STDIN_FILENO] &&
5576                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5577 
5578                         named_iofds[STDIN_FILENO] = p->fds[i];
5579                         targets--;
5580 
5581                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5582                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
5583                            stdio_fdname[STDOUT_FILENO] &&
5584                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5585 
5586                         named_iofds[STDOUT_FILENO] = p->fds[i];
5587                         targets--;
5588 
5589                 } else if (named_iofds[STDERR_FILENO] < 0 &&
5590                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
5591                            stdio_fdname[STDERR_FILENO] &&
5592                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5593 
5594                         named_iofds[STDERR_FILENO] = p->fds[i];
5595                         targets--;
5596                 }
5597 
5598         return targets == 0 ? 0 : -ENOENT;
5599 }
5600 
exec_context_load_environment(const Unit * unit,const ExecContext * c,char *** ret)5601 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5602         _cleanup_strv_free_ char **v = NULL;
5603         int r;
5604 
5605         assert(c);
5606         assert(ret);
5607 
5608         STRV_FOREACH(i, c->environment_files) {
5609                 _cleanup_globfree_ glob_t pglob = {};
5610                 bool ignore = false;
5611                 char *fn = *i;
5612 
5613                 if (fn[0] == '-') {
5614                         ignore = true;
5615                         fn++;
5616                 }
5617 
5618                 if (!path_is_absolute(fn)) {
5619                         if (ignore)
5620                                 continue;
5621                         return -EINVAL;
5622                 }
5623 
5624                 /* Filename supports globbing, take all matching files */
5625                 r = safe_glob(fn, 0, &pglob);
5626                 if (r < 0) {
5627                         if (ignore)
5628                                 continue;
5629                         return r;
5630                 }
5631 
5632                 /* When we don't match anything, -ENOENT should be returned */
5633                 assert(pglob.gl_pathc > 0);
5634 
5635                 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
5636                         _cleanup_strv_free_ char **p = NULL;
5637 
5638                         r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5639                         if (r < 0) {
5640                                 if (ignore)
5641                                         continue;
5642                                 return r;
5643                         }
5644 
5645                         /* Log invalid environment variables with filename */
5646                         if (p) {
5647                                 InvalidEnvInfo info = {
5648                                         .unit = unit,
5649                                         .path = pglob.gl_pathv[n]
5650                                 };
5651 
5652                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
5653                         }
5654 
5655                         if (!v)
5656                                 v = TAKE_PTR(p);
5657                         else {
5658                                 char **m = strv_env_merge(v, p);
5659                                 if (!m)
5660                                         return -ENOMEM;
5661 
5662                                 strv_free_and_replace(v, m);
5663                         }
5664                 }
5665         }
5666 
5667         *ret = TAKE_PTR(v);
5668 
5669         return 0;
5670 }
5671 
tty_may_match_dev_console(const char * tty)5672 static bool tty_may_match_dev_console(const char *tty) {
5673         _cleanup_free_ char *resolved = NULL;
5674 
5675         if (!tty)
5676                 return true;
5677 
5678         tty = skip_dev_prefix(tty);
5679 
5680         /* trivial identity? */
5681         if (streq(tty, "console"))
5682                 return true;
5683 
5684         if (resolve_dev_console(&resolved) < 0)
5685                 return true; /* if we could not resolve, assume it may */
5686 
5687         /* "tty0" means the active VC, so it may be the same sometimes */
5688         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5689 }
5690 
exec_context_may_touch_tty(const ExecContext * ec)5691 static bool exec_context_may_touch_tty(const ExecContext *ec) {
5692         assert(ec);
5693 
5694         return ec->tty_reset ||
5695                 ec->tty_vhangup ||
5696                 ec->tty_vt_disallocate ||
5697                 is_terminal_input(ec->std_input) ||
5698                 is_terminal_output(ec->std_output) ||
5699                 is_terminal_output(ec->std_error);
5700 }
5701 
/* Returns true if the context may touch a TTY at all, and that TTY may be /dev/console. */
bool exec_context_may_touch_console(const ExecContext *ec) {

        /* The TTY path is only looked at if the context may touch a TTY in the first place */
        if (!exec_context_may_touch_tty(ec))
                return false;

        return tty_may_match_dev_console(exec_context_tty_path(ec));
}
5707 
strv_fprintf(FILE * f,char ** l)5708 static void strv_fprintf(FILE *f, char **l) {
5709         assert(f);
5710 
5711         STRV_FOREACH(g, l)
5712                 fprintf(f, " %s", *g);
5713 }
5714 
/* Prints "<prefix><name>: item item …\n" to 'f' for the string list 'strv'. Nothing at all is
 * written if the list is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5726 
exec_context_dump(const ExecContext * c,FILE * f,const char * prefix)5727 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5728         int r;
5729 
5730         assert(c);
5731         assert(f);
5732 
5733         prefix = strempty(prefix);
5734 
5735         fprintf(f,
5736                 "%sUMask: %04o\n"
5737                 "%sWorkingDirectory: %s\n"
5738                 "%sRootDirectory: %s\n"
5739                 "%sNonBlocking: %s\n"
5740                 "%sPrivateTmp: %s\n"
5741                 "%sPrivateDevices: %s\n"
5742                 "%sProtectKernelTunables: %s\n"
5743                 "%sProtectKernelModules: %s\n"
5744                 "%sProtectKernelLogs: %s\n"
5745                 "%sProtectClock: %s\n"
5746                 "%sProtectControlGroups: %s\n"
5747                 "%sPrivateNetwork: %s\n"
5748                 "%sPrivateUsers: %s\n"
5749                 "%sProtectHome: %s\n"
5750                 "%sProtectSystem: %s\n"
5751                 "%sMountAPIVFS: %s\n"
5752                 "%sIgnoreSIGPIPE: %s\n"
5753                 "%sMemoryDenyWriteExecute: %s\n"
5754                 "%sRestrictRealtime: %s\n"
5755                 "%sRestrictSUIDSGID: %s\n"
5756                 "%sKeyringMode: %s\n"
5757                 "%sProtectHostname: %s\n"
5758                 "%sProtectProc: %s\n"
5759                 "%sProcSubset: %s\n",
5760                 prefix, c->umask,
5761                 prefix, empty_to_root(c->working_directory),
5762                 prefix, empty_to_root(c->root_directory),
5763                 prefix, yes_no(c->non_blocking),
5764                 prefix, yes_no(c->private_tmp),
5765                 prefix, yes_no(c->private_devices),
5766                 prefix, yes_no(c->protect_kernel_tunables),
5767                 prefix, yes_no(c->protect_kernel_modules),
5768                 prefix, yes_no(c->protect_kernel_logs),
5769                 prefix, yes_no(c->protect_clock),
5770                 prefix, yes_no(c->protect_control_groups),
5771                 prefix, yes_no(c->private_network),
5772                 prefix, yes_no(c->private_users),
5773                 prefix, protect_home_to_string(c->protect_home),
5774                 prefix, protect_system_to_string(c->protect_system),
5775                 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5776                 prefix, yes_no(c->ignore_sigpipe),
5777                 prefix, yes_no(c->memory_deny_write_execute),
5778                 prefix, yes_no(c->restrict_realtime),
5779                 prefix, yes_no(c->restrict_suid_sgid),
5780                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5781                 prefix, yes_no(c->protect_hostname),
5782                 prefix, protect_proc_to_string(c->protect_proc),
5783                 prefix, proc_subset_to_string(c->proc_subset));
5784 
5785         if (c->root_image)
5786                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5787 
5788         if (c->root_image_options) {
5789                 fprintf(f, "%sRootImageOptions:", prefix);
5790                 LIST_FOREACH(mount_options, o, c->root_image_options)
5791                         if (!isempty(o->options))
5792                                 fprintf(f, " %s:%s",
5793                                         partition_designator_to_string(o->partition_designator),
5794                                         o->options);
5795                 fprintf(f, "\n");
5796         }
5797 
5798         if (c->root_hash) {
5799                 _cleanup_free_ char *encoded = NULL;
5800                 encoded = hexmem(c->root_hash, c->root_hash_size);
5801                 if (encoded)
5802                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5803         }
5804 
5805         if (c->root_hash_path)
5806                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5807 
5808         if (c->root_hash_sig) {
5809                 _cleanup_free_ char *encoded = NULL;
5810                 ssize_t len;
5811                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5812                 if (len)
5813                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5814         }
5815 
5816         if (c->root_hash_sig_path)
5817                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5818 
5819         if (c->root_verity)
5820                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5821 
5822         STRV_FOREACH(e, c->environment)
5823                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5824 
5825         STRV_FOREACH(e, c->environment_files)
5826                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5827 
5828         STRV_FOREACH(e, c->pass_environment)
5829                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5830 
5831         STRV_FOREACH(e, c->unset_environment)
5832                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5833 
5834         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5835 
5836         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5837                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5838 
5839                 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5840                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5841 
5842                         STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5843                                 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5844                 }
5845         }
5846 
5847         fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
5848 
5849         if (c->nice_set)
5850                 fprintf(f, "%sNice: %i\n", prefix, c->nice);
5851 
5852         if (c->oom_score_adjust_set)
5853                 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
5854 
5855         if (c->coredump_filter_set)
5856                 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
5857 
5858         for (unsigned i = 0; i < RLIM_NLIMITS; i++)
5859                 if (c->rlimit[i]) {
5860                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
5861                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
5862                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
5863                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5864                 }
5865 
5866         if (c->ioprio_set) {
5867                 _cleanup_free_ char *class_str = NULL;
5868 
5869                 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
5870                 if (r >= 0)
5871                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5872 
5873                 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
5874         }
5875 
5876         if (c->cpu_sched_set) {
5877                 _cleanup_free_ char *policy_str = NULL;
5878 
5879                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5880                 if (r >= 0)
5881                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5882 
5883                 fprintf(f,
5884                         "%sCPUSchedulingPriority: %i\n"
5885                         "%sCPUSchedulingResetOnFork: %s\n",
5886                         prefix, c->cpu_sched_priority,
5887                         prefix, yes_no(c->cpu_sched_reset_on_fork));
5888         }
5889 
5890         if (c->cpu_set.set) {
5891                 _cleanup_free_ char *affinity = NULL;
5892 
5893                 affinity = cpu_set_to_range_string(&c->cpu_set);
5894                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
5895         }
5896 
5897         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5898                 _cleanup_free_ char *nodes = NULL;
5899 
5900                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5901                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5902                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5903         }
5904 
5905         if (c->timer_slack_nsec != NSEC_INFINITY)
5906                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
5907 
5908         fprintf(f,
5909                 "%sStandardInput: %s\n"
5910                 "%sStandardOutput: %s\n"
5911                 "%sStandardError: %s\n",
5912                 prefix, exec_input_to_string(c->std_input),
5913                 prefix, exec_output_to_string(c->std_output),
5914                 prefix, exec_output_to_string(c->std_error));
5915 
5916         if (c->std_input == EXEC_INPUT_NAMED_FD)
5917                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5918         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5919                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5920         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5921                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5922 
5923         if (c->std_input == EXEC_INPUT_FILE)
5924                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5925         if (c->std_output == EXEC_OUTPUT_FILE)
5926                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5927         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5928                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5929         if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5930                 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5931         if (c->std_error == EXEC_OUTPUT_FILE)
5932                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5933         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5934                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5935         if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5936                 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5937 
5938         if (c->tty_path)
5939                 fprintf(f,
5940                         "%sTTYPath: %s\n"
5941                         "%sTTYReset: %s\n"
5942                         "%sTTYVHangup: %s\n"
5943                         "%sTTYVTDisallocate: %s\n"
5944                         "%sTTYRows: %u\n"
5945                         "%sTTYColumns: %u\n",
5946                         prefix, c->tty_path,
5947                         prefix, yes_no(c->tty_reset),
5948                         prefix, yes_no(c->tty_vhangup),
5949                         prefix, yes_no(c->tty_vt_disallocate),
5950                         prefix, c->tty_rows,
5951                         prefix, c->tty_cols);
5952 
5953         if (IN_SET(c->std_output,
5954                    EXEC_OUTPUT_KMSG,
5955                    EXEC_OUTPUT_JOURNAL,
5956                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5957                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5958             IN_SET(c->std_error,
5959                    EXEC_OUTPUT_KMSG,
5960                    EXEC_OUTPUT_JOURNAL,
5961                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5962                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
5963 
5964                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
5965 
5966                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5967                 if (r >= 0)
5968                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
5969 
5970                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5971                 if (r >= 0)
5972                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
5973         }
5974 
5975         if (c->log_level_max >= 0) {
5976                 _cleanup_free_ char *t = NULL;
5977 
5978                 (void) log_level_to_string_alloc(c->log_level_max, &t);
5979 
5980                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5981         }
5982 
5983         if (c->log_ratelimit_interval_usec > 0)
5984                 fprintf(f,
5985                         "%sLogRateLimitIntervalSec: %s\n",
5986                         prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
5987 
5988         if (c->log_ratelimit_burst > 0)
5989                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
5990 
5991         for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5992                 fprintf(f, "%sLogExtraFields: ", prefix);
5993                 fwrite(c->log_extra_fields[j].iov_base,
5994                        1, c->log_extra_fields[j].iov_len,
5995                        f);
5996                 fputc('\n', f);
5997         }
5998 
5999         if (c->log_namespace)
6000                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
6001 
6002         if (c->secure_bits) {
6003                 _cleanup_free_ char *str = NULL;
6004 
6005                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
6006                 if (r >= 0)
6007                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
6008         }
6009 
6010         if (c->capability_bounding_set != CAP_ALL) {
6011                 _cleanup_free_ char *str = NULL;
6012 
6013                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
6014                 if (r >= 0)
6015                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
6016         }
6017 
6018         if (c->capability_ambient_set != 0) {
6019                 _cleanup_free_ char *str = NULL;
6020 
6021                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
6022                 if (r >= 0)
6023                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
6024         }
6025 
6026         if (c->user)
6027                 fprintf(f, "%sUser: %s\n", prefix, c->user);
6028         if (c->group)
6029                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
6030 
6031         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6032 
6033         strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
6034 
6035         if (c->pam_name)
6036                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
6037 
6038         strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6039         strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6040         strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6041         strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6042         strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
6043         strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
6044 
6045         for (size_t i = 0; i < c->n_bind_mounts; i++)
6046                 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6047                         c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6048                         c->bind_mounts[i].ignore_enoent ? "-": "",
6049                         c->bind_mounts[i].source,
6050                         c->bind_mounts[i].destination,
6051                         c->bind_mounts[i].recursive ? "rbind" : "norbind");
6052 
6053         for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6054                 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6055 
6056                 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6057                         t->path,
6058                         isempty(t->options) ? "" : ":",
6059                         strempty(t->options));
6060         }
6061 
6062         if (c->utmp_id)
6063                 fprintf(f,
6064                         "%sUtmpIdentifier: %s\n",
6065                         prefix, c->utmp_id);
6066 
6067         if (c->selinux_context)
6068                 fprintf(f,
6069                         "%sSELinuxContext: %s%s\n",
6070                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6071 
6072         if (c->apparmor_profile)
6073                 fprintf(f,
6074                         "%sAppArmorProfile: %s%s\n",
6075                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6076 
6077         if (c->smack_process_label)
6078                 fprintf(f,
6079                         "%sSmackProcessLabel: %s%s\n",
6080                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6081 
6082         if (c->personality != PERSONALITY_INVALID)
6083                 fprintf(f,
6084                         "%sPersonality: %s\n",
6085                         prefix, strna(personality_to_string(c->personality)));
6086 
6087         fprintf(f,
6088                 "%sLockPersonality: %s\n",
6089                 prefix, yes_no(c->lock_personality));
6090 
6091         if (c->syscall_filter) {
6092                 fprintf(f,
6093                         "%sSystemCallFilter: ",
6094                         prefix);
6095 
6096                 if (!c->syscall_allow_list)
6097                         fputc('~', f);
6098 
6099 #if HAVE_SECCOMP
6100                 void *id, *val;
6101                 bool first = true;
6102                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6103                         _cleanup_free_ char *name = NULL;
6104                         const char *errno_name = NULL;
6105                         int num = PTR_TO_INT(val);
6106 
6107                         if (first)
6108                                 first = false;
6109                         else
6110                                 fputc(' ', f);
6111 
6112                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6113                         fputs(strna(name), f);
6114 
6115                         if (num >= 0) {
6116                                 errno_name = seccomp_errno_or_action_to_string(num);
6117                                 if (errno_name)
6118                                         fprintf(f, ":%s", errno_name);
6119                                 else
6120                                         fprintf(f, ":%d", num);
6121                         }
6122                 }
6123 #endif
6124 
6125                 fputc('\n', f);
6126         }
6127 
6128         if (c->syscall_archs) {
6129                 fprintf(f,
6130                         "%sSystemCallArchitectures:",
6131                         prefix);
6132 
6133 #if HAVE_SECCOMP
6134                 void *id;
6135                 SET_FOREACH(id, c->syscall_archs)
6136                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6137 #endif
6138                 fputc('\n', f);
6139         }
6140 
6141         if (exec_context_restrict_namespaces_set(c)) {
6142                 _cleanup_free_ char *s = NULL;
6143 
6144                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6145                 if (r >= 0)
6146                         fprintf(f, "%sRestrictNamespaces: %s\n",
6147                                 prefix, strna(s));
6148         }
6149 
6150 #if HAVE_LIBBPF
6151         if (exec_context_restrict_filesystems_set(c)) {
6152                 char *fs;
6153                 SET_FOREACH(fs, c->restrict_filesystems)
6154                         fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6155         }
6156 #endif
6157 
6158         if (c->network_namespace_path)
6159                 fprintf(f,
6160                         "%sNetworkNamespacePath: %s\n",
6161                         prefix, c->network_namespace_path);
6162 
6163         if (c->syscall_errno > 0) {
6164                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6165 
6166 #if HAVE_SECCOMP
6167                 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
6168                 if (errno_name)
6169                         fputs(errno_name, f);
6170                 else
6171                         fprintf(f, "%d", c->syscall_errno);
6172 #endif
6173                 fputc('\n', f);
6174         }
6175 
6176         for (size_t i = 0; i < c->n_mount_images; i++) {
6177                 fprintf(f, "%sMountImages: %s%s:%s", prefix,
6178                         c->mount_images[i].ignore_enoent ? "-": "",
6179                         c->mount_images[i].source,
6180                         c->mount_images[i].destination);
6181                 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
6182                         fprintf(f, ":%s:%s",
6183                                 partition_designator_to_string(o->partition_designator),
6184                                 strempty(o->options));
6185                 fprintf(f, "\n");
6186         }
6187 
6188         for (size_t i = 0; i < c->n_extension_images; i++) {
6189                 fprintf(f, "%sExtensionImages: %s%s", prefix,
6190                         c->extension_images[i].ignore_enoent ? "-": "",
6191                         c->extension_images[i].source);
6192                 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6193                         fprintf(f, ":%s:%s",
6194                                 partition_designator_to_string(o->partition_designator),
6195                                 strempty(o->options));
6196                 fprintf(f, "\n");
6197         }
6198 
6199         strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
6200 }
6201 
exec_context_maintains_privileges(const ExecContext * c)6202 bool exec_context_maintains_privileges(const ExecContext *c) {
6203         assert(c);
6204 
6205         /* Returns true if the process forked off would run under
6206          * an unchanged UID or as root. */
6207 
6208         if (!c->user)
6209                 return true;
6210 
6211         if (streq(c->user, "root") || streq(c->user, "0"))
6212                 return true;
6213 
6214         return false;
6215 }
6216 
exec_context_get_effective_ioprio(const ExecContext * c)6217 int exec_context_get_effective_ioprio(const ExecContext *c) {
6218         int p;
6219 
6220         assert(c);
6221 
6222         if (c->ioprio_set)
6223                 return c->ioprio;
6224 
6225         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6226         if (p < 0)
6227                 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
6228 
6229         return ioprio_normalize(p);
6230 }
6231 
exec_context_get_effective_mount_apivfs(const ExecContext * c)6232 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6233         assert(c);
6234 
6235         /* Explicit setting wins */
6236         if (c->mount_apivfs_set)
6237                 return c->mount_apivfs;
6238 
6239         /* Default to "yes" if root directory or image are specified */
6240         if (exec_context_with_rootfs(c))
6241                 return true;
6242 
6243         return false;
6244 }
6245 
/* Frees the payload of every extra log field of 'c', then the field array itself, and resets the
 * array pointer to NULL and the field count to 0. */
void exec_context_free_log_extra_fields(ExecContext *c) {
        assert(c);

        /* Release the payloads first (back to front), the array holding them afterwards */
        for (size_t remaining = c->n_log_extra_fields; remaining > 0; remaining--)
                free(c->log_extra_fields[remaining - 1].iov_base);

        c->n_log_extra_fields = 0;
        c->log_extra_fields = mfree(c->log_extra_fields);
}
6254 
/* Resets the TTY of the execution context and then restores ownership and access mode of the TTY
 * device node to TTY_MODE, UID 0 and TTY_GID. All failures are logged and otherwise ignored. */
void exec_context_revert_tty(ExecContext *c) {
        _cleanup_close_ int tty_fd = -1;
        const char *tty;
        struct stat st;
        int r;

        assert(c);

        /* Step one: reset the TTY (possibly kicking everybody else from the TTY) */
        exec_context_tty_reset(c, NULL);

        /* Step two: undo what chown_terminal() did earlier. This is only done if a path is configured.
         * If the TTY was passed to us as file descriptor we assume the TTY is opened and managed by
         * whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
        if (!exec_context_may_touch_tty(c))
                return;

        tty = exec_context_tty_path(c);
        if (!tty)
                return;

        tty_fd = open(tty, O_PATH|O_CLOEXEC);
        if (tty_fd < 0) {
                /* A missing device node is only worth a debug message */
                (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
                                      "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
                                      tty);
                return;
        }

        if (fstat(tty_fd, &st) < 0) {
                (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", tty);
                return;
        }

        /* Superficial check that this looks like a TTY: we only verify it is a character device. A
         * proper check would either require opening the TTY and calling isatty(), which we'd rather not
         * do since opening TTYs comes with all kinds of side-effects and is slow, or hardcoding dev_t
         * major information, which we'd rather avoid. Why bother with this at all?
         * → https://github.com/systemd/systemd/issues/19213 */
        if (!S_ISCHR(st.st_mode)) {
                log_warning("Configured TTY '%s' is not actually a character device, ignoring.", tty);
                return;
        }

        r = fchmod_and_chown(tty_fd, TTY_MODE, 0, TTY_GID);
        if (r < 0)
                log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", tty);
}
6297 
/* Builds the list of directories to clean up for the directory types selected by 'mask'. 'prefix' is
 * indexed by directory type; types whose prefix is NULL are skipped. For every configured directory
 * the list receives the path below the prefix, its counterpart below "<prefix>/private" (for all
 * types except EXEC_DIRECTORY_CONFIGURATION), and each of its symlinks below the prefix.
 *
 * On success returns 0 and stores the newly allocated list in *ret. Returns -ENOMEM if a path cannot
 * be allocated, or the negative error of strv_consume(). */
int exec_context_get_clean_directories(
                ExecContext *c,
                char **prefix,
                ExecCleanMask mask,
                char ***ret) {

        _cleanup_strv_free_ char **paths = NULL;
        int r;

        assert(c);
        assert(prefix);
        assert(ret);

        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
                /* Skip types that were not requested or that have no prefix */
                if (!FLAGS_SET(mask, 1U << t) || !prefix[t])
                        continue;

                for (size_t i = 0; i < c->directories[t].n_items; i++) {
                        const char *subdir = c->directories[t].items[i].path;
                        char **links = c->directories[t].items[i].symlinks;
                        char *joined;

                        joined = path_join(prefix[t], subdir);
                        if (!joined)
                                return -ENOMEM;

                        r = strv_consume(&paths, joined);
                        if (r < 0)
                                return r;

                        /* The private directory is added unconditionally too, except for
                         * configuration directories */
                        if (t != EXEC_DIRECTORY_CONFIGURATION) {
                                joined = path_join(prefix[t], "private", subdir);
                                if (!joined)
                                        return -ENOMEM;

                                r = strv_consume(&paths, joined);
                                if (r < 0)
                                        return r;
                        }

                        STRV_FOREACH(link, links) {
                                joined = path_join(prefix[t], *link);
                                if (!joined)
                                        return -ENOMEM;

                                r = strv_consume(&paths, joined);
                                if (r < 0)
                                        return r;
                        }
                }
        }

        *ret = TAKE_PTR(paths);
        return 0;
}
6355 
/* Computes the mask of directory types that have at least one directory configured in 'c': bit
 * (1 << t) is set for each such type t. Stores the mask in *ret and always returns 0. */
int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
        ExecCleanMask result = 0;

        assert(c);
        assert(ret);

        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
                if (c->directories[t].n_items == 0)
                        continue;

                result |= 1U << t;
        }

        *ret = result;
        return 0;
}
6369 
/* Records the start of a process: everything previously stored in *s is discarded, the PID is remembered
 * and the start timestamp is set to the current time. */
void exec_status_start(ExecStatus *s, pid_t pid) {
        ExecStatus fresh = {
                .pid = pid,
        };

        assert(s);

        *s = fresh;
        dual_timestamp_get(&s->start_timestamp);
}
6379 
/* Records the exit of process 'pid' in *s: the exit timestamp is set to the current time, and the exit
 * code and status are stored. If 'context' is given and carries a utmp ID, a dead-process utmp record is
 * written as well (on a best-effort basis, failures are ignored). */
void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
        assert(s);

        /* The stored data belongs to a different process? Then drop it, including its start timestamp. */
        if (s->pid != pid) {
                ExecStatus fresh = {
                        .pid = pid,
                };

                *s = fresh;
        }

        dual_timestamp_get(&s->exit_timestamp);
        s->code = code;
        s->status = status;

        if (!context || !context->utmp_id)
                return;

        (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
}
6396 
/* Resets *s to its pristine, all-zero state. */
void exec_status_reset(ExecStatus *s) {
        const ExecStatus blank = {};

        assert(s);

        *s = blank;
}
6402 
exec_status_dump(const ExecStatus * s,FILE * f,const char * prefix)6403 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
6404         assert(s);
6405         assert(f);
6406 
6407         if (s->pid <= 0)
6408                 return;
6409 
6410         prefix = strempty(prefix);
6411 
6412         fprintf(f,
6413                 "%sPID: "PID_FMT"\n",
6414                 prefix, s->pid);
6415 
6416         if (dual_timestamp_is_set(&s->start_timestamp))
6417                 fprintf(f,
6418                         "%sStart Timestamp: %s\n",
6419                         prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
6420 
6421         if (dual_timestamp_is_set(&s->exit_timestamp))
6422                 fprintf(f,
6423                         "%sExit Timestamp: %s\n"
6424                         "%sExit Code: %s\n"
6425                         "%sExit Status: %i\n",
6426                         prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
6427                         prefix, sigchld_code_to_string(s->code),
6428                         prefix, s->status);
6429 }
6430 
/* Writes the quoted command line of 'c' to 'f', followed by its execution status indented by one
 * additional tab. If quote_command_line() returns NULL the error text for ENOMEM is shown in place of the
 * command line. */
static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
        _cleanup_free_ char *quoted = NULL;
        const char *indent, *nested;

        assert(c);
        assert(f);

        indent = strempty(prefix);
        nested = strjoina(indent, "\t");

        quoted = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
        fprintf(f, "%sCommand Line: %s\n", indent, quoted ?: strerror_safe(ENOMEM));

        exec_status_dump(&c->exec_status, f, nested);
}
6448 
/* Dumps every command of the list starting at 'c' to 'f', in list order. 'c' may be NULL, in which case
 * nothing is written. */
void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
        const char *indent;

        assert(f);

        indent = strempty(prefix);

        LIST_FOREACH(command, entry, c) {
                exec_command_dump(entry, f, indent);
        }
}
6457 
/* Appends 'e' at the end of the command list *l. If the list is empty, 'e' becomes its head. */
void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
        ExecCommand *tail;

        assert(l);
        assert(e);

        if (!*l) {
                *l = e;
                return;
        }

        /* The order of the commands matters, hence always insert behind the current last entry. */
        LIST_FIND_TAIL(command, *l, tail);
        LIST_INSERT_AFTER(command, *l, tail, e);
}
6471 
/* Replaces both the path and the argument vector of 'c'. The new argument vector consists of 'path'
 * followed by the variadic string arguments (which are handed to strv_new_ap()). Returns 0 on success and
 * -ENOMEM if memory is short, in which case 'c' is left unmodified. */
int exec_command_set(ExecCommand *c, const char *path, ...) {
        _cleanup_strv_free_ char **args = NULL;
        _cleanup_free_ char *path_copy = NULL;
        va_list ap;

        assert(c);
        assert(path);

        va_start(ap, path);
        args = strv_new_ap(path, ap);
        va_end(ap);
        if (!args)
                return -ENOMEM;

        path_copy = strdup(path);
        if (!path_copy)
                return -ENOMEM;

        /* Both allocations succeeded, now install them and release what was set before. */
        free_and_replace(c->path, path_copy);
        return strv_free_and_replace(c->argv, args);
}
6496 
/* Appends 'path' and the variadic string arguments following it to the argument vector of 'c'. Returns 0
 * on success, a negative error value on failure. */
int exec_command_append(ExecCommand *c, const char *path, ...) {
        _cleanup_strv_free_ char **extra = NULL;
        va_list ap;
        int r;

        assert(c);
        assert(path);

        va_start(ap, path);
        extra = strv_new_ap(path, ap);
        va_end(ap);
        if (!extra)
                return -ENOMEM;

        /* Only failures are passed on, any non-negative result is reported as plain 0. */
        r = strv_extend_strv(&c->argv, extra, false);
        return r < 0 ? r : 0;
}
6518 
remove_tmpdir_thread(void * p)6519 static void *remove_tmpdir_thread(void *p) {
6520         _cleanup_free_ char *path = p;
6521 
6522         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6523         return NULL;
6524 }
6525 
/* Releases an ExecRuntime object, unregistering it from its manager's lookup table first (if it has a
 * manager). If 'destroy' is true the temporary directories are additionally removed from disk, via an
 * asynchronous job each. Always returns NULL. */
static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
        if (!rt)
                return NULL;

        if (rt->manager)
                (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);

        if (destroy) {
                char **dirs[2] = { &rt->tmp_dir, &rt->var_tmp_dir };

                for (size_t i = 0; i < 2; i++) {
                        char **dir = dirs[i];
                        int r;

                        /* Nothing to remove if unset, or if this is just the shared empty directory. */
                        if (!*dir || streq(*dir, RUN_SYSTEMD_EMPTY))
                                continue;

                        log_debug("Spawning thread to nuke %s", *dir);

                        r = asynchronous_job(remove_tmpdir_thread, *dir);
                        if (r < 0) {
                                log_warning_errno(r, "Failed to nuke %s: %m", *dir);
                                continue;
                        }

                        /* The job function frees the string now, hence don't do so below. */
                        *dir = NULL;
                }
        }

        rt->id = mfree(rt->id);
        rt->tmp_dir = mfree(rt->tmp_dir);
        rt->var_tmp_dir = mfree(rt->var_tmp_dir);
        safe_close_pair(rt->netns_storage_socket);
        safe_close_pair(rt->ipcns_storage_socket);
        return mfree(rt);
}
6564 
/* Cleanup helper for use with _cleanup_(): frees the object *rt points to, but does not remove its
 * temporary directories from disk. */
static void exec_runtime_freep(ExecRuntime **rt) {
        ExecRuntime *owned = *rt;

        (void) exec_runtime_free(owned, /* destroy = */ false);
}
6568 
/* Allocates a new ExecRuntime object that carries a copy of 'id' and has all storage sockets marked as
 * unset (-1). All remaining fields are zero. Returns 0 on success and stores the object in *ret, or
 * -ENOMEM on allocation failure. */
static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
        _cleanup_free_ char *name = NULL;
        ExecRuntime *rt;

        assert(ret);

        name = strdup(id);
        if (!name)
                return -ENOMEM;

        rt = new(ExecRuntime, 1);
        if (!rt)
                return -ENOMEM;

        *rt = (ExecRuntime) {
                .id = TAKE_PTR(name),
                .netns_storage_socket = { -1, -1 },
                .ipcns_storage_socket = { -1, -1 },
        };

        *ret = rt;
        return 0;
}
6592 
/* Allocates an ExecRuntime object for 'id', registers it in the manager's lookup table and moves the
 * passed resources into it. The strings *tmp_dir and *var_tmp_dir and the file descriptors in the two
 * socket pairs are only taken over on success (and then reset in the caller's variables); on failure they
 * remain owned by the caller. The socket pair arguments may be NULL. If 'ret' is non-NULL the new object
 * is returned in it. Returns 0 on success, a negative error value on failure. */
static int exec_runtime_add(
                Manager *m,
                const char *id,
                char **tmp_dir,
                char **var_tmp_dir,
                int netns_storage_socket[2],
                int ipcns_storage_socket[2],
                ExecRuntime **ret) {

        _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
        ExecRuntime *registered;
        int r;

        assert(m);
        assert(id);

        r = exec_runtime_allocate(&rt, id);
        if (r < 0)
                return r;

        r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
        if (r < 0)
                return r;

        /* We require both to be set together.
         * NOTE(review): this looks at the fields of the object allocated just above, which
         * exec_runtime_allocate() leaves unset, hence the check cannot fail. Possibly the incoming
         * *tmp_dir/*var_tmp_dir were meant to be compared instead — confirm. */
        assert(!!rt->tmp_dir == !!rt->var_tmp_dir);
        rt->tmp_dir = TAKE_PTR(*tmp_dir);
        rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);

        if (netns_storage_socket)
                for (size_t i = 0; i < 2; i++)
                        rt->netns_storage_socket[i] = TAKE_FD(netns_storage_socket[i]);

        if (ipcns_storage_socket)
                for (size_t i = 0; i < 2; i++)
                        rt->ipcns_storage_socket[i] = TAKE_FD(ipcns_storage_socket[i]);

        rt->manager = m;

        /* From here on the object is owned by the manager's table, hence disarm the cleanup handler. */
        registered = TAKE_PTR(rt);
        if (ret)
                *ret = registered;

        return 0;
}
6640 
/* Creates the ExecRuntime object for 'id' according to the sandboxing settings in 'c', and registers it
 * with the manager.
 *
 * Returns 0 and sets *ret to NULL if the context uses none of the settings that need runtime state
 * (private or explicitly configured network/IPC namespaces, private temporary directories). Returns 1 and
 * stores the new object in *ret otherwise; no reference is taken on it here, that is up to the caller.
 * Returns a negative error value on failure. */
static int exec_runtime_make(
                Manager *m,
                const ExecContext *c,
                const char *id,
                ExecRuntime **ret) {

        _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
        _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }, ipcns_storage_socket[2] = { -1, -1 };
        int r;

        assert(m);
        assert(c);
        assert(id);
        assert(ret);

        /* It is not necessary to create ExecRuntime object. Note that this must cover every setting that
         * is acted on below, including the IPC namespace path: otherwise a unit that only configures that
         * one would never get the storage socket allocated for it. */
        if (!c->private_network && !c->network_namespace_path &&
            !c->private_ipc && !c->ipc_namespace_path &&
            !c->private_tmp) {
                *ret = NULL;
                return 0;
        }

        /* Private temporary directories are pointless if both /tmp and /var/tmp (or all of /var) are
         * listed as inaccessible anyway. */
        if (c->private_tmp &&
            !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
              (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
               prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
                r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
                if (r < 0)
                        return r;
        }

        if (c->private_network || c->network_namespace_path) {
                if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
                        return -errno;
        }

        if (c->private_ipc || c->ipc_namespace_path) {
                if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
                        return -errno;
        }

        /* On success the directories and sockets are taken over by the new object; on failure the
         * cleanup handlers above release them. */
        r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
        if (r < 0)
                return r;

        return 1;
}
6686 
/* Looks up the ExecRuntime object registered for 'id', optionally creating it from the settings in 'c' if
 * there is none yet.
 *
 * Returns 1 and stores the object in *ret, with its reference counter increased by one, if an object was
 * found or created. Returns 0 and sets *ret to NULL if there is no object and either 'create' is false or
 * exec_runtime_make() reported that none is needed. Returns a negative error value on failure. */
int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
        ExecRuntime *rt;
        int r;

        assert(m);
        assert(id);
        assert(ret);

        rt = hashmap_get(m->exec_runtime_by_id, id);
        if (!rt) {
                if (!create) {
                        *ret = NULL;
                        return 0;
                }

                /* Nothing registered so far, hence try to set up a new object. */
                r = exec_runtime_make(m, c, id, &rt);
                if (r < 0)
                        return r;
                if (r == 0) {
                        /* The context doesn't require an ExecRuntime object at all. */
                        *ret = NULL;
                        return 0;
                }
        }

        /* Regardless of whether the object is old or new: the caller gets its own reference. */
        rt->n_ref++;
        *ret = rt;
        return 1;
}
6721 
/* Drops one reference from 'rt'. Once the last reference is gone the object is freed, and, if 'destroy'
 * is true, its temporary directories are removed too. Always returns NULL; passing NULL is a NOP. */
ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
        if (!rt)
                return NULL;

        assert(rt->n_ref > 0);

        /* Somebody else still holds a reference? Then we are done already. */
        if (--rt->n_ref > 0)
                return NULL;

        return exec_runtime_free(rt, destroy);
}
6734 
/* Writes one "exec-runtime=" line to 'f' for every ExecRuntime object registered with the manager. Each
 * line consists of the object's ID followed by those optional fields that are set, separated by single
 * spaces. For storage sockets the number written is the one fdset_put_dup() returned when the socket was
 * added to 'fds', not the socket's own number. Returns 0 on success, or the negative value
 * fdset_put_dup() failed with. */
int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
        ExecRuntime *rt;

        assert(m);
        assert(f);
        assert(fds);

        HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
                int dup_fd;

                fprintf(f, "exec-runtime=%s", rt->id);

                if (rt->tmp_dir)
                        fprintf(f, " tmp-dir=%s", rt->tmp_dir);

                if (rt->var_tmp_dir)
                        fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);

                /* Network namespace storage sockets */
                if (rt->netns_storage_socket[0] >= 0) {
                        dup_fd = fdset_put_dup(fds, rt->netns_storage_socket[0]);
                        if (dup_fd < 0)
                                return dup_fd;

                        fprintf(f, " netns-socket-0=%i", dup_fd);
                }

                if (rt->netns_storage_socket[1] >= 0) {
                        dup_fd = fdset_put_dup(fds, rt->netns_storage_socket[1]);
                        if (dup_fd < 0)
                                return dup_fd;

                        fprintf(f, " netns-socket-1=%i", dup_fd);
                }

                /* IPC namespace storage sockets */
                if (rt->ipcns_storage_socket[0] >= 0) {
                        dup_fd = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
                        if (dup_fd < 0)
                                return dup_fd;

                        fprintf(f, " ipcns-socket-0=%i", dup_fd);
                }

                if (rt->ipcns_storage_socket[1] >= 0) {
                        dup_fd = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
                        if (dup_fd < 0)
                                return dup_fd;

                        fprintf(f, " ipcns-socket-1=%i", dup_fd);
                }

                fputc('\n', f);
        }

        return 0;
}
6796 
/* Applies one per-unit key/value pair of the old serialization format to the ExecRuntime object of unit
 * 'u', creating and registering that object if it does not exist yet. The keys handled are "tmp-dir",
 * "var-tmp-dir", "netns-socket-0" and "netns-socket-1".
 *
 * Returns 1 if the pair was applied, 0 if it was skipped (unknown key, unit without ID, socket value that
 * cannot be used, or failure to register a new object), and a negative error value if memory is short. */
int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
        _cleanup_(exec_runtime_freep) ExecRuntime *fresh = NULL;
        ExecRuntime *rt;
        int r;

        /* This is for the migration from the old (v237 or earlier) serialization format. Due to bug
         * #7790 this may not work for units that use JoinsNamespaceOf=: the serialized text does not tell
         * us whether the ExecRuntime object was originally created by another unit, hence we always
         * create a new object owned by this unit. */

        assert(u);
        assert(key);
        assert(value);

        /* The manager tracks ExecRuntime objects by unit ID, hence there's nothing we can do with the
         * data if the unit has no ID (yet?). */
        if (isempty(u->id)) {
                log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
                return 0;
        }

        if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
                return log_oom();

        rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
        if (!rt) {
                if (exec_runtime_allocate(&fresh, u->id) < 0)
                        return log_oom();

                rt = fresh;
        }

        if (streq(key, "tmp-dir")) {
                if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
                        return -ENOMEM;

        } else if (streq(key, "var-tmp-dir")) {
                if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
                        return -ENOMEM;

        } else {
                int *slot, fd;

                /* What remains are the two ends of the network namespace storage socket. */
                if (streq(key, "netns-socket-0"))
                        slot = &rt->netns_storage_socket[0];
                else if (streq(key, "netns-socket-1"))
                        slot = &rt->netns_storage_socket[1];
                else
                        return 0;

                if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
                        log_unit_debug(u, "Failed to parse netns socket value: %s", value);
                        return 0;
                }

                safe_close(*slot);
                *slot = fdset_remove(fds, fd);
        }

        /* An object that already was registered needs no further treatment. */
        if (!fresh)
                return 1;

        r = hashmap_put(u->manager->exec_runtime_by_id, fresh->id, fresh);
        if (r < 0) {
                log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
                return 0;
        }

        fresh->manager = u->manager;

        /* The manager's table owns the object now, hence disarm the cleanup handler. */
        TAKE_PTR(fresh);
        return 1;
}
6878 
/* Parses the value of one serialized "exec-runtime=" line and registers a matching ExecRuntime object
 * with the manager.
 *
 * The value starts with the object's ID, optionally followed by "tmp-dir=", "var-tmp-dir=",
 * "netns-socket-0=", "netns-socket-1=", "ipcns-socket-0=" and "ipcns-socket-1=" fields, in that order and
 * each separated by a single space. The socket fields carry file descriptor numbers that must be
 * contained in 'fds'; they are removed from the set while parsing.
 *
 * Returns 0 on success and a negative error value on failure. On failure, file descriptors that were
 * already taken out of 'fds' are closed again. */
int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
        _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
        /* Descriptors removed from 'fds' are owned by us until exec_runtime_add() takes them over (and
         * resets these entries to -1), hence make sure they don't leak on any of the error paths. */
        _cleanup_close_pair_ int netns_fdpair[2] = { -1, -1 }, ipcns_fdpair[2] = { -1, -1 };
        char *id = NULL;
        const char *p, *v = value;
        size_t n;
        int r;

        assert(m);
        assert(value);
        assert(fds);

        n = strcspn(v, " ");
        id = strndupa_safe(v, n);
        if (v[n] != ' ')
                goto finalize;
        p = v + n + 1;

        v = startswith(p, "tmp-dir=");
        if (v) {
                n = strcspn(v, " ");
                tmp_dir = strndup(v, n);
                if (!tmp_dir)
                        return log_oom();
                if (v[n] != ' ')
                        goto finalize;
                p = v + n + 1;
        }

        v = startswith(p, "var-tmp-dir=");
        if (v) {
                n = strcspn(v, " ");
                var_tmp_dir = strndup(v, n);
                if (!var_tmp_dir)
                        return log_oom();
                if (v[n] != ' ')
                        goto finalize;
                p = v + n + 1;
        }

        /* For all socket fields below: the number is parsed into a scratch variable first, so that the
         * automatically closed pairs only ever contain descriptors we actually took out of 'fds'. */

        v = startswith(p, "netns-socket-0=");
        if (v) {
                char *buf;
                int fd;

                n = strcspn(v, " ");
                buf = strndupa_safe(v, n);

                r = safe_atoi(buf, &fd);
                if (r < 0)
                        return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
                if (!fdset_contains(fds, fd))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
                                               "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", fd);
                netns_fdpair[0] = fdset_remove(fds, fd);
                if (v[n] != ' ')
                        goto finalize;
                p = v + n + 1;
        }

        v = startswith(p, "netns-socket-1=");
        if (v) {
                char *buf;
                int fd;

                n = strcspn(v, " ");
                buf = strndupa_safe(v, n);

                r = safe_atoi(buf, &fd);
                if (r < 0)
                        return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
                if (!fdset_contains(fds, fd))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
                                               "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", fd);
                netns_fdpair[1] = fdset_remove(fds, fd);
                if (v[n] != ' ')
                        goto finalize;
                p = v + n + 1;
        }

        v = startswith(p, "ipcns-socket-0=");
        if (v) {
                char *buf;
                int fd;

                n = strcspn(v, " ");
                buf = strndupa_safe(v, n);

                r = safe_atoi(buf, &fd);
                if (r < 0)
                        return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
                if (!fdset_contains(fds, fd))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
                                               "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", fd);
                ipcns_fdpair[0] = fdset_remove(fds, fd);
                if (v[n] != ' ')
                        goto finalize;
                p = v + n + 1;
        }

        v = startswith(p, "ipcns-socket-1=");
        if (v) {
                char *buf;
                int fd;

                n = strcspn(v, " ");
                buf = strndupa_safe(v, n);

                r = safe_atoi(buf, &fd);
                if (r < 0)
                        return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
                if (!fdset_contains(fds, fd))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
                                               "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", fd);
                ipcns_fdpair[1] = fdset_remove(fds, fd);
        }

finalize:
        /* On success the strings and descriptors are moved into the new object; on failure they remain
         * ours and are released by the cleanup handlers. */
        r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
        if (r < 0)
                return log_debug_errno(r, "Failed to add exec-runtime: %m");
        return 0;
}
6997 
/* Frees all ExecRuntime objects registered with the manager that nobody holds a reference to. This is
 * used after the manager deserialization process. The temporary directories of the freed objects are
 * left on disk. */
void exec_runtime_vacuum(Manager *m) {
        ExecRuntime *rt;

        assert(m);

        /* NOTE(review): exec_runtime_free() removes the object from the very table we iterate over
         * here; this presumably relies on the hashmap tolerating removal of the current entry during
         * iteration — confirm. */
        HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
                bool in_use = rt->n_ref > 0;

                if (!in_use)
                        (void) exec_runtime_free(rt, /* destroy = */ false);
        }
}
7012 
/* Releases the resources referenced by *p (environment block, fd names, fd array, exec fd) and resets
 * the respective fields. The structure itself is not freed. Passing NULL is a NOP. */
void exec_params_clear(ExecParameters *p) {
        if (p) {
                p->environment = strv_free(p->environment);
                p->fd_names = strv_free(p->fd_names);
                p->fds = mfree(p->fds);
                p->exec_fd = safe_close(p->exec_fd);
        }
}
7022 
/* Frees an ExecSetCredential object together with its ID and data. Always returns NULL; passing NULL is
 * a NOP. */
ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
        if (sc) {
                free(sc->id);
                free(sc->data);
                free(sc);
        }

        return NULL;
}
7031 
/* Frees an ExecLoadCredential object together with its ID and path. Always returns NULL; passing NULL is
 * a NOP. */
ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
        if (lc) {
                free(lc->id);
                free(lc->path);
                free(lc);
        }

        return NULL;
}
7040 
/* Releases all items of the directory list *d (paths and symlink lists) as well as the item array
 * itself, and resets the access mode to 0755. The structure itself is not freed. Passing NULL is a NOP. */
void exec_directory_done(ExecDirectory *d) {
        size_t idx = 0;

        if (!d)
                return;

        while (idx < d->n_items) {
                free(d->items[idx].path);
                strv_free(d->items[idx].symlinks);
                idx++;
        }

        free(d->items);
        d->items = NULL;
        d->n_items = 0;
        d->mode = 0755;
}
7054 
/* Appends a new item to the array *d, which currently has *n entries. The item gets its own copies of
 * 'path' and, if non-NULL, of the 'symlinks' list. Returns 0 on success, in which case *n is increased by
 * one, or -ENOMEM if memory is short, in which case *n and the existing entries are unchanged. */
int exec_directory_add(ExecDirectoryItem **d, size_t *n, const char *path, char **symlinks) {
        _cleanup_strv_free_ char **links = NULL;
        _cleanup_free_ char *path_copy = NULL;
        ExecDirectoryItem *slot;

        assert(d);
        assert(n);
        assert(path);

        path_copy = strdup(path);
        if (!path_copy)
                return -ENOMEM;

        if (symlinks) {
                links = strv_copy(symlinks);
                if (!links)
                        return -ENOMEM;
        }

        /* Make room for one more entry before handing over ownership of the copies. */
        if (!GREEDY_REALLOC(*d, *n + 1))
                return -ENOMEM;

        slot = *d + *n;
        *slot = (ExecDirectoryItem) {
                .path = TAKE_PTR(path_copy),
                .symlinks = TAKE_PTR(links),
        };
        (*n)++;

        return 0;
}
7083 
/* Hash operations for tables that are keyed by strings and hold ExecSetCredential or ExecLoadCredential
 * objects as values. exec_set_credential_free() and exec_load_credential_free() respectively are
 * registered as value destructors (presumably so that the table releases the objects it holds — see the
 * macro definition). */
DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
7086 
/* This table maps ExecInput values to their string names. The DEFINE_STRING_TABLE_LOOKUP() invocation
 * below derives the string conversion helpers for ExecInput from it (presumably exec_input_to_string()
 * and exec_input_from_string() — see the macro definition). */
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
        [EXEC_INPUT_NULL] = "null",
        [EXEC_INPUT_TTY] = "tty",
        [EXEC_INPUT_TTY_FORCE] = "tty-force",
        [EXEC_INPUT_TTY_FAIL] = "tty-fail",
        [EXEC_INPUT_SOCKET] = "socket",
        [EXEC_INPUT_NAMED_FD] = "fd",
        [EXEC_INPUT_DATA] = "data",
        [EXEC_INPUT_FILE] = "file",
};

DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7099 
/* This table maps ExecOutput values to their string names. The DEFINE_STRING_TABLE_LOOKUP() invocation
 * below derives the string conversion helpers for ExecOutput from it (presumably exec_output_to_string()
 * and exec_output_from_string() — see the macro definition). */
static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
        [EXEC_OUTPUT_INHERIT] = "inherit",
        [EXEC_OUTPUT_NULL] = "null",
        [EXEC_OUTPUT_TTY] = "tty",
        [EXEC_OUTPUT_KMSG] = "kmsg",
        [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
        [EXEC_OUTPUT_JOURNAL] = "journal",
        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
        [EXEC_OUTPUT_SOCKET] = "socket",
        [EXEC_OUTPUT_NAMED_FD] = "fd",
        [EXEC_OUTPUT_FILE] = "file",
        [EXEC_OUTPUT_FILE_APPEND] = "append",
        [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
};

DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
7116 
/* This table maps ExecUtmpMode values to their string names; the string conversion helpers for
 * ExecUtmpMode are derived from it by the macro invocation below. */
static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
        [EXEC_UTMP_INIT] = "init",
        [EXEC_UTMP_LOGIN] = "login",
        [EXEC_UTMP_USER] = "user",
};

DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
7124 
/* This table maps ExecPreserveMode values to their string names. Note that the lookup helpers are
 * generated by the _WITH_BOOLEAN variant of the macro, with EXEC_PRESERVE_YES passed as extra argument
 * (presumably so that boolean strings are accepted too, a true value meaning EXEC_PRESERVE_YES — confirm
 * against the macro definition). */
static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
        [EXEC_PRESERVE_NO] = "no",
        [EXEC_PRESERVE_YES] = "yes",
        [EXEC_PRESERVE_RESTART] = "restart",
};

DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
7132 
/* This table maps each ExecDirectoryType to the name of the unit setting it is configured with. */
static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
        [EXEC_DIRECTORY_STATE] = "StateDirectory",
        [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
        [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
};

DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
7143 
/* This table maps each ExecDirectoryType to the name of the unit setting its symlinks are configured
 * with. */
static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectorySymlink",
        [EXEC_DIRECTORY_STATE]         = "StateDirectorySymlink",
        [EXEC_DIRECTORY_CACHE]         = "CacheDirectorySymlink",
        [EXEC_DIRECTORY_LOGS]          = "LogsDirectorySymlink",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
};

DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7154 
/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource.
 * These terms are supposed to be generic enough to also be usable for unit types that don't use
 * ExecContext and per-unit directories, specifically .timer units with their timestamp touch file. */
static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "runtime",
        [EXEC_DIRECTORY_STATE] = "state",
        [EXEC_DIRECTORY_CACHE] = "cache",
        [EXEC_DIRECTORY_LOGS] = "logs",
        [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
};

DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7167 
/* And this table also maps ExecDirectoryType, to the name of the environment variable in which we pass
 * the selected directory to the service payload. Judging by the macro used below, only a to-string
 * lookup that is private to this file is generated for it. */
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
        [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
        [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
        [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7179 
/* This table maps ExecKeyringMode values to their string names; the string conversion helpers for
 * ExecKeyringMode are derived from it by the macro invocation below. */
static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
        [EXEC_KEYRING_INHERIT] = "inherit",
        [EXEC_KEYRING_PRIVATE] = "private",
        [EXEC_KEYRING_SHARED] = "shared",
};

DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);
7187