1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2 
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/seccomp.h>
6 #include <stddef.h>
7 #include <sys/mman.h>
8 #include <sys/prctl.h>
9 #include <sys/shm.h>
10 #include <sys/stat.h>
11 
/* Include missing_syscall_def.h before <seccomp.h> so that __SNR_foo gets mapped to __NR_foo. */
13 #include "missing_syscall_def.h"
14 #include <seccomp.h>
15 
16 #include "af-list.h"
17 #include "alloc-util.h"
18 #include "env-util.h"
19 #include "errno-list.h"
20 #include "macro.h"
21 #include "nsflags.h"
22 #include "nulstr-util.h"
23 #include "process-util.h"
24 #include "seccomp-util.h"
25 #include "set.h"
26 #include "string-util.h"
27 #include "strv.h"
28 
/* List of the seccomp architecture identifiers (SCMP_ARCH_*) that are relevant on the architecture/ABI this
 * file is compiled for. The contents are chosen at build time by the preprocessor conditionals below; if none
 * of them matches, the array holds nothing but the trailing SECCOMP_LOCAL_ARCH_END entry.
 *
 * SECCOMP_LOCAL_ARCH_END is always the last element; presumably it is the end-of-list marker for code that
 * iterates over this array (its definition is not visible here — confirm in seccomp-util.h).
 *
 * This array will be modified at runtime as seccomp_restrict_archs is called. */
uint32_t seccomp_local_archs[] = {

        /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */

#if defined(__x86_64__) && defined(__ILP32__)
                SCMP_ARCH_X86,
                SCMP_ARCH_X86_64,
                SCMP_ARCH_X32,         /* native */
#elif defined(__x86_64__) && !defined(__ILP32__)
                SCMP_ARCH_X86,
                SCMP_ARCH_X32,
                SCMP_ARCH_X86_64,      /* native */
#elif defined(__i386__)
                SCMP_ARCH_X86,         /* native */
#elif defined(__aarch64__)
                SCMP_ARCH_ARM,
                SCMP_ARCH_AARCH64,     /* native */
#elif defined(__arm__)
                SCMP_ARCH_ARM,         /* native */
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS,        /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL,      /* native */
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL64N32,
                SCMP_ARCH_MIPS64N32,
                SCMP_ARCH_MIPSEL64,
                SCMP_ARCH_MIPS64,      /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS64N32,
                SCMP_ARCH_MIPSEL64N32,
                SCMP_ARCH_MIPS64,
                SCMP_ARCH_MIPSEL64,    /* native */
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL64,
                SCMP_ARCH_MIPS64,
                SCMP_ARCH_MIPSEL64N32,
                SCMP_ARCH_MIPS64N32,   /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS64,
                SCMP_ARCH_MIPSEL64,
                SCMP_ARCH_MIPS64N32,
                SCMP_ARCH_MIPSEL64N32, /* native */
#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
                SCMP_ARCH_PPC,
                SCMP_ARCH_PPC64LE,
                SCMP_ARCH_PPC64,       /* native */
#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
                SCMP_ARCH_PPC,
                SCMP_ARCH_PPC64,
                SCMP_ARCH_PPC64LE,     /* native */
#elif defined(__powerpc__)
                SCMP_ARCH_PPC,         /* native */
#elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
                /* Only used when the libseccomp headers define SCMP_ARCH_RISCV64 at all. */
                SCMP_ARCH_RISCV64,     /* native */
#elif defined(__s390x__)
                SCMP_ARCH_S390,
                SCMP_ARCH_S390X,      /* native */
#elif defined(__s390__)
                SCMP_ARCH_S390,        /* native */
#endif
                SECCOMP_LOCAL_ARCH_END
        };
103 
seccomp_arch_to_string(uint32_t c)104 const char* seccomp_arch_to_string(uint32_t c) {
105         /* Maintain order used in <seccomp.h>.
106          *
107          * Names used here should be the same as those used for ConditionArchitecture=,
108          * except for "subarchitectures" like x32. */
109 
110         switch (c) {
111         case SCMP_ARCH_NATIVE:
112                 return "native";
113         case SCMP_ARCH_X86:
114                 return "x86";
115         case SCMP_ARCH_X86_64:
116                 return "x86-64";
117         case SCMP_ARCH_X32:
118                 return "x32";
119         case SCMP_ARCH_ARM:
120                 return "arm";
121         case SCMP_ARCH_AARCH64:
122                 return "arm64";
123         case SCMP_ARCH_MIPS:
124                 return "mips";
125         case SCMP_ARCH_MIPS64:
126                 return "mips64";
127         case SCMP_ARCH_MIPS64N32:
128                 return "mips64-n32";
129         case SCMP_ARCH_MIPSEL:
130                 return "mips-le";
131         case SCMP_ARCH_MIPSEL64:
132                 return "mips64-le";
133         case SCMP_ARCH_MIPSEL64N32:
134                 return "mips64-le-n32";
135         case SCMP_ARCH_PPC:
136                 return "ppc";
137         case SCMP_ARCH_PPC64:
138                 return "ppc64";
139         case SCMP_ARCH_PPC64LE:
140                 return "ppc64-le";
141 #ifdef SCMP_ARCH_RISCV64
142         case SCMP_ARCH_RISCV64:
143                 return "riscv64";
144 #endif
145         case SCMP_ARCH_S390:
146                 return "s390";
147         case SCMP_ARCH_S390X:
148                 return "s390x";
149         default:
150                 return NULL;
151         }
152 }
153 
seccomp_arch_from_string(const char * n,uint32_t * ret)154 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
155         if (!n)
156                 return -EINVAL;
157 
158         assert(ret);
159 
160         if (streq(n, "native"))
161                 *ret = SCMP_ARCH_NATIVE;
162         else if (streq(n, "x86"))
163                 *ret = SCMP_ARCH_X86;
164         else if (streq(n, "x86-64"))
165                 *ret = SCMP_ARCH_X86_64;
166         else if (streq(n, "x32"))
167                 *ret = SCMP_ARCH_X32;
168         else if (streq(n, "arm"))
169                 *ret = SCMP_ARCH_ARM;
170         else if (streq(n, "arm64"))
171                 *ret = SCMP_ARCH_AARCH64;
172         else if (streq(n, "mips"))
173                 *ret = SCMP_ARCH_MIPS;
174         else if (streq(n, "mips64"))
175                 *ret = SCMP_ARCH_MIPS64;
176         else if (streq(n, "mips64-n32"))
177                 *ret = SCMP_ARCH_MIPS64N32;
178         else if (streq(n, "mips-le"))
179                 *ret = SCMP_ARCH_MIPSEL;
180         else if (streq(n, "mips64-le"))
181                 *ret = SCMP_ARCH_MIPSEL64;
182         else if (streq(n, "mips64-le-n32"))
183                 *ret = SCMP_ARCH_MIPSEL64N32;
184         else if (streq(n, "ppc"))
185                 *ret = SCMP_ARCH_PPC;
186         else if (streq(n, "ppc64"))
187                 *ret = SCMP_ARCH_PPC64;
188         else if (streq(n, "ppc64-le"))
189                 *ret = SCMP_ARCH_PPC64LE;
190 #ifdef SCMP_ARCH_RISCV64
191         else if (streq(n, "riscv64"))
192                 *ret = SCMP_ARCH_RISCV64;
193 #endif
194         else if (streq(n, "s390"))
195                 *ret = SCMP_ARCH_S390;
196         else if (streq(n, "s390x"))
197                 *ret = SCMP_ARCH_S390X;
198         else
199                 return -EINVAL;
200 
201         return 0;
202 }
203 
int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
        _cleanup_(seccomp_releasep) scmp_filter_ctx ctx = NULL;
        int r;

        /* Much like seccomp_init(), but sets the filter up for exactly one architecture, leaving all others
         * alone. Also disables the NNP handling (SCMP_FLTATR_CTL_NNP is set to 0).
         *
         * On success returns 0 and hands the new filter context out via *ret. On failure returns a negative
         * error: -ENOMEM if seccomp_init() fails, otherwise whatever the failing seccomp_arch_remove(),
         * seccomp_arch_add() or seccomp_attr_set() call returned. */

        ctx = seccomp_init(default_action);
        if (!ctx)
                return -ENOMEM;

        if (arch == SCMP_ARCH_NATIVE ||
            arch == seccomp_arch_native()) {

                /* The native architecture was requested: the freshly initialized context must already
                 * contain it, so there is nothing to swap. */
                assert(seccomp_arch_exist(ctx, SCMP_ARCH_NATIVE) >= 0);
                assert(seccomp_arch_exist(ctx, seccomp_arch_native()) >= 0);
        } else {
                /* A non-native architecture was requested: drop the native one from the context and add
                 * the requested one in its place. */
                r = seccomp_arch_remove(ctx, seccomp_arch_native());
                if (r < 0)
                        return r;

                r = seccomp_arch_add(ctx, arch);
                if (r < 0)
                        return r;

                assert(seccomp_arch_exist(ctx, arch) >= 0);
                assert(seccomp_arch_exist(ctx, SCMP_ARCH_NATIVE) == -EEXIST);
                assert(seccomp_arch_exist(ctx, seccomp_arch_native()) == -EEXIST);
        }

        /* Set the "bad architecture" action of this filter to ALLOW. */
        r = seccomp_attr_set(ctx, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
        if (r < 0)
                return r;

        r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0)
                return r;

#if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
        /* Optional event logging, only with libseccomp >= 2.4. Failing to enable it is logged at debug
         * level but is not treated as an error. */
        if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
                r = seccomp_attr_set(ctx, SCMP_FLTATR_CTL_LOG, 1);
                if (r < 0)
                        log_debug_errno(r, "Failed to enable seccomp event logging: %m");
        }
#endif

        *ret = TAKE_PTR(ctx);
        return 0;
}
253 
static bool is_basic_seccomp_available(void) {
        /* Queries the current seccomp mode via prctl(); any non-negative answer counts as "available". */
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
257 
static bool is_seccomp_filter_available(void) {
        /* Probes for filter mode by asking prctl() to install a NULL filter. The probe counts as positive
         * only if the call fails with EFAULT; presumably that errno means the kernel accepted the mode and
         * merely rejected the bad filter pointer (confirm against prctl(2)/seccomp(2)). */

        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
262 
bool is_seccomp_available(void) {
        /* Reports whether seccomp may be used. The answer is computed on the first call and kept in a
         * function-local static; later calls return the stored value.
         *
         * If $SYSTEMD_SECCOMP parses as false, seccomp is reported as unavailable without probing. In every
         * other case (true, unset, or unparsable) the result is whether both the basic and the filter probe
         * succeed. */
        static int cached = -1;
        int b;

        if (cached >= 0)
                return cached;

        b = getenv_bool_secure("SYSTEMD_SECCOMP");
        if (b == 0) {
                /* Explicitly turned off via the environment */
                cached = false;
                return cached;
        }

        if (b < 0 && b != -ENXIO) /* ENXIO: env var unset */
                log_debug_errno(b, "Failed to parse $SYSTEMD_SECCOMP value, ignoring.");

        cached = is_basic_seccomp_available() && is_seccomp_filter_available();
        return cached;
}
283 
284 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
285         [SYSCALL_FILTER_SET_DEFAULT] = {
286                 .name = "@default",
287                 .help = "System calls that are always permitted",
288                 .value =
289                 "arch_prctl\0"      /* Used during platform-specific initialization by ld-linux.so. */
290                 "brk\0"
291                 "cacheflush\0"
292                 "clock_getres\0"
293                 "clock_getres_time64\0"
294                 "clock_gettime\0"
295                 "clock_gettime64\0"
296                 "clock_nanosleep\0"
297                 "clock_nanosleep_time64\0"
298                 "execve\0"
299                 "exit\0"
300                 "exit_group\0"
301                 "futex\0"
302                 "futex_time64\0"
303                 "get_robust_list\0"
304                 "get_thread_area\0"
305                 "getegid\0"
306                 "getegid32\0"
307                 "geteuid\0"
308                 "geteuid32\0"
309                 "getgid\0"
310                 "getgid32\0"
311                 "getgroups\0"
312                 "getgroups32\0"
313                 "getpgid\0"
314                 "getpgrp\0"
315                 "getpid\0"
316                 "getppid\0"
317                 "getrandom\0"
318                 "getresgid\0"
319                 "getresgid32\0"
320                 "getresuid\0"
321                 "getresuid32\0"
322                 "getrlimit\0"      /* make sure processes can query stack size and such */
323                 "getsid\0"
324                 "gettid\0"
325                 "gettimeofday\0"
326                 "getuid\0"
327                 "getuid32\0"
328                 "membarrier\0"
329                 "mmap\0"
330                 "mmap2\0"
331                 "mprotect\0"
332                 "munmap\0"
333                 "nanosleep\0"
334                 "pause\0"
335                 "prlimit64\0"
336                 "restart_syscall\0"
337                 "rseq\0"
338                 "rt_sigreturn\0"
339                 "sched_getaffinity\0"
340                 "sched_yield\0"
341                 "set_robust_list\0"
342                 "set_thread_area\0"
343                 "set_tid_address\0"
344                 "set_tls\0"
345                 "sigreturn\0"
346                 "time\0"
347                 "ugetrlimit\0"
348         },
349         [SYSCALL_FILTER_SET_AIO] = {
350                 .name = "@aio",
351                 .help = "Asynchronous IO",
352                 .value =
353                 "io_cancel\0"
354                 "io_destroy\0"
355                 "io_getevents\0"
356                 "io_pgetevents\0"
357                 "io_pgetevents_time64\0"
358                 "io_setup\0"
359                 "io_submit\0"
360                 "io_uring_enter\0"
361                 "io_uring_register\0"
362                 "io_uring_setup\0"
363         },
364         [SYSCALL_FILTER_SET_BASIC_IO] = {
365                 .name = "@basic-io",
366                 .help = "Basic IO",
367                 .value =
368                 "_llseek\0"
369                 "close\0"
370                 "close_range\0"
371                 "dup\0"
372                 "dup2\0"
373                 "dup3\0"
374                 "lseek\0"
375                 "pread64\0"
376                 "preadv\0"
377                 "preadv2\0"
378                 "pwrite64\0"
379                 "pwritev\0"
380                 "pwritev2\0"
381                 "read\0"
382                 "readv\0"
383                 "write\0"
384                 "writev\0"
385         },
386         [SYSCALL_FILTER_SET_CHOWN] = {
387                 .name = "@chown",
388                 .help = "Change ownership of files and directories",
389                 .value =
390                 "chown\0"
391                 "chown32\0"
392                 "fchown\0"
393                 "fchown32\0"
394                 "fchownat\0"
395                 "lchown\0"
396                 "lchown32\0"
397         },
398         [SYSCALL_FILTER_SET_CLOCK] = {
399                 .name = "@clock",
400                 .help = "Change the system time",
401                 .value =
402                 "adjtimex\0"
403                 "clock_adjtime\0"
404                 "clock_adjtime64\0"
405                 "clock_settime\0"
406                 "clock_settime64\0"
407                 "settimeofday\0"
408         },
409         [SYSCALL_FILTER_SET_CPU_EMULATION] = {
410                 .name = "@cpu-emulation",
411                 .help = "System calls for CPU emulation functionality",
412                 .value =
413                 "modify_ldt\0"
414                 "subpage_prot\0"
415                 "switch_endian\0"
416                 "vm86\0"
417                 "vm86old\0"
418         },
419         [SYSCALL_FILTER_SET_DEBUG] = {
420                 .name = "@debug",
421                 .help = "Debugging, performance monitoring and tracing functionality",
422                 .value =
423                 "lookup_dcookie\0"
424                 "perf_event_open\0"
425                 "pidfd_getfd\0"
426                 "ptrace\0"
427                 "rtas\0"
428 #if defined __s390__ || defined __s390x__
429                 "s390_runtime_instr\0"
430 #endif
431                 "sys_debug_setcontext\0"
432         },
433         [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
434                 .name = "@file-system",
435                 .help = "File system operations",
436                 .value =
437                 "access\0"
438                 "chdir\0"
439                 "chmod\0"
440                 "close\0"
441                 "creat\0"
442                 "faccessat\0"
443                 "faccessat2\0"
444                 "fallocate\0"
445                 "fchdir\0"
446                 "fchmod\0"
447                 "fchmodat\0"
448                 "fcntl\0"
449                 "fcntl64\0"
450                 "fgetxattr\0"
451                 "flistxattr\0"
452                 "fremovexattr\0"
453                 "fsetxattr\0"
454                 "fstat\0"
455                 "fstat64\0"
456                 "fstatat64\0"
457                 "fstatfs\0"
458                 "fstatfs64\0"
459                 "ftruncate\0"
460                 "ftruncate64\0"
461                 "futimesat\0"
462                 "getcwd\0"
463                 "getdents\0"
464                 "getdents64\0"
465                 "getxattr\0"
466                 "inotify_add_watch\0"
467                 "inotify_init\0"
468                 "inotify_init1\0"
469                 "inotify_rm_watch\0"
470                 "lgetxattr\0"
471                 "link\0"
472                 "linkat\0"
473                 "listxattr\0"
474                 "llistxattr\0"
475                 "lremovexattr\0"
476                 "lsetxattr\0"
477                 "lstat\0"
478                 "lstat64\0"
479                 "mkdir\0"
480                 "mkdirat\0"
481                 "mknod\0"
482                 "mknodat\0"
483                 "newfstatat\0"
484                 "oldfstat\0"
485                 "oldlstat\0"
486                 "oldstat\0"
487                 "open\0"
488                 "openat\0"
489                 "openat2\0"
490                 "readlink\0"
491                 "readlinkat\0"
492                 "removexattr\0"
493                 "rename\0"
494                 "renameat\0"
495                 "renameat2\0"
496                 "rmdir\0"
497                 "setxattr\0"
498                 "stat\0"
499                 "stat64\0"
500                 "statfs\0"
501                 "statfs64\0"
502                 "statx\0"
503                 "symlink\0"
504                 "symlinkat\0"
505                 "truncate\0"
506                 "truncate64\0"
507                 "unlink\0"
508                 "unlinkat\0"
509                 "utime\0"
510                 "utimensat\0"
511                 "utimensat_time64\0"
512                 "utimes\0"
513         },
514         [SYSCALL_FILTER_SET_IO_EVENT] = {
515                 .name = "@io-event",
516                 .help = "Event loop system calls",
517                 .value =
518                 "_newselect\0"
519                 "epoll_create\0"
520                 "epoll_create1\0"
521                 "epoll_ctl\0"
522                 "epoll_ctl_old\0"
523                 "epoll_pwait\0"
524                 "epoll_pwait2\0"
525                 "epoll_wait\0"
526                 "epoll_wait_old\0"
527                 "eventfd\0"
528                 "eventfd2\0"
529                 "poll\0"
530                 "ppoll\0"
531                 "ppoll_time64\0"
532                 "pselect6\0"
533                 "pselect6_time64\0"
534                 "select\0"
535         },
536         [SYSCALL_FILTER_SET_IPC] = {
537                 .name = "@ipc",
538                 .help = "SysV IPC, POSIX Message Queues or other IPC",
539                 .value =
540                 "ipc\0"
541                 "memfd_create\0"
542                 "mq_getsetattr\0"
543                 "mq_notify\0"
544                 "mq_open\0"
545                 "mq_timedreceive\0"
546                 "mq_timedreceive_time64\0"
547                 "mq_timedsend\0"
548                 "mq_timedsend_time64\0"
549                 "mq_unlink\0"
550                 "msgctl\0"
551                 "msgget\0"
552                 "msgrcv\0"
553                 "msgsnd\0"
554                 "pipe\0"
555                 "pipe2\0"
556                 "process_madvise\0"
557                 "process_vm_readv\0"
558                 "process_vm_writev\0"
559                 "semctl\0"
560                 "semget\0"
561                 "semop\0"
562                 "semtimedop\0"
563                 "semtimedop_time64\0"
564                 "shmat\0"
565                 "shmctl\0"
566                 "shmdt\0"
567                 "shmget\0"
568         },
569         [SYSCALL_FILTER_SET_KEYRING] = {
570                 .name = "@keyring",
571                 .help = "Kernel keyring access",
572                 .value =
573                 "add_key\0"
574                 "keyctl\0"
575                 "request_key\0"
576         },
577         [SYSCALL_FILTER_SET_MEMLOCK] = {
578                 .name = "@memlock",
579                 .help = "Memory locking control",
580                 .value =
581                 "mlock\0"
582                 "mlock2\0"
583                 "mlockall\0"
584                 "munlock\0"
585                 "munlockall\0"
586         },
587         [SYSCALL_FILTER_SET_MODULE] = {
588                 .name = "@module",
589                 .help = "Loading and unloading of kernel modules",
590                 .value =
591                 "delete_module\0"
592                 "finit_module\0"
593                 "init_module\0"
594         },
595         [SYSCALL_FILTER_SET_MOUNT] = {
596                 .name = "@mount",
597                 .help = "Mounting and unmounting of file systems",
598                 .value =
599                 "chroot\0"
600                 "fsconfig\0"
601                 "fsmount\0"
602                 "fsopen\0"
603                 "fspick\0"
604                 "mount\0"
605                 "mount_setattr\0"
606                 "move_mount\0"
607                 "open_tree\0"
608                 "pivot_root\0"
609                 "umount\0"
610                 "umount2\0"
611         },
612         [SYSCALL_FILTER_SET_NETWORK_IO] = {
613                 .name = "@network-io",
614                 .help = "Network or Unix socket IO, should not be needed if not network facing",
615                 .value =
616                 "accept\0"
617                 "accept4\0"
618                 "bind\0"
619                 "connect\0"
620                 "getpeername\0"
621                 "getsockname\0"
622                 "getsockopt\0"
623                 "listen\0"
624                 "recv\0"
625                 "recvfrom\0"
626                 "recvmmsg\0"
627                 "recvmmsg_time64\0"
628                 "recvmsg\0"
629                 "send\0"
630                 "sendmmsg\0"
631                 "sendmsg\0"
632                 "sendto\0"
633                 "setsockopt\0"
634                 "shutdown\0"
635                 "socket\0"
636                 "socketcall\0"
637                 "socketpair\0"
638         },
639         [SYSCALL_FILTER_SET_OBSOLETE] = {
640                 /* some unknown even to libseccomp */
641                 .name = "@obsolete",
642                 .help = "Unusual, obsolete or unimplemented system calls",
643                 .value =
644                 "_sysctl\0"
645                 "afs_syscall\0"
646                 "bdflush\0"
647                 "break\0"
648                 "create_module\0"
649                 "ftime\0"
650                 "get_kernel_syms\0"
651                 "getpmsg\0"
652                 "gtty\0"
653                 "idle\0"
654                 "lock\0"
655                 "mpx\0"
656                 "prof\0"
657                 "profil\0"
658                 "putpmsg\0"
659                 "query_module\0"
660                 "security\0"
661                 "sgetmask\0"
662                 "ssetmask\0"
663                 "stime\0"
664                 "stty\0"
665                 "sysfs\0"
666                 "tuxcall\0"
667                 "ulimit\0"
668                 "uselib\0"
669                 "ustat\0"
670                 "vserver\0"
671         },
672         [SYSCALL_FILTER_SET_PKEY] = {
673                 .name = "@pkey",
674                 .help = "System calls used for memory protection keys",
675                 .value =
676                 "pkey_alloc\0"
677                 "pkey_free\0"
678                 "pkey_mprotect\0"
679         },
680         [SYSCALL_FILTER_SET_PRIVILEGED] = {
681                 .name = "@privileged",
682                 .help = "All system calls which need super-user capabilities",
683                 .value =
684                 "@chown\0"
685                 "@clock\0"
686                 "@module\0"
687                 "@raw-io\0"
688                 "@reboot\0"
689                 "@swap\0"
690                 "_sysctl\0"
691                 "acct\0"
692                 "bpf\0"
693                 "capset\0"
694                 "chroot\0"
695                 "fanotify_init\0"
696                 "fanotify_mark\0"
697                 "nfsservctl\0"
698                 "open_by_handle_at\0"
699                 "pivot_root\0"
700                 "quotactl\0"
701                 "setdomainname\0"
702                 "setfsuid\0"
703                 "setfsuid32\0"
704                 "setgroups\0"
705                 "setgroups32\0"
706                 "sethostname\0"
707                 "setresuid\0"
708                 "setresuid32\0"
709                 "setreuid\0"
710                 "setreuid32\0"
711                 "setuid\0"      /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
712                 "setuid32\0"
713                 "vhangup\0"
714         },
715         [SYSCALL_FILTER_SET_PROCESS] = {
716                 .name = "@process",
717                 .help = "Process control, execution, namespacing operations",
718                 .value =
719                 "capget\0"      /* Able to query arbitrary processes */
720                 "clone\0"
721                 /* ia64 as the only architecture has clone2, a replacement for clone, but ia64 doesn't
722                  * implement seccomp, so we don't need to list it at all. C.f.
723                  * acce2f71779c54086962fefce3833d886c655f62 in the kernel. */
724                 "clone3\0"
725                 "execveat\0"
726                 "fork\0"
727                 "getrusage\0"
728                 "kill\0"
729                 "pidfd_open\0"
730                 "pidfd_send_signal\0"
731                 "prctl\0"
732                 "rt_sigqueueinfo\0"
733                 "rt_tgsigqueueinfo\0"
734                 "setns\0"
735                 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
736                 "tgkill\0"
737                 "times\0"
738                 "tkill\0"
739                 "unshare\0"
740                 "vfork\0"
741                 "wait4\0"
742                 "waitid\0"
743                 "waitpid\0"
744         },
745         [SYSCALL_FILTER_SET_RAW_IO] = {
746                 .name = "@raw-io",
747                 .help = "Raw I/O port access",
748                 .value =
749                 "ioperm\0"
750                 "iopl\0"
751                 "pciconfig_iobase\0"
752                 "pciconfig_read\0"
753                 "pciconfig_write\0"
754 #if defined __s390__ || defined __s390x__
755                 "s390_pci_mmio_read\0"
756                 "s390_pci_mmio_write\0"
757 #endif
758         },
759         [SYSCALL_FILTER_SET_REBOOT] = {
760                 .name = "@reboot",
761                 .help = "Reboot and reboot preparation/kexec",
762                 .value =
763                 "kexec_file_load\0"
764                 "kexec_load\0"
765                 "reboot\0"
766         },
767         [SYSCALL_FILTER_SET_RESOURCES] = {
768                 .name = "@resources",
769                 .help = "Alter resource settings",
770                 .value =
771                 "ioprio_set\0"
772                 "mbind\0"
773                 "migrate_pages\0"
774                 "move_pages\0"
775                 "nice\0"
776                 "sched_setaffinity\0"
777                 "sched_setattr\0"
778                 "sched_setparam\0"
779                 "sched_setscheduler\0"
780                 "set_mempolicy\0"
781                 "setpriority\0"
782                 "setrlimit\0"
783         },
784         [SYSCALL_FILTER_SET_SETUID] = {
785                 .name = "@setuid",
786                 .help = "Operations for changing user/group credentials",
787                 .value =
788                 "setgid\0"
789                 "setgid32\0"
790                 "setgroups\0"
791                 "setgroups32\0"
792                 "setregid\0"
793                 "setregid32\0"
794                 "setresgid\0"
795                 "setresgid32\0"
796                 "setresuid\0"
797                 "setresuid32\0"
798                 "setreuid\0"
799                 "setreuid32\0"
800                 "setuid\0"
801                 "setuid32\0"
802         },
803         [SYSCALL_FILTER_SET_SIGNAL] = {
804                 .name = "@signal",
805                 .help = "Process signal handling",
806                 .value =
807                 "rt_sigaction\0"
808                 "rt_sigpending\0"
809                 "rt_sigprocmask\0"
810                 "rt_sigsuspend\0"
811                 "rt_sigtimedwait\0"
812                 "rt_sigtimedwait_time64\0"
813                 "sigaction\0"
814                 "sigaltstack\0"
815                 "signal\0"
816                 "signalfd\0"
817                 "signalfd4\0"
818                 "sigpending\0"
819                 "sigprocmask\0"
820                 "sigsuspend\0"
821         },
822         [SYSCALL_FILTER_SET_SWAP] = {
823                 .name = "@swap",
824                 .help = "Enable/disable swap devices",
825                 .value =
826                 "swapoff\0"
827                 "swapon\0"
828         },
829         [SYSCALL_FILTER_SET_SYNC] = {
830                 .name = "@sync",
831                 .help = "Synchronize files and memory to storage",
832                 .value =
833                 "fdatasync\0"
834                 "fsync\0"
835                 "msync\0"
836                 "sync\0"
837                 "sync_file_range\0"
838                 "sync_file_range2\0"
839                 "syncfs\0"
840         },
841         [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
842                 .name = "@system-service",
843                 .help = "General system service operations",
844                 .value =
845                 "@aio\0"
846                 "@basic-io\0"
847                 "@chown\0"
848                 "@default\0"
849                 "@file-system\0"
850                 "@io-event\0"
851                 "@ipc\0"
852                 "@keyring\0"
853                 "@memlock\0"
854                 "@network-io\0"
855                 "@process\0"
856                 "@resources\0"
857                 "@setuid\0"
858                 "@signal\0"
859                 "@sync\0"
860                 "@timer\0"
861                 "capget\0"
862                 "capset\0"
863                 "copy_file_range\0"
864                 "fadvise64\0"
865                 "fadvise64_64\0"
866                 "flock\0"
867                 "get_mempolicy\0"
868                 "getcpu\0"
869                 "getpriority\0"
870                 "ioctl\0"
871                 "ioprio_get\0"
872                 "kcmp\0"
873                 "madvise\0"
874                 "mremap\0"
875                 "name_to_handle_at\0"
876                 "oldolduname\0"
877                 "olduname\0"
878                 "personality\0"
879                 "readahead\0"
880                 "readdir\0"
881                 "remap_file_pages\0"
882                 "sched_get_priority_max\0"
883                 "sched_get_priority_min\0"
884                 "sched_getattr\0"
885                 "sched_getparam\0"
886                 "sched_getscheduler\0"
887                 "sched_rr_get_interval\0"
888                 "sched_rr_get_interval_time64\0"
889                 "sched_yield\0"
890                 "sendfile\0"
891                 "sendfile64\0"
892                 "setfsgid\0"
893                 "setfsgid32\0"
894                 "setfsuid\0"
895                 "setfsuid32\0"
896                 "setpgid\0"
897                 "setsid\0"
898                 "splice\0"
899                 "sysinfo\0"
900                 "tee\0"
901                 "umask\0"
902                 "uname\0"
903                 "userfaultfd\0"
904                 "vmsplice\0"
905         },
906         [SYSCALL_FILTER_SET_TIMER] = {
907                 .name = "@timer",
908                 .help = "Schedule operations by time",
909                 .value =
910                 "alarm\0"
911                 "getitimer\0"
912                 "setitimer\0"
913                 "timer_create\0"
914                 "timer_delete\0"
915                 "timer_getoverrun\0"
916                 "timer_gettime\0"
917                 "timer_gettime64\0"
918                 "timer_settime\0"
919                 "timer_settime64\0"
920                 "timerfd_create\0"
921                 "timerfd_gettime\0"
922                 "timerfd_gettime64\0"
923                 "timerfd_settime\0"
924                 "timerfd_settime64\0"
925                 "times\0"
926         },
927         [SYSCALL_FILTER_SET_KNOWN] = {
928                 .name = "@known",
929                 .help = "All known syscalls declared in the kernel",
930                 .value =
931 #include "syscall-list.h"
932         },
933 };
934 
syscall_filter_set_find(const char * name)935 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
936         if (isempty(name) || name[0] != '@')
937                 return NULL;
938 
939         for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
940                 if (streq(syscall_filter_sets[i].name, name))
941                         return syscall_filter_sets + i;
942 
943         return NULL;
944 }
945 
946 static int add_syscall_filter_set(
947                 scmp_filter_ctx seccomp,
948                 const SyscallFilterSet *set,
949                 uint32_t action,
950                 char **exclude,
951                 bool log_missing,
952                 char ***added);
953 
seccomp_add_syscall_filter_item(scmp_filter_ctx * seccomp,const char * name,uint32_t action,char ** exclude,bool log_missing,char *** added)954 int seccomp_add_syscall_filter_item(
955                 scmp_filter_ctx *seccomp,
956                 const char *name,
957                 uint32_t action,
958                 char **exclude,
959                 bool log_missing,
960                 char ***added) {
961 
962         assert(seccomp);
963         assert(name);
964 
965         if (strv_contains(exclude, name))
966                 return 0;
967 
968         /* Any syscalls that are handled are added to the *added strv. The pointer
969          * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
970 
971         if (name[0] == '@') {
972                 const SyscallFilterSet *other;
973 
974                 other = syscall_filter_set_find(name);
975                 if (!other)
976                         return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
977                                                "Filter set %s is not known!",
978                                                name);
979 
980                 return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
981 
982         } else {
983                 int id, r;
984 
985                 id = seccomp_syscall_resolve_name(name);
986                 if (id == __NR_SCMP_ERROR) {
987                         if (log_missing)
988                                 log_debug("System call %s is not known, ignoring.", name);
989                         return 0;
990                 }
991 
992                 r = seccomp_rule_add_exact(seccomp, action, id, 0);
993                 if (r < 0) {
994                         /* If the system call is not known on this architecture, then that's fine, let's ignore it */
995                         bool ignore = r == -EDOM;
996 
997                         if (!ignore || log_missing)
998                                 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
999                                                 name, id, ignore ? ", ignoring" : "");
1000                         if (!ignore)
1001                                 return r;
1002                 }
1003 
1004                 if (added) {
1005                         r = strv_extend(added, name);
1006                         if (r < 0)
1007                                 return r;
1008                 }
1009 
1010                 return 0;
1011         }
1012 }
1013 
/* Adds every member listed in set->value (a NUL-separated string list) to the seccomp context by
 * passing it to seccomp_add_syscall_filter_item(). Stops at the first member that fails and returns
 * its negative error code; returns 0 if all members were processed. */
static int add_syscall_filter_set(
                scmp_filter_ctx seccomp,
                const SyscallFilterSet *set,
                uint32_t action,
                char **exclude,
                bool log_missing,
                char ***added) {

        const char *member;

        assert(seccomp);
        assert(set);

        /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */

        NULSTR_FOREACH(member, set->value) {
                int r;

                r = seccomp_add_syscall_filter_item(seccomp, member, action, exclude, log_missing, added);
                if (r < 0)
                        return r;
        }

        return 0;
}
1038 
seccomp_load_syscall_filter_set(uint32_t default_action,const SyscallFilterSet * set,uint32_t action,bool log_missing)1039 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
1040         uint32_t arch;
1041         int r;
1042 
1043         assert(set);
1044 
1045         /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
1046          * each local arch. */
1047 
1048         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1049                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1050 
1051                 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1052 
1053                 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1054                 if (r < 0)
1055                         return r;
1056 
1057                 r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, NULL);
1058                 if (r < 0)
1059                         return log_debug_errno(r, "Failed to add filter set: %m");
1060 
1061                 r = seccomp_load(seccomp);
1062                 if (ERRNO_IS_SECCOMP_FATAL(r))
1063                         return r;
1064                 if (r < 0)
1065                         log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1066         }
1067 
1068         return 0;
1069 }
1070 
/* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead of a
 * SyscallFilterSet* table.
 *
 * Each key of the hashmap encodes a system call number plus one, each value an error number:
 * SECCOMP_ERROR_NUMBER_KILL selects the kill-process action, any other non-negative value is turned
 * into SCMP_ACT_ERRNO(value), and a negative value leaves 'action' in effect. If 'action' is
 * SCMP_ACT_LOG (where available) it takes precedence over a per-entry errno.
 *
 * One filter is built and loaded per local architecture. Rules that seccomp_rule_add_exact() refuses
 * with -EDOM are ignored (logged only if 'log_missing' is set); any other failure to add a rule is
 * returned. Load errors classified as fatal by ERRNO_IS_SECCOMP_FATAL() are returned, other load
 * errors are logged at debug level and the architecture is skipped. Returns 0 otherwise. */
int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* filter, uint32_t action, bool log_missing) {
        uint32_t arch;
        int r;

        /* Nothing listed and everything allowed by default: there is nothing to install */
        if (hashmap_isempty(filter) && default_action == SCMP_ACT_ALLOW)
                return 0;

        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
                _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
                void *syscall_id, *val;

                log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));

                r = seccomp_init_for_arch(&seccomp, arch, default_action);
                if (r < 0)
                        return r;

                HASHMAP_FOREACH_KEY(val, syscall_id, filter) {
                        uint32_t a = action;
                        /* Keys are stored off by one, undo that here */
                        int id = PTR_TO_INT(syscall_id) - 1;
                        int error = PTR_TO_INT(val);

                        if (error == SECCOMP_ERROR_NUMBER_KILL)
                                a = scmp_act_kill_process();
#ifdef SCMP_ACT_LOG
                        else if (action == SCMP_ACT_LOG)
                                a = SCMP_ACT_LOG;
#endif
                        else if (error >= 0)
                                a = SCMP_ACT_ERRNO(error);

                        r = seccomp_rule_add_exact(seccomp, a, id, 0);
                        if (r < 0) {
                                /* If the system call is not known on this architecture, then that's
                                 * fine, let's ignore it */
                                _cleanup_free_ char *n = NULL;
                                bool ignore;

                                /* Resolve the number back to a name, purely for the log message */
                                n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
                                ignore = r == -EDOM;
                                if (!ignore || log_missing)
                                        log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
                                                        strna(n), id, ignore ? ", ignoring" : "");
                                if (!ignore)
                                        return r;
                        }
                }

                r = seccomp_load(seccomp);
                if (ERRNO_IS_SECCOMP_FATAL(r))
                        return r;
                if (r < 0)
                        log_debug_errno(r, "Failed to install system call filter for architecture %s, skipping: %m",
                                        seccomp_arch_to_string(arch));
        }

        return 0;
}
1132 
/* Parses one system call filter entry and updates the 'filter' hashmap accordingly. 'name' is either
 * a filter set name (prefixed with "@"), in which case every member of the set is parsed
 * recursively, or the name of a single system call.
 *
 * Entries are stored with the system call number plus one as key and 'errno_num' as value. Depending
 * on the SECCOMP_PARSE_INVERT and SECCOMP_PARSE_ALLOW_LIST flags the entry is either inserted (an
 * existing entry gets its value replaced) or removed.
 *
 * Unknown names yield -EINVAL, unless SECCOMP_PARSE_PERMISSIVE is set, in which case they are logged
 * (as warning with SECCOMP_PARSE_LOG, at debug level otherwise) and ignored. A non-negative
 * 'errno_num' without SECCOMP_PARSE_INVERT is refused with -EINVAL. Returns 0 on success. */
int seccomp_parse_syscall_filter(
                const char *name,
                int errno_num,
                Hashmap *filter,
                SeccompParseFlags flags,
                const char *unit,
                const char *filename,
                unsigned line) {

        bool invert, allow_list, permissive;
        int log_level, id, r;
        void *key;

        assert(name);
        assert(filter);

        invert = FLAGS_SET(flags, SECCOMP_PARSE_INVERT);
        allow_list = FLAGS_SET(flags, SECCOMP_PARSE_ALLOW_LIST);
        permissive = FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE);
        log_level = FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG;

        if (!invert && errno_num >= 0)
                return -EINVAL;

        if (name[0] == '@') {
                const SyscallFilterSet *set;
                const char *member;

                set = syscall_filter_set_find(name);
                if (!set) {
                        if (!permissive)
                                return -EINVAL;

                        log_syntax(unit, log_level, filename, line, 0,
                                   "Unknown system call group, ignoring: %s", name);
                        return 0;
                }

                NULSTR_FOREACH(member, set->value) {
                        /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
                         * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
                         * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
                         * about them. */
                        r = seccomp_parse_syscall_filter(member, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
                        if (r < 0)
                                return r;
                }

                return 0;
        }

        /* A single system call name from here on */
        id = seccomp_syscall_resolve_name(name);
        if (id == __NR_SCMP_ERROR) {
                if (!permissive)
                        return -EINVAL;

                log_syntax(unit, log_level, filename, line, 0,
                           "Failed to parse system call, ignoring: %s", name);
                return 0;
        }

        /* Keys are shifted by one */
        key = INT_TO_PTR(id + 1);

        /* If we previously wanted to forbid a syscall and now we want to allow it, then remove
         * it from the list. The entries in allow-list with non-negative error value will be
         * handled with SCMP_ACT_ERRNO() instead of the default action. */
        if (invert != allow_list || (invert && allow_list && errno_num >= 0)) {
                r = hashmap_put(filter, key, INT_TO_PTR(errno_num));
                if (r == -ENOMEM)
                        return FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? log_oom() : -ENOMEM;
                if (r == -EEXIST)
                        /* Already listed, replace the stored error number */
                        assert_se(hashmap_update(filter, key, INT_TO_PTR(errno_num)) == 0);
                else if (r < 0)
                        return r;
        } else
                (void) hashmap_remove(filter, key);

        return 0;
}
1208 
seccomp_restrict_namespaces(unsigned long retain)1209 int seccomp_restrict_namespaces(unsigned long retain) {
1210         uint32_t arch;
1211         int r;
1212 
1213         if (DEBUG_LOGGING) {
1214                 _cleanup_free_ char *s = NULL;
1215 
1216                 (void) namespace_flags_to_string(retain, &s);
1217                 log_debug("Restricting namespace to: %s.", strna(s));
1218         }
1219 
1220         /* NOOP? */
1221         if (FLAGS_SET(retain, NAMESPACE_FLAGS_ALL))
1222                 return 0;
1223 
1224         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1225                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1226 
1227                 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1228 
1229                 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1230                 if (r < 0)
1231                         return r;
1232 
1233                 /* We cannot filter on individual flags to clone3(), and we need to disable the
1234                  * syscall altogether. ENOSYS is used instead of EPERM, so that glibc and other
1235                  * users shall fall back to clone(), as if on an older kernel.
1236                  *
1237                  * C.f. https://github.com/flatpak/flatpak/commit/a10f52a7565c549612c92b8e736a6698a53db330,
1238                  * https://github.com/moby/moby/issues/42680. */
1239 
1240                 r = seccomp_rule_add_exact(
1241                                 seccomp,
1242                                 SCMP_ACT_ERRNO(ENOSYS),
1243                                 SCMP_SYS(clone3),
1244                                 0);
1245                 if (r < 0)
1246                         log_debug_errno(r, "Failed to add clone3() rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
1247 
1248                 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1249                         /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1250                          * altogether. */
1251                         r = seccomp_rule_add_exact(
1252                                         seccomp,
1253                                         SCMP_ACT_ERRNO(EPERM),
1254                                         SCMP_SYS(setns),
1255                                         0);
1256                 else
1257                         /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1258                          * special invocation with a zero flags argument, right here. */
1259                         r = seccomp_rule_add_exact(
1260                                         seccomp,
1261                                         SCMP_ACT_ERRNO(EPERM),
1262                                         SCMP_SYS(setns),
1263                                         1,
1264                                         SCMP_A1(SCMP_CMP_EQ, 0));
1265                 if (r < 0) {
1266                         log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1267                         continue;
1268                 }
1269 
1270                 for (unsigned i = 0; namespace_flag_map[i].name; i++) {
1271                         unsigned long f;
1272 
1273                         f = namespace_flag_map[i].flag;
1274                         if (FLAGS_SET(retain, f)) {
1275                                 log_debug("Permitting %s.", namespace_flag_map[i].name);
1276                                 continue;
1277                         }
1278 
1279                         log_debug("Blocking %s.", namespace_flag_map[i].name);
1280 
1281                         r = seccomp_rule_add_exact(
1282                                         seccomp,
1283                                         SCMP_ACT_ERRNO(EPERM),
1284                                         SCMP_SYS(unshare),
1285                                         1,
1286                                         SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1287                         if (r < 0) {
1288                                 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1289                                 break;
1290                         }
1291 
1292                         /* On s390/s390x the first two parameters to clone are switched */
1293                         if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1294                                 r = seccomp_rule_add_exact(
1295                                                 seccomp,
1296                                                 SCMP_ACT_ERRNO(EPERM),
1297                                                 SCMP_SYS(clone),
1298                                                 1,
1299                                                 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1300                         else
1301                                 r = seccomp_rule_add_exact(
1302                                                 seccomp,
1303                                                 SCMP_ACT_ERRNO(EPERM),
1304                                                 SCMP_SYS(clone),
1305                                                 1,
1306                                                 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1307                         if (r < 0) {
1308                                 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1309                                 break;
1310                         }
1311 
1312                         if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1313                                 r = seccomp_rule_add_exact(
1314                                                 seccomp,
1315                                                 SCMP_ACT_ERRNO(EPERM),
1316                                                 SCMP_SYS(setns),
1317                                                 1,
1318                                                 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1319                                 if (r < 0) {
1320                                         log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1321                                         break;
1322                                 }
1323                         }
1324                 }
1325                 if (r < 0)
1326                         continue;
1327 
1328                 r = seccomp_load(seccomp);
1329                 if (ERRNO_IS_SECCOMP_FATAL(r))
1330                         return r;
1331                 if (r < 0)
1332                         log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1333         }
1334 
1335         return 0;
1336 }
1337 
seccomp_protect_sysctl(void)1338 int seccomp_protect_sysctl(void) {
1339         uint32_t arch;
1340         int r;
1341 
1342         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1343                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1344 
1345                 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1346 
1347                 if (IN_SET(arch,
1348                            SCMP_ARCH_AARCH64,
1349 #ifdef SCMP_ARCH_RISCV64
1350                            SCMP_ARCH_RISCV64,
1351 #endif
1352                            SCMP_ARCH_X32
1353                           ))
1354                         /* No _sysctl syscall */
1355                         continue;
1356 
1357                 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1358                 if (r < 0)
1359                         return r;
1360 
1361                 r = seccomp_rule_add_exact(
1362                                 seccomp,
1363                                 SCMP_ACT_ERRNO(EPERM),
1364                                 SCMP_SYS(_sysctl),
1365                                 0);
1366                 if (r < 0) {
1367                         log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1368                         continue;
1369                 }
1370 
1371                 r = seccomp_load(seccomp);
1372                 if (ERRNO_IS_SECCOMP_FATAL(r))
1373                         return r;
1374                 if (r < 0)
1375                         log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1376         }
1377 
1378         return 0;
1379 }
1380 
seccomp_protect_syslog(void)1381 int seccomp_protect_syslog(void) {
1382         uint32_t arch;
1383         int r;
1384 
1385         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1386                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1387 
1388                 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1389                 if (r < 0)
1390                         return r;
1391 
1392                 r = seccomp_rule_add_exact(
1393                                 seccomp,
1394                                 SCMP_ACT_ERRNO(EPERM),
1395                                 SCMP_SYS(syslog),
1396                                 0);
1397 
1398                 if (r < 0) {
1399                         log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1400                         continue;
1401                 }
1402 
1403                 r = seccomp_load(seccomp);
1404                 if (ERRNO_IS_SECCOMP_FATAL(r))
1405                         return r;
1406                 if (r < 0)
1407                         log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1408         }
1409 
1410         return 0;
1411 }
1412 
/* Installs seccomp filters, one per supported local architecture, that make socket() fail with
 * EAFNOSUPPORT for certain address families. If 'allow_list' is true, every family not contained in
 * 'address_families' is refused; otherwise exactly the families contained in the set are refused.
 *
 * Only the architectures listed as supported below are processed, all others are skipped silently.
 * Failure to set up the context and load errors classified as fatal by ERRNO_IS_SECCOMP_FATAL() are
 * returned. Failure to add a rule, or any other load error, is logged at debug level and the
 * architecture is skipped. Returns 0 otherwise. */
int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
        uint32_t arch;
        int r;

        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
                _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
                const char *arch_name = seccomp_arch_to_string(arch);
                bool supported = false;

                log_debug("Operating on architecture: %s", arch_name);

                switch (arch) {

                case SCMP_ARCH_X86_64:
                case SCMP_ARCH_X32:
                case SCMP_ARCH_ARM:
                case SCMP_ARCH_AARCH64:
                case SCMP_ARCH_MIPSEL64N32:
                case SCMP_ARCH_MIPS64N32:
                case SCMP_ARCH_MIPSEL64:
                case SCMP_ARCH_MIPS64:
#ifdef SCMP_ARCH_RISCV64
                case SCMP_ARCH_RISCV64:
#endif
                        /* These we know we support (i.e. are the ones that do not use socketcall()) */
                        supported = true;
                        break;

                case SCMP_ARCH_S390:
                case SCMP_ARCH_S390X:
                case SCMP_ARCH_X86:
                case SCMP_ARCH_MIPSEL:
                case SCMP_ARCH_MIPS:
                case SCMP_ARCH_PPC:
                case SCMP_ARCH_PPC64:
                case SCMP_ARCH_PPC64LE:
                default:
                        /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
                         * don't know */
                        break;
                }

                if (!supported)
                        continue;

                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
                if (r < 0)
                        return r;

                if (!allow_list) {
                        void *af;

                        /* If this is a deny list, then generate one rule for each address family that are
                         * then combined in OR checks. */

                        SET_FOREACH(af, address_families) {
                                r = seccomp_rule_add_exact(
                                                seccomp,
                                                SCMP_ACT_ERRNO(EAFNOSUPPORT),
                                                SCMP_SYS(socket),
                                                1,
                                                SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
                                if (r < 0)
                                        break;
                        }
                        if (r < 0) {
                                log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", arch_name);
                                continue;
                        }

                } else {
                        const int max_af = af_max();
                        int first = 0, last = 0;
                        void *afp;

                        /* If this is an allow list, we first block the address families that are out of
                         * range and then everything that is not in the set. First, we find the lowest and
                         * highest address family in the set. */

                        SET_FOREACH(afp, address_families) {
                                int af = PTR_TO_INT(afp);

                                /* Entries outside of the known range do not take part in the bounds */
                                if (af <= 0 || af >= max_af)
                                        continue;

                                if (first == 0 || af < first)
                                        first = af;

                                if (last == 0 || af > last)
                                        last = af;
                        }

                        assert((first == 0) == (last == 0));

                        if (first != 0) {

                                /* Block everything below the first entry */
                                r = seccomp_rule_add_exact(
                                                seccomp,
                                                SCMP_ACT_ERRNO(EAFNOSUPPORT),
                                                SCMP_SYS(socket),
                                                1,
                                                SCMP_A0(SCMP_CMP_LT, first));
                                if (r < 0) {
                                        log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", arch_name);
                                        continue;
                                }

                                /* Block everything above the last entry */
                                r = seccomp_rule_add_exact(
                                                seccomp,
                                                SCMP_ACT_ERRNO(EAFNOSUPPORT),
                                                SCMP_SYS(socket),
                                                1,
                                                SCMP_A0(SCMP_CMP_GT, last));
                                if (r < 0) {
                                        log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", arch_name);
                                        continue;
                                }

                                /* Block every known address family that is not in the set, one rule each.
                                 * This takes care of the gaps between the first and last entry. */
                                for (int af = 1; af < max_af; af++) {

                                        if (set_contains(address_families, INT_TO_PTR(af)))
                                                continue;

                                        r = seccomp_rule_add_exact(
                                                        seccomp,
                                                        SCMP_ACT_ERRNO(EAFNOSUPPORT),
                                                        SCMP_SYS(socket),
                                                        1,
                                                        SCMP_A0(SCMP_CMP_EQ, af));
                                        if (r < 0)
                                                break;
                                }
                                if (r < 0) {
                                        log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", arch_name);
                                        continue;
                                }

                        } else {

                                /* No entries in the valid range, block everything */
                                r = seccomp_rule_add_exact(
                                                seccomp,
                                                SCMP_ACT_ERRNO(EAFNOSUPPORT),
                                                SCMP_SYS(socket),
                                                0);
                                if (r < 0) {
                                        log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", arch_name);
                                        continue;
                                }
                        }
                }

                r = seccomp_load(seccomp);
                if (ERRNO_IS_SECCOMP_FATAL(r))
                        return r;
                if (r < 0)
                        log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", arch_name);
        }

        return 0;
}
1576 
seccomp_restrict_realtime(void)1577 int seccomp_restrict_realtime(void) {
1578         static const int permitted_policies[] = {
1579                 SCHED_OTHER,
1580                 SCHED_BATCH,
1581                 SCHED_IDLE,
1582         };
1583 
1584         int r, max_policy = 0;
1585         uint32_t arch;
1586         unsigned i;
1587 
1588         /* Determine the highest policy constant we want to allow */
1589         for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1590                 if (permitted_policies[i] > max_policy)
1591                         max_policy = permitted_policies[i];
1592 
1593         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1594                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1595                 int p;
1596 
1597                 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1598 
1599                 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1600                 if (r < 0)
1601                         return r;
1602 
1603                 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1604                  * allow list. */
1605                 for (p = 0; p < max_policy; p++) {
1606                         bool good = false;
1607 
1608                         /* Check if this is in the allow list. */
1609                         for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1610                                 if (permitted_policies[i] == p) {
1611                                         good = true;
1612                                         break;
1613                                 }
1614 
1615                         if (good)
1616                                 continue;
1617 
1618                         /* Deny this policy */
1619                         r = seccomp_rule_add_exact(
1620                                         seccomp,
1621                                         SCMP_ACT_ERRNO(EPERM),
1622                                         SCMP_SYS(sched_setscheduler),
1623                                         1,
1624                                         SCMP_A1(SCMP_CMP_EQ, p));
1625                         if (r < 0) {
1626                                 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1627                                 continue;
1628                         }
1629                 }
1630 
1631                 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1632                  * are unsigned here, hence no need no check for < 0 values. */
1633                 r = seccomp_rule_add_exact(
1634                                 seccomp,
1635                                 SCMP_ACT_ERRNO(EPERM),
1636                                 SCMP_SYS(sched_setscheduler),
1637                                 1,
1638                                 SCMP_A1(SCMP_CMP_GT, max_policy));
1639                 if (r < 0) {
1640                         log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1641                         continue;
1642                 }
1643 
1644                 r = seccomp_load(seccomp);
1645                 if (ERRNO_IS_SECCOMP_FATAL(r))
1646                         return r;
1647                 if (r < 0)
1648                         log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1649         }
1650 
1651         return 0;
1652 }
1653 
/* Adds a rule to 'seccomp' that makes system call 'nr' fail with EPERM, optionally restricted by the
 * argument comparison 'arg' ('arg_cnt' tells how many comparisons are passed, 0 or 1). On failure a
 * debug message naming the system call and 'arch' is logged. Returns whatever
 * seccomp_rule_add_exact() returned. */
static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
                                      uint32_t arch,
                                      int nr,
                                      unsigned arg_cnt,
                                      const struct scmp_arg_cmp arg) {
        _cleanup_free_ char *syscall_name = NULL;
        int r;

        r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
        if (r >= 0)
                return r;

        /* Resolve the number back to a name purely for the log message; strna() covers the case
         * where no name could be determined. */
        syscall_name = seccomp_syscall_resolve_num_arch(arch, nr);
        log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
                        strna(syscall_name),
                        seccomp_arch_to_string(arch));

        return r;
}
1673 
/* Build-time sanity check: on the architectures listed below, SCMP_SYS() must resolve shmget(),
 * shmat() and shmdt() each to a positive number. */
#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
assert_cc(SCMP_SYS(shmget) > 0);
assert_cc(SCMP_SYS(shmat) > 0);
assert_cc(SCMP_SYS(shmdt) > 0);
#endif
1680 
seccomp_memory_deny_write_execute(void)1681 int seccomp_memory_deny_write_execute(void) {
1682         uint32_t arch;
1683         unsigned loaded = 0;
1684 
1685         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1686                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1687                 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
1688 
1689                 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1690 
1691                 switch (arch) {
1692 
1693                 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1694                  * We ignore that here, which means there's still a way to get writable/executable
1695                  * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1696 
1697                 case SCMP_ARCH_X86:
1698                 case SCMP_ARCH_S390:
1699                         filter_syscall = SCMP_SYS(mmap2);
1700                         block_syscall = SCMP_SYS(mmap);
1701                         /* shmat multiplexed, see above */
1702                         break;
1703 
1704                 case SCMP_ARCH_PPC:
1705                 case SCMP_ARCH_PPC64:
1706                 case SCMP_ARCH_PPC64LE:
1707                 case SCMP_ARCH_S390X:
1708                         filter_syscall = SCMP_SYS(mmap);
1709                         /* shmat multiplexed, see above */
1710                         break;
1711 
1712                 case SCMP_ARCH_ARM:
1713                         filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1714                         shmat_syscall = SCMP_SYS(shmat);
1715                         break;
1716 
1717                 case SCMP_ARCH_X86_64:
1718                 case SCMP_ARCH_X32:
1719                 case SCMP_ARCH_AARCH64:
1720 #ifdef SCMP_ARCH_RISCV64
1721                 case SCMP_ARCH_RISCV64:
1722 #endif
1723                         filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64 and riscv64 have only mmap */
1724                         shmat_syscall = SCMP_SYS(shmat);
1725                         break;
1726 
1727                 /* Please add more definitions here, if you port systemd to other architectures! */
1728 
1729 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
1730 #warning "Consider adding the right mmap() syscall definitions here!"
1731 #endif
1732                 }
1733 
1734                 /* Can't filter mmap() on this arch, then skip it */
1735                 if (filter_syscall == 0)
1736                         continue;
1737 
1738                 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1739                 if (r < 0)
1740                         return r;
1741 
1742                 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1743                                                1,
1744                                                SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1745                 if (r < 0)
1746                         continue;
1747 
1748                 if (block_syscall != 0) {
1749                         r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1750                         if (r < 0)
1751                                 continue;
1752                 }
1753 
1754                 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1755                                                1,
1756                                                SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1757                 if (r < 0)
1758                         continue;
1759 
1760                 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1761                                                1,
1762                                                SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1763                 if (r < 0)
1764                         continue;
1765 
1766                 if (shmat_syscall > 0) {
1767                         r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
1768                                                        1,
1769                                                        SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1770                         if (r < 0)
1771                                 continue;
1772                 }
1773 
1774                 r = seccomp_load(seccomp);
1775                 if (ERRNO_IS_SECCOMP_FATAL(r))
1776                         return r;
1777                 if (r < 0)
1778                         log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1779                                         seccomp_arch_to_string(arch));
1780                 loaded++;
1781         }
1782 
1783         if (loaded == 0)
1784                 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
1785 
1786         return loaded;
1787 }
1788 
int seccomp_restrict_archs(Set *archs) {
        _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
        bool blocked_new = false;
        int r;

        /* Installs a rule-less filter whose only effect is to limit the system call architectures to
         * those listed in 'archs' (entries are stored as architecture value + 1).
         *
         * There are some qualifications. The most important use, however, is to keep processes from
         * sidestepping system call restrictions via a broader (multiplexing) syscall that only exists
         * in a non-native architecture. There are no holes in this use case, at least so far. */

        /* libseccomp puts our "native" (current) architecture into the filter by default and we leave
         * it there: callers expect to be able to execve() afterwards to run a program with the
         * restrictions applied. */
        seccomp = seccomp_init(SCMP_ACT_ALLOW);
        if (!seccomp)
                return -ENOMEM;

        for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) {
                uint32_t arch = seccomp_local_archs[i];
                bool keep;

                /* Skip the native architecture (never blocked, see above) as well as entries an
                 * earlier call to this function already marked as blocked. */
                if (arch == seccomp_arch_native() || arch == SECCOMP_LOCAL_ARCH_BLOCKED)
                        continue;

                keep = set_contains(archs, UINT32_TO_PTR(arch + 1));

                /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since
                 * x32 syscalls should basically match x86-64 for everything except the pointer type.
                 * The important thing is that you can block the old 32-bit x86 syscalls.
                 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
                if (!keep && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32)
                        keep = set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1));

                if (keep) {
                        r = seccomp_arch_add(seccomp, arch);
                        if (r < 0 && r != -EEXIST)
                                return r;

                        continue;
                }

                /* Remember the decision in the global table, so that later passes over it see this
                 * architecture as blocked. */
                seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED;
                blocked_new = true;
        }

        /* Nothing new to block: every architecture this filter would block was blocked before. */
        if (!blocked_new)
                return 0;

        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
        if (r < 0)
                return r;

        r = seccomp_load(seccomp);
        if (ERRNO_IS_SECCOMP_FATAL(r))
                return r;
        if (r < 0)
                log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");

        return 0;
}
1855 
/* Converts the string list 'l' of architecture names into a set of architecture values, each stored
 * as value + 1. On success the set is handed over in *ret_archs and 0 is returned. Returns -EINVAL
 * if a name cannot be parsed and -ENOMEM if inserting into the set fails; *ret_archs is left
 * untouched in both cases. */
int parse_syscall_archs(char **l, Set **ret_archs) {
        _cleanup_set_free_ Set *collected = NULL;

        assert(l);
        assert(ret_archs);

        STRV_FOREACH(s, l) {
                uint32_t a;

                if (seccomp_arch_from_string(*s, &a) < 0)
                        return -EINVAL;

                if (set_ensure_put(&collected, NULL, UINT32_TO_PTR(a + 1)) < 0)
                        return -ENOMEM;
        }

        *ret_archs = TAKE_PTR(collected);
        return 0;
}
1878 
/* Walks the entries of 'set' and, for each system call name that resolves, either inserts it into
 * 'filter' (add == true; key is the syscall number + 1, value is -1) or removes it from 'filter'
 * (add == false). Entries starting with '@' name another set, which is processed recursively.
 * Returns 0 on success, -ENXIO if a referenced set is not found, or the error of hashmap_put(). */
int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
        const char *entry;
        int r;

        assert(set);

        NULSTR_FOREACH(entry, set->value) {
                int id;

                if (entry[0] == '@') {
                        const SyscallFilterSet *nested = syscall_filter_set_find(entry);

                        if (!nested)
                                return -ENXIO;

                        r = seccomp_filter_set_add(filter, add, nested);
                        if (r < 0)
                                return r;

                        continue;
                }

                id = seccomp_syscall_resolve_name(entry);
                if (id == __NR_SCMP_ERROR) {
                        log_debug("Couldn't resolve system call, ignoring: %s", entry);
                        continue;
                }

                if (!add) {
                        (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
                        continue;
                }

                r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
                if (r < 0)
                        return r;
        }

        return 0;
}
1917 
seccomp_lock_personality(unsigned long personality)1918 int seccomp_lock_personality(unsigned long personality) {
1919         uint32_t arch;
1920         int r;
1921 
1922         if (personality >= PERSONALITY_INVALID)
1923                 return -EINVAL;
1924 
1925         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1926                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1927 
1928                 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1929                 if (r < 0)
1930                         return r;
1931 
1932                 r = seccomp_rule_add_exact(
1933                                 seccomp,
1934                                 SCMP_ACT_ERRNO(EPERM),
1935                                 SCMP_SYS(personality),
1936                                 1,
1937                                 SCMP_A0(SCMP_CMP_NE, personality));
1938                 if (r < 0) {
1939                         log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1940                         continue;
1941                 }
1942 
1943                 r = seccomp_load(seccomp);
1944                 if (ERRNO_IS_SECCOMP_FATAL(r))
1945                         return r;
1946                 if (r < 0)
1947                         log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1948         }
1949 
1950         return 0;
1951 }
1952 
seccomp_protect_hostname(void)1953 int seccomp_protect_hostname(void) {
1954         uint32_t arch;
1955         int r;
1956 
1957         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1958                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1959 
1960                 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1961                 if (r < 0)
1962                         return r;
1963 
1964                 r = seccomp_rule_add_exact(
1965                                 seccomp,
1966                                 SCMP_ACT_ERRNO(EPERM),
1967                                 SCMP_SYS(sethostname),
1968                                 0);
1969                 if (r < 0) {
1970                         log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1971                         continue;
1972                 }
1973 
1974                 r = seccomp_rule_add_exact(
1975                                 seccomp,
1976                                 SCMP_ACT_ERRNO(EPERM),
1977                                 SCMP_SYS(setdomainname),
1978                                 0);
1979                 if (r < 0) {
1980                         log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1981                         continue;
1982                 }
1983 
1984                 r = seccomp_load(seccomp);
1985                 if (ERRNO_IS_SECCOMP_FATAL(r))
1986                         return r;
1987                 if (r < 0)
1988                         log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1989         }
1990 
1991         return 0;
1992 }
1993 
/* Adds one rule to 'seccomp' that makes system call 'nr' fail with EPERM whenever all bits of 'm' are
 * set in its argument number 'mode_arg'. 'name' is used for the debug message only. Returns the result
 * of seccomp_rule_add_exact(). */
static int seccomp_restrict_sxid_mode_arg(
                scmp_filter_ctx seccomp,
                int nr,
                const char *name,
                unsigned mode_arg,
                mode_t m) {

        int r;

        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EPERM),
                        nr,
                        1,
                        SCMP_CMP(mode_arg, SCMP_CMP_MASKED_EQ, m, m));
        if (r < 0)
                log_debug_errno(r, "Failed to add filter for %s: %m", name);

        return r;
}

static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
        /* Checks the mode_t parameter of the following system calls:
         *
         *       → chmod() + fchmod() + fchmodat() + fchmodat2()
         *       → open() + creat() + openat()
         *       → mkdir() + mkdirat()
         *       → mknod() + mknodat()
         *
         * openat2() is blocked entirely, see below.
         *
         * Each rule is added independently; a failure is logged and the next rule is tried.
         *
         * Returns error if *everything* failed, and 0 otherwise.
         */
        int r;
        bool any = false;

        r = seccomp_restrict_sxid_mode_arg(seccomp, SCMP_SYS(chmod), "chmod", 1, m);
        if (r >= 0)
                any = true;

        r = seccomp_restrict_sxid_mode_arg(seccomp, SCMP_SYS(fchmod), "fchmod", 1, m);
        if (r >= 0)
                any = true;

        r = seccomp_restrict_sxid_mode_arg(seccomp, SCMP_SYS(fchmodat), "fchmodat", 2, m);
        if (r >= 0)
                any = true;

#if defined(__SNR_fchmodat2)
        /* fchmodat2() takes the mode in the same position as fchmodat(); without this rule it would
         * be a way around the restriction on kernels that provide it. */
        r = seccomp_restrict_sxid_mode_arg(seccomp, SCMP_SYS(fchmodat2), "fchmodat2", 2, m);
        if (r >= 0)
                any = true;
#endif

        r = seccomp_restrict_sxid_mode_arg(seccomp, SCMP_SYS(mkdir), "mkdir", 1, m);
        if (r >= 0)
                any = true;

        r = seccomp_restrict_sxid_mode_arg(seccomp, SCMP_SYS(mkdirat), "mkdirat", 2, m);
        if (r >= 0)
                any = true;

        r = seccomp_restrict_sxid_mode_arg(seccomp, SCMP_SYS(mknod), "mknod", 1, m);
        if (r >= 0)
                any = true;

        r = seccomp_restrict_sxid_mode_arg(seccomp, SCMP_SYS(mknodat), "mknodat", 2, m);
        if (r >= 0)
                any = true;

        /* For open()/openat() the rule requires O_CREAT in the flags argument in addition to the
         * mode bits. */
        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EPERM),
                        SCMP_SYS(open),
                        2,
                        SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
                        SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
        if (r < 0)
                log_debug_errno(r, "Failed to add filter for open: %m");
        else
                any = true;

        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EPERM),
                        SCMP_SYS(openat),
                        2,
                        SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
                        SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
        if (r < 0)
                log_debug_errno(r, "Failed to add filter for openat: %m");
        else
                any = true;

#if defined(__SNR_openat2)
        /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
         * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
         * for now, since openat2() is very new and code generally needs fallback logic anyway to be
         * compatible with kernels that are not absolutely recent. We would normally return EPERM for a
         * policy check, but this isn't strictly a policy check. Instead, we return ENOSYS to force programs
         * to call open() or openat() instead. We can properly enforce policy for those functions. */
        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(ENOSYS),
                        SCMP_SYS(openat2),
                        0);
        if (r < 0)
                log_debug_errno(r, "Failed to add filter for openat2: %m");
        else
                any = true;
#endif

        /* creat() stays last, so that 'r' below is its result when nothing at all could be added. */
        r = seccomp_restrict_sxid_mode_arg(seccomp, SCMP_SYS(creat), "creat", 1, m);
        if (r >= 0)
                any = true;

        return any ? 0 : r;
}
2139 
seccomp_restrict_suid_sgid(void)2140 int seccomp_restrict_suid_sgid(void) {
2141         uint32_t arch;
2142         int r, k;
2143 
2144         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2145                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2146 
2147                 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2148                 if (r < 0)
2149                         return r;
2150 
2151                 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2152                 if (r < 0)
2153                         log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2154 
2155                 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2156                 if (k < 0)
2157                         log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2158 
2159                 if (r < 0 && k < 0)
2160                         continue;
2161 
2162                 r = seccomp_load(seccomp);
2163                 if (ERRNO_IS_SECCOMP_FATAL(r))
2164                         return r;
2165                 if (r < 0)
2166                         log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2167         }
2168 
2169         return 0;
2170 }
2171 
scmp_act_kill_process(void)2172 uint32_t scmp_act_kill_process(void) {
2173 
2174         /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2175          * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2176          * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2177          * for single-threaded apps does the right thing. */
2178 
2179 #ifdef SCMP_ACT_KILL_PROCESS
2180         if (seccomp_api_get() >= 3)
2181                 return SCMP_ACT_KILL_PROCESS;
2182 #endif
2183 
2184         return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2185 }
2186 
parse_syscall_and_errno(const char * in,char ** name,int * error)2187 int parse_syscall_and_errno(const char *in, char **name, int *error) {
2188         _cleanup_free_ char *n = NULL;
2189         char *p;
2190         int e = -1;
2191 
2192         assert(in);
2193         assert(name);
2194         assert(error);
2195 
2196         /*
2197          * This parse "syscall:errno" like "uname:EILSEQ", "@sync:255".
2198          * If errno is omitted, then error is set to -1.
2199          * Empty syscall name is not allowed.
2200          * Here, we do not check that the syscall name is valid or not.
2201          */
2202 
2203         p = strchr(in, ':');
2204         if (p) {
2205                 e = seccomp_parse_errno_or_action(p + 1);
2206                 if (e < 0)
2207                         return e;
2208 
2209                 n = strndup(in, p - in);
2210         } else
2211                 n = strdup(in);
2212 
2213         if (!n)
2214                 return -ENOMEM;
2215 
2216         if (isempty(n))
2217                 return -EINVAL;
2218 
2219         *error = e;
2220         *name = TAKE_PTR(n);
2221 
2222         return 0;
2223 }
2224 
static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
        unsigned n_added = 0;
        int r;

        /* Makes open() and openat() fail with EINVAL when called with the specified flag (O_SYNC or
         * so), in the hope that client code then retries without it. Each rule is added
         * independently. Returns 0 if at least one rule could be added, and the error of the last
         * attempt otherwise. */

        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EINVAL),
                        SCMP_SYS(open),
                        1,
                        SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
        if (r >= 0)
                n_added++;
        else
                log_debug_errno(r, "Failed to add filter for open: %m");

        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(EINVAL),
                        SCMP_SYS(openat),
                        1,
                        SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
        if (r >= 0)
                n_added++;
        else
                log_debug_errno(r, "Failed to add filter for openat: %m");

#if defined(__SNR_openat2)
        /* openat2() passes its flags through a structure in memory, which cannot be filtered
         * sensibly; refuse the call as a whole with ENOSYS. */
        r = seccomp_rule_add_exact(
                        seccomp,
                        SCMP_ACT_ERRNO(ENOSYS),
                        SCMP_SYS(openat2),
                        0);
        if (r >= 0)
                n_added++;
        else
                log_debug_errno(r, "Failed to add filter for openat2: %m");
#endif

        return n_added > 0 ? 0 : r;
}
2269 
seccomp_suppress_sync(void)2270 int seccomp_suppress_sync(void) {
2271         uint32_t arch;
2272         int r;
2273 
2274         /* This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, and separately
2275          * manageable, and also masks O_SYNC/O_DSYNC */
2276 
2277         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2278                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2279                 const char *c;
2280 
2281                 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2282                 if (r < 0)
2283                         return r;
2284 
2285                 NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
2286                         int id;
2287 
2288                         id = seccomp_syscall_resolve_name(c);
2289                         if (id == __NR_SCMP_ERROR) {
2290                                 log_debug("System call %s is not known, ignoring.", c);
2291                                 continue;
2292                         }
2293 
2294                         r = seccomp_rule_add_exact(
2295                                         seccomp,
2296                                         SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
2297                                         id,
2298                                         0);
2299                         if (r < 0)
2300                                 log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c);
2301                 }
2302 
2303                 (void) block_open_flag(seccomp, O_SYNC);
2304 #if O_DSYNC != O_SYNC
2305                 (void) block_open_flag(seccomp, O_DSYNC);
2306 #endif
2307 
2308                 r = seccomp_load(seccomp);
2309                 if (ERRNO_IS_SECCOMP_FATAL(r))
2310                         return r;
2311                 if (r < 0)
2312                         log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2313         }
2314 
2315         return 0;
2316 }
2317