1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2 
3 #include <errno.h>
4 #include <limits.h>
5 #include <signal.h>
6 #include <stddef.h>
7 #include <stdlib.h>
8 #include <sys/types.h>
9 #include <sys/utsname.h>
10 #include <sys/xattr.h>
11 #include <unistd.h>
12 
13 #include "alloc-util.h"
14 #include "cgroup-util.h"
15 #include "def.h"
16 #include "dirent-util.h"
17 #include "extract-word.h"
18 #include "fd-util.h"
19 #include "fileio.h"
20 #include "format-util.h"
21 #include "fs-util.h"
22 #include "log.h"
23 #include "login-util.h"
24 #include "macro.h"
25 #include "missing_magic.h"
26 #include "mkdir.h"
27 #include "parse-util.h"
28 #include "path-util.h"
29 #include "process-util.h"
30 #include "set.h"
31 #include "special.h"
32 #include "stat-util.h"
33 #include "stdio-util.h"
34 #include "string-table.h"
35 #include "string-util.h"
36 #include "strv.h"
37 #include "unit-name.h"
38 #include "user-util.h"
39 #include "xattr-util.h"
40 
cg_enumerate_items(const char * controller,const char * path,FILE ** _f,const char * item)41 static int cg_enumerate_items(const char *controller, const char *path, FILE **_f, const char *item) {
42         _cleanup_free_ char *fs = NULL;
43         FILE *f;
44         int r;
45 
46         assert(_f);
47 
48         r = cg_get_path(controller, path, item, &fs);
49         if (r < 0)
50                 return r;
51 
52         f = fopen(fs, "re");
53         if (!f)
54                 return -errno;
55 
56         *_f = f;
57         return 0;
58 }
59 
/* Opens the "cgroup.procs" attribute of the specified cgroup for reading. Return value and ownership
 * of *_f are those of cg_enumerate_items(). */
int cg_enumerate_processes(const char *controller, const char *path, FILE **_f) {
        int r;

        r = cg_enumerate_items(controller, path, _f, "cgroup.procs");
        return r;
}
63 
/* Reads the next PID from a stream, for example one opened with cg_enumerate_processes().
 *
 * Returns 1 and stores the PID in *_pid if one was read, 0 on end of file (*_pid is left untouched),
 * and a negative errno-style error if the input cannot be parsed or the number is not a valid PID.
 *
 * Note that the cgroup.procs might contain duplicates! See cgroups.txt for details. */
int cg_read_pid(FILE *f, pid_t *_pid) {
        unsigned long ul;

        assert(f);
        assert(_pid);

        /* Reset errno first, so that the errno_or_else() fallback below does not pick up a stale value
         * in case fscanf() fails without setting it. */
        errno = 0;
        if (fscanf(f, "%lu", &ul) != 1) {

                if (feof(f))
                        return 0;

                return errno_or_else(EIO);
        }

        /* Refuse zero, and refuse anything that does not fit into pid_t (which is int on Linux): an
         * out-of-range value would be mangled by the cast below and could come out as zero or
         * negative, which kill() interprets as a process group rather than a single process. */
        if (ul == 0 || ul > (unsigned long) INT_MAX)
                return -EIO;

        *_pid = (pid_t) ul;
        return 1;
}
88 
cg_read_event(const char * controller,const char * path,const char * event,char ** ret)89 int cg_read_event(
90                 const char *controller,
91                 const char *path,
92                 const char *event,
93                 char **ret) {
94 
95         _cleanup_free_ char *events = NULL, *content = NULL;
96         int r;
97 
98         r = cg_get_path(controller, path, "cgroup.events", &events);
99         if (r < 0)
100                 return r;
101 
102         r = read_full_virtual_file(events, &content, NULL);
103         if (r < 0)
104                 return r;
105 
106         for (const char *p = content;;) {
107                 _cleanup_free_ char *line = NULL, *key = NULL, *val = NULL;
108                 const char *q;
109 
110                 r = extract_first_word(&p, &line, "\n", 0);
111                 if (r < 0)
112                         return r;
113                 if (r == 0)
114                         return -ENOENT;
115 
116                 q = line;
117                 r = extract_first_word(&q, &key, " ", 0);
118                 if (r < 0)
119                         return r;
120                 if (r == 0)
121                         return -EINVAL;
122 
123                 if (!streq(key, event))
124                         continue;
125 
126                 val = strdup(q);
127                 if (!val)
128                         return -ENOMEM;
129 
130                 *ret = TAKE_PTR(val);
131                 return 0;
132         }
133 }
134 
cg_ns_supported(void)135 bool cg_ns_supported(void) {
136         static thread_local int enabled = -1;
137 
138         if (enabled >= 0)
139                 return enabled;
140 
141         if (access("/proc/self/ns/cgroup", F_OK) < 0) {
142                 if (errno != ENOENT)
143                         log_debug_errno(errno, "Failed to check whether /proc/self/ns/cgroup is available, assuming not: %m");
144                 enabled = false;
145         } else
146                 enabled = true;
147 
148         return enabled;
149 }
150 
cg_freezer_supported(void)151 bool cg_freezer_supported(void) {
152         static thread_local int supported = -1;
153 
154         if (supported >= 0)
155                 return supported;
156 
157         supported = cg_all_unified() > 0 && access("/sys/fs/cgroup/init.scope/cgroup.freeze", F_OK) == 0;
158 
159         return supported;
160 }
161 
cg_kill_supported(void)162 bool cg_kill_supported(void) {
163         static thread_local int supported = -1;
164 
165         if (supported >= 0)
166                 return supported;
167 
168         if (cg_all_unified() <= 0)
169                 supported = false;
170         else if (access("/sys/fs/cgroup/init.scope/cgroup.kill", F_OK) < 0) {
171                 if (errno != ENOENT)
172                         log_debug_errno(errno, "Failed to check if cgroup.kill is available, assuming not: %m");
173                 supported = false;
174         } else
175                 supported = true;
176 
177         return supported;
178 }
179 
cg_enumerate_subgroups(const char * controller,const char * path,DIR ** _d)180 int cg_enumerate_subgroups(const char *controller, const char *path, DIR **_d) {
181         _cleanup_free_ char *fs = NULL;
182         int r;
183         DIR *d;
184 
185         assert(_d);
186 
187         /* This is not recursive! */
188 
189         r = cg_get_path(controller, path, NULL, &fs);
190         if (r < 0)
191                 return r;
192 
193         d = opendir(fs);
194         if (!d)
195                 return -errno;
196 
197         *_d = d;
198         return 0;
199 }
200 
cg_read_subgroup(DIR * d,char ** fn)201 int cg_read_subgroup(DIR *d, char **fn) {
202         assert(d);
203         assert(fn);
204 
205         FOREACH_DIRENT_ALL(de, d, return -errno) {
206                 char *b;
207 
208                 if (de->d_type != DT_DIR)
209                         continue;
210 
211                 if (dot_or_dot_dot(de->d_name))
212                         continue;
213 
214                 b = strdup(de->d_name);
215                 if (!b)
216                         return -ENOMEM;
217 
218                 *fn = b;
219                 return 1;
220         }
221 
222         return 0;
223 }
224 
cg_rmdir(const char * controller,const char * path)225 int cg_rmdir(const char *controller, const char *path) {
226         _cleanup_free_ char *p = NULL;
227         int r;
228 
229         r = cg_get_path(controller, path, NULL, &p);
230         if (r < 0)
231                 return r;
232 
233         r = rmdir(p);
234         if (r < 0 && errno != ENOENT)
235                 return -errno;
236 
237         r = cg_hybrid_unified();
238         if (r <= 0)
239                 return r;
240 
241         if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
242                 r = cg_rmdir(SYSTEMD_CGROUP_CONTROLLER_LEGACY, path);
243                 if (r < 0)
244                         log_warning_errno(r, "Failed to remove compat systemd cgroup %s: %m", path);
245         }
246 
247         return 0;
248 }
249 
/* Sends signal 'sig' to every PID listed in the attribute file 'item' of the specified cgroup.
 *
 * 's' is an optional set of PIDs that were already processed: PIDs contained in it are skipped, and
 * every PID handled here is added to it. If NULL, a temporary set is allocated for the duration of
 * the call.
 *
 * 'log_kill', if non-NULL, is invoked with (pid, sig, userdata) right before each kill().
 *
 * Flags evaluated here: CGROUP_IGNORE_SELF (skip our own PID) and CGROUP_SIGCONT (send SIGCONT after
 * each successful kill(), unless 'sig' itself is SIGCONT or SIGKILL).
 *
 * Returns a negative errno-style error on failure. Otherwise returns 0 if no process was signalled
 * successfully; after the first successful kill() the result becomes 1, or, if 'log_kill' is set,
 * whatever 'log_kill' returned for that PID. kill() failing with ESRCH and the item file being absent
 * (ENOENT) are not treated as errors. */
static int cg_kill_items(
                const char *controller,
                const char *path,
                int sig,
                CGroupFlags flags,
                Set *s,
                cg_kill_log_func_t log_kill,
                void *userdata,
                const char *item) {

        _cleanup_set_free_ Set *allocated_set = NULL;
        bool done = false;
        int r, ret = 0, ret_log_kill = 0;
        pid_t my_pid;

        assert(sig >= 0);

        /* Don't send SIGCONT twice. Also, SIGKILL always works even when process is suspended, hence don't send
         * SIGCONT on SIGKILL. */
        if (IN_SET(sig, SIGCONT, SIGKILL))
                flags &= ~CGROUP_SIGCONT;

        /* This goes through the tasks list and kills them all. This
         * is repeated until no further processes are added to the
         * tasks list, to properly handle forking processes */

        if (!s) {
                s = allocated_set = set_new(NULL);
                if (!s)
                        return -ENOMEM;
        }

        my_pid = getpid_cached();

        do {
                _cleanup_fclose_ FILE *f = NULL;
                pid_t pid = 0;
                done = true;

                /* Re-open the file on every pass, so that each pass reads the list from the start */
                r = cg_enumerate_items(controller, path, &f, item);
                if (r < 0) {
                        /* An earlier error recorded in 'ret' takes precedence; ENOENT is not reported */
                        if (ret >= 0 && r != -ENOENT)
                                return r;

                        return ret;
                }

                while ((r = cg_read_pid(f, &pid)) > 0) {

                        if ((flags & CGROUP_IGNORE_SELF) && pid == my_pid)
                                continue;

                        /* Skip PIDs that are already in the set, i.e. that were processed before */
                        if (set_get(s, PID_TO_PTR(pid)) == PID_TO_PTR(pid))
                                continue;

                        if (log_kill)
                                ret_log_kill = log_kill(pid, sig, userdata);

                        /* If we haven't killed this process yet, kill
                         * it */
                        if (kill(pid, sig) < 0) {
                                /* Remember only the first failure, and ignore ESRCH */
                                if (ret >= 0 && errno != ESRCH)
                                        ret = -errno;
                        } else {
                                if (flags & CGROUP_SIGCONT)
                                        (void) kill(pid, SIGCONT);

                                /* Only update the result while it is still 0, so that an earlier
                                 * error or positive result is preserved */
                                if (ret == 0) {
                                        if (log_kill)
                                                ret = ret_log_kill;
                                        else
                                                ret = 1;
                                }
                        }

                        /* We saw a PID we had not processed before, hence do another pass */
                        done = false;

                        r = set_put(s, PID_TO_PTR(pid));
                        if (r < 0) {
                                if (ret >= 0)
                                        return r;

                                return ret;
                        }
                }

                /* cg_read_pid() failed (as opposed to hitting end of file) */
                if (r < 0) {
                        if (ret >= 0)
                                return r;

                        return ret;
                }

                /* To avoid racing against processes which fork
                 * quicker than we can kill them we repeat this until
                 * no new pids need to be killed. */

        } while (!done);

        return ret;
}
351 
/* Sends signal 'sig' to all processes listed in "cgroup.procs" of the specified cgroup (not
 * recursive). Parameters and return value are those of cg_kill_items().
 *
 * For SIGKILL on a controller for which cg_unified_controller() is positive, the PIDs listed in
 * "cgroup.threads" are signalled in a second pass, and the result of that pass is returned. */
int cg_kill(
                const char *controller,
                const char *path,
                int sig,
                CGroupFlags flags,
                Set *s,
                cg_kill_log_func_t log_kill,
                void *userdata) {

        int r;

        r = cg_kill_items(controller, path, sig, flags, s, log_kill, userdata, "cgroup.procs");
        if (r < 0)
                return r;
        if (sig != SIGKILL)
                return r;

        /* Only in case of killing with SIGKILL and when using cgroupsv2, kill remaining threads manually as
           a workaround for kernel bug. It was fixed in 5.2-rc5 (c03cd7738a83), backported to 4.19.66
           (4340d175b898) and 4.14.138 (feb6b123b7dd). */
        r = cg_unified_controller(controller);
        if (r <= 0)
                return r;

        r = cg_kill_items(controller, path, sig, flags, s, log_kill, userdata, "cgroup.threads");
        return r;
}
375 
cg_kill_kernel_sigkill(const char * controller,const char * path)376 int cg_kill_kernel_sigkill(const char *controller, const char *path) {
377         /* Kills the cgroup at `path` directly by writing to its cgroup.kill file.
378          * This sends SIGKILL to all processes in the cgroup and has the advantage of
379          * being completely atomic, unlike cg_kill_items. */
380         int r;
381         _cleanup_free_ char *killfile = NULL;
382 
383         assert(path);
384 
385         if (!cg_kill_supported())
386                 return -EOPNOTSUPP;
387 
388         r = cg_get_path(controller, path, "cgroup.kill", &killfile);
389         if (r < 0)
390                 return r;
391 
392         r = write_string_file(killfile, "1", WRITE_STRING_FILE_DISABLE_BUFFER);
393         if (r < 0)
394                 return r;
395 
396         return 0;
397 }
398 
/* Sends signal 'sig' to the processes of the cgroup 'path' and of all cgroups below it.
 *
 * Fast path: for SIGKILL, if cg_kill_supported() holds, CGROUP_IGNORE_SELF is not set, and neither a
 * PID set nor a log callback was passed, the work is handed to cg_kill_kernel_sigkill() for 'path';
 * this function does not walk the subgroups itself in that case.
 *
 * Slow path: cg_kill() is invoked on the cgroup itself, then this function recurses into each
 * subgroup. The PID set (the caller's, or one allocated here) is shared across the whole recursion.
 *
 * If CGROUP_REMOVE is set, cg_rmdir() is attempted on 'path' afterwards; -ENOENT and -EBUSY from it
 * are ignored.
 *
 * Returns a negative errno-style error on failure, otherwise the first non-zero result collected from
 * cg_kill() and the recursive calls, or 0. On the fast path 0 is returned on success. */
int cg_kill_recursive(
                const char *controller,
                const char *path,
                int sig,
                CGroupFlags flags,
                Set *s,
                cg_kill_log_func_t log_kill,
                void *userdata) {

        _cleanup_set_free_ Set *allocated_set = NULL;
        _cleanup_closedir_ DIR *d = NULL;
        int r, ret;
        char *fn;

        assert(path);
        assert(sig >= 0);

        if (sig == SIGKILL && cg_kill_supported() &&
            !FLAGS_SET(flags, CGROUP_IGNORE_SELF) && !s && !log_kill) {
                /* ignore CGROUP_SIGCONT, since this is a no-op alongside SIGKILL */
                ret = cg_kill_kernel_sigkill(controller, path);
                if (ret < 0)
                        return ret;
        } else {
                if (!s) {
                        s = allocated_set = set_new(NULL);
                        if (!s)
                                return -ENOMEM;
                }

                /* Kill the processes of this cgroup first. An error is kept in 'ret' rather than
                 * returned right away, so that the subgroups are still processed. */
                ret = cg_kill(controller, path, sig, flags, s, log_kill, userdata);

                r = cg_enumerate_subgroups(controller, path, &d);
                if (r < 0) {
                        /* An earlier error recorded in 'ret' takes precedence; ENOENT is not reported */
                        if (ret >= 0 && r != -ENOENT)
                                return r;

                        return ret;
                }

                while ((r = cg_read_subgroup(d, &fn)) > 0) {
                        _cleanup_free_ char *p = NULL;

                        p = path_join(empty_to_root(path), fn);
                        free(fn);
                        if (!p)
                                return -ENOMEM;

                        /* Take over a non-zero result from the subgroup, unless we already have an error */
                        r = cg_kill_recursive(controller, p, sig, flags, s, log_kill, userdata);
                        if (r != 0 && ret >= 0)
                                ret = r;
                }
                /* Here r is either the failure of cg_read_subgroup() that ended the loop, or 0 */
                if (ret >= 0 && r < 0)
                        ret = r;
        }

        if (FLAGS_SET(flags, CGROUP_REMOVE)) {
                r = cg_rmdir(controller, path);
                if (r < 0 && ret >= 0 && !IN_SET(r, -ENOENT, -EBUSY))
                        return r;
        }

        return ret;
}
463 
controller_to_dirname(const char * controller)464 static const char *controller_to_dirname(const char *controller) {
465         assert(controller);
466 
467         /* Converts a controller name to the directory name below /sys/fs/cgroup/ we want to mount it
468          * to. Effectively, this just cuts off the name= prefixed used for named hierarchies, if it is
469          * specified. */
470 
471         if (streq(controller, SYSTEMD_CGROUP_CONTROLLER)) {
472                 if (cg_hybrid_unified() > 0)
473                         controller = SYSTEMD_CGROUP_CONTROLLER_HYBRID;
474                 else
475                         controller = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
476         }
477 
478         return startswith(controller, "name=") ?: controller;
479 }
480 
/* Builds "/sys/fs/cgroup/<controller dir>[/<path>][/<suffix>]", leaving out 'path' and 'suffix' when
 * they are empty. The controller directory name is determined by controller_to_dirname().
 *
 * Returns 0 and stores the newly allocated string in *ret, or -ENOMEM. */
static int join_path_legacy(const char *controller, const char *path, const char *suffix, char **ret) {
        bool have_path, have_suffix;
        const char *dn;
        char *joined;

        assert(ret);
        assert(controller);

        dn = controller_to_dirname(controller);

        have_path = !isempty(path);
        have_suffix = !isempty(suffix);

        if (have_path && have_suffix)
                joined = path_join("/sys/fs/cgroup", dn, path, suffix);
        else if (have_path)
                joined = path_join("/sys/fs/cgroup", dn, path);
        else if (have_suffix)
                joined = path_join("/sys/fs/cgroup", dn, suffix);
        else
                joined = path_join("/sys/fs/cgroup", dn);
        if (!joined)
                return -ENOMEM;

        *ret = joined;
        return 0;
}
504 
/* Builds "/sys/fs/cgroup[/<path>][/<suffix>]", leaving out 'path' and 'suffix' when they are empty.
 *
 * Returns 0 and stores the newly allocated string in *ret, or -ENOMEM. */
static int join_path_unified(const char *path, const char *suffix, char **ret) {
        bool have_path, have_suffix;
        char *joined;

        assert(ret);

        have_path = !isempty(path);
        have_suffix = !isempty(suffix);

        if (have_path && have_suffix)
                joined = path_join("/sys/fs/cgroup", path, suffix);
        else if (have_path)
                joined = path_join("/sys/fs/cgroup", path);
        else if (have_suffix)
                joined = path_join("/sys/fs/cgroup", suffix);
        else
                joined = strdup("/sys/fs/cgroup");
        if (!joined)
                return -ENOMEM;

        *ret = joined;
        return 0;
}
524 
/* Determines the file system path for a cgroup 'path' in the hierarchy of 'controller', optionally
 * with an attribute name 'suffix' appended.
 *
 * If 'controller' is NULL only 'path' and 'suffix' are joined, without any prefix; at least one of
 * the two has to be non-empty then. Otherwise the controller name is validated and the path is placed
 * below /sys/fs/cgroup, in the unified or legacy layout depending on cg_all_unified().
 *
 * Returns 0 and stores the newly allocated, simplified path in *ret (the caller has to free it), or a
 * negative errno-style error. */
int cg_get_path(const char *controller, const char *path, const char *suffix, char **ret) {
        int r;

        assert(ret);

        if (!controller) {
                char *joined;

                /* If no controller is specified, we return the path *below* the controllers, without any
                 * prefix. */

                if (isempty(suffix)) {
                        if (isempty(path))
                                return -EINVAL;

                        joined = strdup(path);
                } else if (isempty(path))
                        joined = strdup(suffix);
                else
                        joined = path_join(path, suffix);
                if (!joined)
                        return -ENOMEM;

                *ret = path_simplify(joined);
                return 0;
        }

        if (!cg_controller_is_valid(controller))
                return -EINVAL;

        r = cg_all_unified();
        if (r < 0)
                return r;

        r = r > 0 ?
                join_path_unified(path, suffix, ret) :
                join_path_legacy(controller, path, suffix, ret);
        if (r < 0)
                return r;

        path_simplify(*ret);
        return 0;
}
568 
/* Checks via laccess() whether the hierarchy of the specified controller below /sys/fs/cgroup/ can be
 * used. Returns whatever laccess() reports for the path that is checked. */
static int controller_is_v1_accessible(const char *root, const char *controller) {
        const char *dn, *cpath;
        int mode;

        assert(controller);

        dn = controller_to_dirname(controller);

        /* If root is specified, we check that:
         * - possible subcgroup is created at root,
         * - we can modify the hierarchy.
         * Without root only the existence of the controller's directory is tested. */
        mode = root ? W_OK : F_OK;

        cpath = strjoina("/sys/fs/cgroup/", dn, root, root ? "/cgroup.procs" : NULL);
        return laccess(cpath, mode);
}
583 
cg_get_path_and_check(const char * controller,const char * path,const char * suffix,char ** fs)584 int cg_get_path_and_check(const char *controller, const char *path, const char *suffix, char **fs) {
585         int r;
586 
587         assert(controller);
588         assert(fs);
589 
590         if (!cg_controller_is_valid(controller))
591                 return -EINVAL;
592 
593         r = cg_all_unified();
594         if (r < 0)
595                 return r;
596         if (r > 0) {
597                 /* In the unified hierarchy all controllers are considered accessible,
598                  * except for the named hierarchies */
599                 if (startswith(controller, "name="))
600                         return -EOPNOTSUPP;
601         } else {
602                 /* Check if the specified controller is actually accessible */
603                 r = controller_is_v1_accessible(NULL, controller);
604                 if (r < 0)
605                         return r;
606         }
607 
608         return cg_get_path(controller, path, suffix, fs);
609 }
610 
cg_set_xattr(const char * controller,const char * path,const char * name,const void * value,size_t size,int flags)611 int cg_set_xattr(const char *controller, const char *path, const char *name, const void *value, size_t size, int flags) {
612         _cleanup_free_ char *fs = NULL;
613         int r;
614 
615         assert(path);
616         assert(name);
617         assert(value || size <= 0);
618 
619         r = cg_get_path(controller, path, NULL, &fs);
620         if (r < 0)
621                 return r;
622 
623         return RET_NERRNO(setxattr(fs, name, value, size, flags));
624 }
625 
cg_get_xattr(const char * controller,const char * path,const char * name,void * value,size_t size)626 int cg_get_xattr(const char *controller, const char *path, const char *name, void *value, size_t size) {
627         _cleanup_free_ char *fs = NULL;
628         ssize_t n;
629         int r;
630 
631         assert(path);
632         assert(name);
633 
634         r = cg_get_path(controller, path, NULL, &fs);
635         if (r < 0)
636                 return r;
637 
638         n = getxattr(fs, name, value, size);
639         if (n < 0)
640                 return -errno;
641 
642         return (int) n;
643 }
644 
cg_get_xattr_malloc(const char * controller,const char * path,const char * name,char ** ret)645 int cg_get_xattr_malloc(const char *controller, const char *path, const char *name, char **ret) {
646         _cleanup_free_ char *fs = NULL;
647         int r;
648 
649         assert(path);
650         assert(name);
651 
652         r = cg_get_path(controller, path, NULL, &fs);
653         if (r < 0)
654                 return r;
655 
656         r = lgetxattr_malloc(fs, name, ret);
657         if (r < 0)
658                 return r;
659 
660         return r;
661 }
662 
cg_get_xattr_bool(const char * controller,const char * path,const char * name)663 int cg_get_xattr_bool(const char *controller, const char *path, const char *name) {
664         _cleanup_free_ char *val = NULL;
665         int r;
666 
667         assert(path);
668         assert(name);
669 
670         r = cg_get_xattr_malloc(controller, path, name, &val);
671         if (r < 0)
672                 return r;
673 
674         return parse_boolean(val);
675 }
676 
cg_remove_xattr(const char * controller,const char * path,const char * name)677 int cg_remove_xattr(const char *controller, const char *path, const char *name) {
678         _cleanup_free_ char *fs = NULL;
679         int r;
680 
681         assert(path);
682         assert(name);
683 
684         r = cg_get_path(controller, path, NULL, &fs);
685         if (r < 0)
686                 return r;
687 
688         return RET_NERRNO(removexattr(fs, name));
689 }
690 
cg_pid_get_path(const char * controller,pid_t pid,char ** ret_path)691 int cg_pid_get_path(const char *controller, pid_t pid, char **ret_path) {
692         _cleanup_fclose_ FILE *f = NULL;
693         const char *fs, *controller_str = NULL;  /* avoid false maybe-uninitialized warning */
694         int unified, r;
695 
696         assert(pid >= 0);
697         assert(ret_path);
698 
699         if (controller) {
700                 if (!cg_controller_is_valid(controller))
701                         return -EINVAL;
702         } else
703                 controller = SYSTEMD_CGROUP_CONTROLLER;
704 
705         unified = cg_unified_controller(controller);
706         if (unified < 0)
707                 return unified;
708         if (unified == 0) {
709                 if (streq(controller, SYSTEMD_CGROUP_CONTROLLER))
710                         controller_str = SYSTEMD_CGROUP_CONTROLLER_LEGACY;
711                 else
712                         controller_str = controller;
713         }
714 
715         fs = procfs_file_alloca(pid, "cgroup");
716         r = fopen_unlocked(fs, "re", &f);
717         if (r == -ENOENT)
718                 return -ESRCH;
719         if (r < 0)
720                 return r;
721 
722         for (;;) {
723                 _cleanup_free_ char *line = NULL;
724                 char *e;
725 
726                 r = read_line(f, LONG_LINE_MAX, &line);
727                 if (r < 0)
728                         return r;
729                 if (r == 0)
730                         return -ENODATA;
731 
732                 if (unified) {
733                         e = startswith(line, "0:");
734                         if (!e)
735                                 continue;
736 
737                         e = strchr(e, ':');
738                         if (!e)
739                                 continue;
740                 } else {
741                         char *l;
742 
743                         l = strchr(line, ':');
744                         if (!l)
745                                 continue;
746 
747                         l++;
748                         e = strchr(l, ':');
749                         if (!e)
750                                 continue;
751                         *e = 0;
752 
753                         assert(controller_str);
754                         r = string_contains_word(l, ",", controller_str);
755                         if (r < 0)
756                                 return r;
757                         if (r == 0)
758                                 continue;
759                 }
760 
761                 char *path = strdup(e + 1);
762                 if (!path)
763                         return -ENOMEM;
764 
765                 /* Truncate suffix indicating the process is a zombie */
766                 e = endswith(path, " (deleted)");
767                 if (e)
768                         *e = 0;
769 
770                 *ret_path = path;
771                 return 0;
772         }
773 }
774 
cg_install_release_agent(const char * controller,const char * agent)775 int cg_install_release_agent(const char *controller, const char *agent) {
776         _cleanup_free_ char *fs = NULL, *contents = NULL;
777         const char *sc;
778         int r;
779 
780         assert(agent);
781 
782         r = cg_unified_controller(controller);
783         if (r < 0)
784                 return r;
785         if (r > 0) /* doesn't apply to unified hierarchy */
786                 return -EOPNOTSUPP;
787 
788         r = cg_get_path(controller, NULL, "release_agent", &fs);
789         if (r < 0)
790                 return r;
791 
792         r = read_one_line_file(fs, &contents);
793         if (r < 0)
794                 return r;
795 
796         sc = strstrip(contents);
797         if (isempty(sc)) {
798                 r = write_string_file(fs, agent, WRITE_STRING_FILE_DISABLE_BUFFER);
799                 if (r < 0)
800                         return r;
801         } else if (!path_equal(sc, agent))
802                 return -EEXIST;
803 
804         fs = mfree(fs);
805         r = cg_get_path(controller, NULL, "notify_on_release", &fs);
806         if (r < 0)
807                 return r;
808 
809         contents = mfree(contents);
810         r = read_one_line_file(fs, &contents);
811         if (r < 0)
812                 return r;
813 
814         sc = strstrip(contents);
815         if (streq(sc, "0")) {
816                 r = write_string_file(fs, "1", WRITE_STRING_FILE_DISABLE_BUFFER);
817                 if (r < 0)
818                         return r;
819 
820                 return 1;
821         }
822 
823         if (!streq(sc, "1"))
824                 return -EIO;
825 
826         return 0;
827 }
828 
cg_uninstall_release_agent(const char * controller)829 int cg_uninstall_release_agent(const char *controller) {
830         _cleanup_free_ char *fs = NULL;
831         int r;
832 
833         r = cg_unified_controller(controller);
834         if (r < 0)
835                 return r;
836         if (r > 0) /* Doesn't apply to unified hierarchy */
837                 return -EOPNOTSUPP;
838 
839         r = cg_get_path(controller, NULL, "notify_on_release", &fs);
840         if (r < 0)
841                 return r;
842 
843         r = write_string_file(fs, "0", WRITE_STRING_FILE_DISABLE_BUFFER);
844         if (r < 0)
845                 return r;
846 
847         fs = mfree(fs);
848 
849         r = cg_get_path(controller, NULL, "release_agent", &fs);
850         if (r < 0)
851                 return r;
852 
853         r = write_string_file(fs, "", WRITE_STRING_FILE_DISABLE_BUFFER);
854         if (r < 0)
855                 return r;
856 
857         return 0;
858 }
859 
cg_is_empty(const char * controller,const char * path)860 int cg_is_empty(const char *controller, const char *path) {
861         _cleanup_fclose_ FILE *f = NULL;
862         pid_t pid;
863         int r;
864 
865         assert(path);
866 
867         r = cg_enumerate_processes(controller, path, &f);
868         if (r == -ENOENT)
869                 return true;
870         if (r < 0)
871                 return r;
872 
873         r = cg_read_pid(f, &pid);
874         if (r < 0)
875                 return r;
876 
877         return r == 0;
878 }
879 
cg_is_empty_recursive(const char * controller,const char * path)880 int cg_is_empty_recursive(const char *controller, const char *path) {
881         int r;
882 
883         assert(path);
884 
885         /* The root cgroup is always populated */
886         if (controller && empty_or_root(path))
887                 return false;
888 
889         r = cg_unified_controller(controller);
890         if (r < 0)
891                 return r;
892         if (r > 0) {
893                 _cleanup_free_ char *t = NULL;
894 
895                 /* On the unified hierarchy we can check empty state
896                  * via the "populated" attribute of "cgroup.events". */
897 
898                 r = cg_read_event(controller, path, "populated", &t);
899                 if (r == -ENOENT)
900                         return true;
901                 if (r < 0)
902                         return r;
903 
904                 return streq(t, "0");
905         } else {
906                 _cleanup_closedir_ DIR *d = NULL;
907                 char *fn;
908 
909                 r = cg_is_empty(controller, path);
910                 if (r <= 0)
911                         return r;
912 
913                 r = cg_enumerate_subgroups(controller, path, &d);
914                 if (r == -ENOENT)
915                         return true;
916                 if (r < 0)
917                         return r;
918 
919                 while ((r = cg_read_subgroup(d, &fn)) > 0) {
920                         _cleanup_free_ char *p = NULL;
921 
922                         p = path_join(path, fn);
923                         free(fn);
924                         if (!p)
925                                 return -ENOMEM;
926 
927                         r = cg_is_empty_recursive(controller, p);
928                         if (r <= 0)
929                                 return r;
930                 }
931                 if (r < 0)
932                         return r;
933 
934                 return true;
935         }
936 }
937 
cg_split_spec(const char * spec,char ** ret_controller,char ** ret_path)938 int cg_split_spec(const char *spec, char **ret_controller, char **ret_path) {
939         _cleanup_free_ char *controller = NULL, *path = NULL;
940 
941         assert(spec);
942 
943         if (*spec == '/') {
944                 if (!path_is_normalized(spec))
945                         return -EINVAL;
946 
947                 if (ret_path) {
948                         path = strdup(spec);
949                         if (!path)
950                                 return -ENOMEM;
951 
952                         path_simplify(path);
953                 }
954 
955         } else {
956                 const char *e;
957 
958                 e = strchr(spec, ':');
959                 if (e) {
960                         controller = strndup(spec, e-spec);
961                         if (!controller)
962                                 return -ENOMEM;
963                         if (!cg_controller_is_valid(controller))
964                                 return -EINVAL;
965 
966                         if (!isempty(e + 1)) {
967                                 path = strdup(e+1);
968                                 if (!path)
969                                         return -ENOMEM;
970 
971                                 if (!path_is_normalized(path) ||
972                                     !path_is_absolute(path))
973                                         return -EINVAL;
974 
975                                 path_simplify(path);
976                         }
977 
978                 } else {
979                         if (!cg_controller_is_valid(spec))
980                                 return -EINVAL;
981 
982                         if (ret_controller) {
983                                 controller = strdup(spec);
984                                 if (!controller)
985                                         return -ENOMEM;
986                         }
987                 }
988         }
989 
990         if (ret_controller)
991                 *ret_controller = TAKE_PTR(controller);
992         if (ret_path)
993                 *ret_path = TAKE_PTR(path);
994         return 0;
995 }
996 
cg_mangle_path(const char * path,char ** result)997 int cg_mangle_path(const char *path, char **result) {
998         _cleanup_free_ char *c = NULL, *p = NULL;
999         char *t;
1000         int r;
1001 
1002         assert(path);
1003         assert(result);
1004 
1005         /* First, check if it already is a filesystem path */
1006         if (path_startswith(path, "/sys/fs/cgroup")) {
1007 
1008                 t = strdup(path);
1009                 if (!t)
1010                         return -ENOMEM;
1011 
1012                 *result = path_simplify(t);
1013                 return 0;
1014         }
1015 
1016         /* Otherwise, treat it as cg spec */
1017         r = cg_split_spec(path, &c, &p);
1018         if (r < 0)
1019                 return r;
1020 
1021         return cg_get_path(c ?: SYSTEMD_CGROUP_CONTROLLER, p ?: "/", NULL, result);
1022 }
1023 
cg_get_root_path(char ** path)1024 int cg_get_root_path(char **path) {
1025         char *p, *e;
1026         int r;
1027 
1028         assert(path);
1029 
1030         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, 1, &p);
1031         if (r < 0)
1032                 return r;
1033 
1034         e = endswith(p, "/" SPECIAL_INIT_SCOPE);
1035         if (!e)
1036                 e = endswith(p, "/" SPECIAL_SYSTEM_SLICE); /* legacy */
1037         if (!e)
1038                 e = endswith(p, "/system"); /* even more legacy */
1039         if (e)
1040                 *e = 0;
1041 
1042         *path = p;
1043         return 0;
1044 }
1045 
cg_shift_path(const char * cgroup,const char * root,const char ** shifted)1046 int cg_shift_path(const char *cgroup, const char *root, const char **shifted) {
1047         _cleanup_free_ char *rt = NULL;
1048         char *p;
1049         int r;
1050 
1051         assert(cgroup);
1052         assert(shifted);
1053 
1054         if (!root) {
1055                 /* If the root was specified let's use that, otherwise
1056                  * let's determine it from PID 1 */
1057 
1058                 r = cg_get_root_path(&rt);
1059                 if (r < 0)
1060                         return r;
1061 
1062                 root = rt;
1063         }
1064 
1065         p = path_startswith(cgroup, root);
1066         if (p && p > cgroup)
1067                 *shifted = p - 1;
1068         else
1069                 *shifted = cgroup;
1070 
1071         return 0;
1072 }
1073 
cg_pid_get_path_shifted(pid_t pid,const char * root,char ** cgroup)1074 int cg_pid_get_path_shifted(pid_t pid, const char *root, char **cgroup) {
1075         _cleanup_free_ char *raw = NULL;
1076         const char *c;
1077         int r;
1078 
1079         assert(pid >= 0);
1080         assert(cgroup);
1081 
1082         r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &raw);
1083         if (r < 0)
1084                 return r;
1085 
1086         r = cg_shift_path(raw, root, &c);
1087         if (r < 0)
1088                 return r;
1089 
1090         if (c == raw)
1091                 *cgroup = TAKE_PTR(raw);
1092         else {
1093                 char *n;
1094 
1095                 n = strdup(c);
1096                 if (!n)
1097                         return -ENOMEM;
1098 
1099                 *cgroup = n;
1100         }
1101 
1102         return 0;
1103 }
1104 
cg_path_decode_unit(const char * cgroup,char ** unit)1105 int cg_path_decode_unit(const char *cgroup, char **unit) {
1106         char *c, *s;
1107         size_t n;
1108 
1109         assert(cgroup);
1110         assert(unit);
1111 
1112         n = strcspn(cgroup, "/");
1113         if (n < 3)
1114                 return -ENXIO;
1115 
1116         c = strndupa_safe(cgroup, n);
1117         c = cg_unescape(c);
1118 
1119         if (!unit_name_is_valid(c, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
1120                 return -ENXIO;
1121 
1122         s = strdup(c);
1123         if (!s)
1124                 return -ENOMEM;
1125 
1126         *unit = s;
1127         return 0;
1128 }
1129 
valid_slice_name(const char * p,size_t n)1130 static bool valid_slice_name(const char *p, size_t n) {
1131 
1132         if (!p)
1133                 return false;
1134 
1135         if (n < STRLEN("x.slice"))
1136                 return false;
1137 
1138         if (memcmp(p + n - 6, ".slice", 6) == 0) {
1139                 char buf[n+1], *c;
1140 
1141                 memcpy(buf, p, n);
1142                 buf[n] = 0;
1143 
1144                 c = cg_unescape(buf);
1145 
1146                 return unit_name_is_valid(c, UNIT_NAME_PLAIN);
1147         }
1148 
1149         return false;
1150 }
1151 
/* Skips over all leading slice components of a cgroup path. Returns a pointer to the first component
 * that is not a valid slice name (leading slashes already skipped), which may be the terminating NUL. */
static const char *skip_slices(const char *p) {
        size_t len;

        assert(p);

        p += strspn(p, "/");

        while (valid_slice_name(p, (len = strcspn(p, "/")))) {
                p += len;
                p += strspn(p, "/");
        }

        return p;
}
1169 
cg_path_get_unit(const char * path,char ** ret)1170 int cg_path_get_unit(const char *path, char **ret) {
1171         _cleanup_free_ char *unit = NULL;
1172         const char *e;
1173         int r;
1174 
1175         assert(path);
1176         assert(ret);
1177 
1178         e = skip_slices(path);
1179 
1180         r = cg_path_decode_unit(e, &unit);
1181         if (r < 0)
1182                 return r;
1183 
1184         /* We skipped over the slices, don't accept any now */
1185         if (endswith(unit, ".slice"))
1186                 return -ENXIO;
1187 
1188         *ret = TAKE_PTR(unit);
1189         return 0;
1190 }
1191 
cg_pid_get_unit(pid_t pid,char ** unit)1192 int cg_pid_get_unit(pid_t pid, char **unit) {
1193         _cleanup_free_ char *cgroup = NULL;
1194         int r;
1195 
1196         assert(unit);
1197 
1198         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1199         if (r < 0)
1200                 return r;
1201 
1202         return cg_path_get_unit(cgroup, unit);
1203 }
1204 
/**
 * Skip session-*.scope, but require it to be there.
 *
 * Returns a pointer to what follows the leading "session-<id>.scope" component of p (with any
 * slashes after it skipped too), or NULL if isempty(p) is true, if the first component does not
 * have that form, or if session_id_valid() rejects the ID.
 */
static const char *skip_session(const char *p) {
        size_t n;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        n = strcspn(p, "/");
        if (n < STRLEN("session-x.scope"))
                return NULL;

        /* The length check above guarantees there is room for both the 8 character prefix and the
         * 6 character suffix, plus at least one character in between. */
        if (memcmp(p, "session-", 8) == 0 && memcmp(p + n - 6, ".scope", 6) == 0) {
                char buf[n - 8 - 6 + 1];

                memcpy(buf, p + 8, n - 8 - 6);
                buf[n - 8 - 6] = 0;

                /* Note that session scopes never need unescaping,
                 * since they cannot conflict with the kernel's own
                 * names, hence we don't need to call cg_unescape()
                 * here. */

                /* This function returns a pointer, hence report failure as NULL (not as the
                 * boolean 'false'), like all other failure paths here do. */
                if (!session_id_valid(buf))
                        return NULL;

                p += n;
                p += strspn(p, "/");
                return p;
        }

        return NULL;
}
1241 
/**
 * Skip user@*.service, but require it to be there.
 *
 * Returns a pointer to what follows the leading "user@<uid>.service" component of p (with any
 * slashes after it skipped too), or NULL if isempty(p) is true, if the first component does not
 * have that form, or if parse_uid() fails on the part between prefix and suffix.
 */
static const char *skip_user_manager(const char *p) {
        size_t len;

        if (isempty(p))
                return NULL;

        p += strspn(p, "/");

        len = strcspn(p, "/");
        if (len < STRLEN("user@x.service"))
                return NULL;

        if (memcmp(p, "user@", 5) == 0 && memcmp(p + len - 8, ".service", 8) == 0) {
                size_t uid_len = len - 5 - 8;
                char uid_str[uid_len + 1];

                memcpy(uid_str, p + 5, uid_len);
                uid_str[uid_len] = 0;

                /* Note that user manager services never need unescaping,
                 * since they cannot conflict with the kernel's own
                 * names, hence we don't need to call cg_unescape()
                 * here. */

                if (parse_uid(uid_str, NULL) < 0)
                        return NULL;

                p += len;
                return p + strspn(p, "/");
        }

        return NULL;
}
1279 
/* Skips the part of a cgroup path that leads to a user's own hierarchy: any number of slices,
 * followed by either a user manager service or, failing that, a session scope. Returns NULL if
 * neither of the latter two is found. */
static const char *skip_user_prefix(const char *path) {
        const char *after_slices;

        assert(path);

        /* Skip slices, if there are any */
        after_slices = skip_slices(path);

        /* Prefer the user manager if it comes next, otherwise try the user session. */
        return skip_user_manager(after_slices) ?: skip_session(after_slices);
}
1296 
/* Extracts the user unit from a cgroup path: skips the user prefix (slices plus user manager or
 * session scope) and parses the rest like a system unit path. Returns -ENXIO if the prefix is not
 * there. */
int cg_path_get_user_unit(const char *path, char **ret) {
        const char *rest;

        assert(path);
        assert(ret);

        rest = skip_user_prefix(path);

        /* Past the prefix things look the same as for a system unit, hence reuse that parser. */
        return rest ? cg_path_get_unit(rest, ret) : -ENXIO;
}
1311 
cg_pid_get_user_unit(pid_t pid,char ** unit)1312 int cg_pid_get_user_unit(pid_t pid, char **unit) {
1313         _cleanup_free_ char *cgroup = NULL;
1314         int r;
1315 
1316         assert(unit);
1317 
1318         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1319         if (r < 0)
1320                 return r;
1321 
1322         return cg_path_get_user_unit(cgroup, unit);
1323 }
1324 
cg_path_get_machine_name(const char * path,char ** machine)1325 int cg_path_get_machine_name(const char *path, char **machine) {
1326         _cleanup_free_ char *u = NULL;
1327         const char *sl;
1328         int r;
1329 
1330         r = cg_path_get_unit(path, &u);
1331         if (r < 0)
1332                 return r;
1333 
1334         sl = strjoina("/run/systemd/machines/unit:", u);
1335         return readlink_malloc(sl, machine);
1336 }
1337 
cg_pid_get_machine_name(pid_t pid,char ** machine)1338 int cg_pid_get_machine_name(pid_t pid, char **machine) {
1339         _cleanup_free_ char *cgroup = NULL;
1340         int r;
1341 
1342         assert(machine);
1343 
1344         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1345         if (r < 0)
1346                 return r;
1347 
1348         return cg_path_get_machine_name(cgroup, machine);
1349 }
1350 
cg_path_get_cgroupid(const char * path,uint64_t * ret)1351 int cg_path_get_cgroupid(const char *path, uint64_t *ret) {
1352         cg_file_handle fh = CG_FILE_HANDLE_INIT;
1353         int mnt_id = -1;
1354 
1355         assert(path);
1356         assert(ret);
1357 
1358         /* This is cgroupfs so we know the size of the handle, thus no need to loop around like
1359          * name_to_handle_at_loop() does in mountpoint-util.c */
1360         if (name_to_handle_at(AT_FDCWD, path, &fh.file_handle, &mnt_id, 0) < 0)
1361                 return -errno;
1362 
1363         *ret = CG_FILE_HANDLE_CGROUPID(fh);
1364         return 0;
1365 }
1366 
cg_path_get_session(const char * path,char ** session)1367 int cg_path_get_session(const char *path, char **session) {
1368         _cleanup_free_ char *unit = NULL;
1369         char *start, *end;
1370         int r;
1371 
1372         assert(path);
1373 
1374         r = cg_path_get_unit(path, &unit);
1375         if (r < 0)
1376                 return r;
1377 
1378         start = startswith(unit, "session-");
1379         if (!start)
1380                 return -ENXIO;
1381         end = endswith(start, ".scope");
1382         if (!end)
1383                 return -ENXIO;
1384 
1385         *end = 0;
1386         if (!session_id_valid(start))
1387                 return -ENXIO;
1388 
1389         if (session) {
1390                 char *rr;
1391 
1392                 rr = strdup(start);
1393                 if (!rr)
1394                         return -ENOMEM;
1395 
1396                 *session = rr;
1397         }
1398 
1399         return 0;
1400 }
1401 
cg_pid_get_session(pid_t pid,char ** session)1402 int cg_pid_get_session(pid_t pid, char **session) {
1403         _cleanup_free_ char *cgroup = NULL;
1404         int r;
1405 
1406         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1407         if (r < 0)
1408                 return r;
1409 
1410         return cg_path_get_session(cgroup, session);
1411 }
1412 
cg_path_get_owner_uid(const char * path,uid_t * uid)1413 int cg_path_get_owner_uid(const char *path, uid_t *uid) {
1414         _cleanup_free_ char *slice = NULL;
1415         char *start, *end;
1416         int r;
1417 
1418         assert(path);
1419 
1420         r = cg_path_get_slice(path, &slice);
1421         if (r < 0)
1422                 return r;
1423 
1424         start = startswith(slice, "user-");
1425         if (!start)
1426                 return -ENXIO;
1427         end = endswith(start, ".slice");
1428         if (!end)
1429                 return -ENXIO;
1430 
1431         *end = 0;
1432         if (parse_uid(start, uid) < 0)
1433                 return -ENXIO;
1434 
1435         return 0;
1436 }
1437 
cg_pid_get_owner_uid(pid_t pid,uid_t * uid)1438 int cg_pid_get_owner_uid(pid_t pid, uid_t *uid) {
1439         _cleanup_free_ char *cgroup = NULL;
1440         int r;
1441 
1442         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1443         if (r < 0)
1444                 return r;
1445 
1446         return cg_path_get_owner_uid(cgroup, uid);
1447 }
1448 
cg_path_get_slice(const char * p,char ** slice)1449 int cg_path_get_slice(const char *p, char **slice) {
1450         const char *e = NULL;
1451 
1452         assert(p);
1453         assert(slice);
1454 
1455         /* Finds the right-most slice unit from the beginning, but
1456          * stops before we come to the first non-slice unit. */
1457 
1458         for (;;) {
1459                 size_t n;
1460 
1461                 p += strspn(p, "/");
1462 
1463                 n = strcspn(p, "/");
1464                 if (!valid_slice_name(p, n)) {
1465 
1466                         if (!e) {
1467                                 char *s;
1468 
1469                                 s = strdup(SPECIAL_ROOT_SLICE);
1470                                 if (!s)
1471                                         return -ENOMEM;
1472 
1473                                 *slice = s;
1474                                 return 0;
1475                         }
1476 
1477                         return cg_path_decode_unit(e, slice);
1478                 }
1479 
1480                 e = p;
1481                 p += n;
1482         }
1483 }
1484 
cg_pid_get_slice(pid_t pid,char ** slice)1485 int cg_pid_get_slice(pid_t pid, char **slice) {
1486         _cleanup_free_ char *cgroup = NULL;
1487         int r;
1488 
1489         assert(slice);
1490 
1491         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1492         if (r < 0)
1493                 return r;
1494 
1495         return cg_path_get_slice(cgroup, slice);
1496 }
1497 
/* Extracts the user slice from a cgroup path: skips the user prefix (slices plus user manager or
 * session scope) and parses the rest like a system slice path. Returns -ENXIO if the prefix is not
 * there. */
int cg_path_get_user_slice(const char *p, char **slice) {
        const char *rest;

        assert(p);
        assert(slice);

        rest = skip_user_prefix(p);

        /* Past the prefix things look the same as for a system slice, hence reuse that parser. */
        return rest ? cg_path_get_slice(rest, slice) : -ENXIO;
}
1511 
cg_pid_get_user_slice(pid_t pid,char ** slice)1512 int cg_pid_get_user_slice(pid_t pid, char **slice) {
1513         _cleanup_free_ char *cgroup = NULL;
1514         int r;
1515 
1516         assert(slice);
1517 
1518         r = cg_pid_get_path_shifted(pid, NULL, &cgroup);
1519         if (r < 0)
1520                 return r;
1521 
1522         return cg_path_get_user_slice(cgroup, slice);
1523 }
1524 
cg_escape(const char * p)1525 char *cg_escape(const char *p) {
1526         bool need_prefix = false;
1527 
1528         /* This implements very minimal escaping for names to be used
1529          * as file names in the cgroup tree: any name which might
1530          * conflict with a kernel name or is prefixed with '_' is
1531          * prefixed with a '_'. That way, when reading cgroup names it
1532          * is sufficient to remove a single prefixing underscore if
1533          * there is one. */
1534 
1535         /* The return value of this function (unlike cg_unescape())
1536          * needs free()! */
1537 
1538         if (IN_SET(p[0], 0, '_', '.') ||
1539             STR_IN_SET(p, "notify_on_release", "release_agent", "tasks") ||
1540             startswith(p, "cgroup."))
1541                 need_prefix = true;
1542         else {
1543                 const char *dot;
1544 
1545                 dot = strrchr(p, '.');
1546                 if (dot) {
1547                         CGroupController c;
1548                         size_t l = dot - p;
1549 
1550                         for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
1551                                 const char *n;
1552 
1553                                 n = cgroup_controller_to_string(c);
1554 
1555                                 if (l != strlen(n))
1556                                         continue;
1557 
1558                                 if (memcmp(p, n, l) != 0)
1559                                         continue;
1560 
1561                                 need_prefix = true;
1562                                 break;
1563                         }
1564                 }
1565         }
1566 
1567         if (need_prefix)
1568                 return strjoin("_", p);
1569 
1570         return strdup(p);
1571 }
1572 
/* Reverses cg_escape(): skips a single leading '_' if there is one. The returned pointer points
 * into the input string, hence (unlike for cg_escape()) it must not be passed to free()! */
char *cg_unescape(const char *p) {
        assert(p);

        return (char*) (p[0] == '_' ? p + 1 : p);
}
1584 
1585 #define CONTROLLER_VALID                        \
1586         DIGITS LETTERS                          \
1587         "_"
1588 
cg_controller_is_valid(const char * p)1589 bool cg_controller_is_valid(const char *p) {
1590         const char *t, *s;
1591 
1592         if (!p)
1593                 return false;
1594 
1595         if (streq(p, SYSTEMD_CGROUP_CONTROLLER))
1596                 return true;
1597 
1598         s = startswith(p, "name=");
1599         if (s)
1600                 p = s;
1601 
1602         if (IN_SET(*p, 0, '_'))
1603                 return false;
1604 
1605         for (t = p; *t; t++)
1606                 if (!strchr(CONTROLLER_VALID, *t))
1607                         return false;
1608 
1609         if (t - p > NAME_MAX)
1610                 return false;
1611 
1612         return true;
1613 }
1614 
/* Converts a slice unit name into a relative cgroup path with one path component per dash-separated
 * prefix of the name, followed by the full name. Each component is passed through cg_escape().
 * Judging by the loop below, "a-b.slice" becomes "a.slice/a-b.slice" — this presumes
 * unit_name_to_prefix() returns the name without its ".slice" suffix; confirm against its
 * definition. SPECIAL_ROOT_SLICE maps to the empty string.
 *
 * Returns 0 and a newly allocated string in *ret on success, -EINVAL if the name is not a valid
 * plain slice unit name or contains leading, trailing or double dashes, -ENOMEM on allocation
 * failure, or whatever unit_name_to_prefix() failed with. */
int cg_slice_to_path(const char *unit, char **ret) {
        _cleanup_free_ char *p = NULL, *s = NULL, *e = NULL;
        const char *dash;
        int r;

        assert(unit);
        assert(ret);

        /* The root slice maps to the empty path */
        if (streq(unit, SPECIAL_ROOT_SLICE)) {
                char *x;

                x = strdup("");
                if (!x)
                        return -ENOMEM;
                *ret = x;
                return 0;
        }

        if (!unit_name_is_valid(unit, UNIT_NAME_PLAIN))
                return -EINVAL;

        if (!endswith(unit, ".slice"))
                return -EINVAL;

        r = unit_name_to_prefix(unit, &p);
        if (r < 0)
                return r;

        dash = strchr(p, '-');

        /* Don't allow initial dashes */
        if (dash == p)
                return -EINVAL;

        /* Every dash marks the end of one parent slice's name: generate a path component for each */
        while (dash) {
                _cleanup_free_ char *escaped = NULL;
                /* Room for the prefix up to (excluding) the dash, plus ".slice" and the trailing NUL
                 * (which sizeof() of the literal accounts for). */
                char n[dash - p + sizeof(".slice")];

#if HAS_FEATURE_MEMORY_SANITIZER
                /* msan doesn't instrument stpncpy, so it thinks
                 * n is later used uninitialized:
                 * https://github.com/google/sanitizers/issues/926
                 */
                zero(n);
#endif

                /* Don't allow trailing or double dashes */
                if (IN_SET(dash[1], 0, '-'))
                        return -EINVAL;

                /* Build "<prefix up to this dash>.slice" */
                strcpy(stpncpy(n, p, dash - p), ".slice");
                if (!unit_name_is_valid(n, UNIT_NAME_PLAIN))
                        return -EINVAL;

                escaped = cg_escape(n);
                if (!escaped)
                        return -ENOMEM;

                if (!strextend(&s, escaped, "/"))
                        return -ENOMEM;

                dash = strchr(dash+1, '-');
        }

        /* Finally, append the full unit name itself as last component */
        e = cg_escape(unit);
        if (!e)
                return -ENOMEM;

        if (!strextend(&s, e))
                return -ENOMEM;

        *ret = TAKE_PTR(s);

        return 0;
}
1690 
cg_is_threaded(const char * controller,const char * path)1691 int cg_is_threaded(const char *controller, const char *path) {
1692         _cleanup_free_ char *fs = NULL, *contents = NULL;
1693         _cleanup_strv_free_ char **v = NULL;
1694         int r;
1695 
1696         r = cg_get_path(controller, path, "cgroup.type", &fs);
1697         if (r < 0)
1698                 return r;
1699 
1700         r = read_full_virtual_file(fs, &contents, NULL);
1701         if (r == -ENOENT)
1702                 return false; /* Assume no. */
1703         if (r < 0)
1704                 return r;
1705 
1706         v = strv_split(contents, NULL);
1707         if (!v)
1708                 return -ENOMEM;
1709 
1710         /* If the cgroup is in the threaded mode, it contains "threaded".
1711          * If one of the parents or siblings is in the threaded mode, it may contain "invalid". */
1712         return strv_contains(v, "threaded") || strv_contains(v, "invalid");
1713 }
1714 
cg_set_attribute(const char * controller,const char * path,const char * attribute,const char * value)1715 int cg_set_attribute(const char *controller, const char *path, const char *attribute, const char *value) {
1716         _cleanup_free_ char *p = NULL;
1717         int r;
1718 
1719         r = cg_get_path(controller, path, attribute, &p);
1720         if (r < 0)
1721                 return r;
1722 
1723         return write_string_file(p, value, WRITE_STRING_FILE_DISABLE_BUFFER);
1724 }
1725 
cg_get_attribute(const char * controller,const char * path,const char * attribute,char ** ret)1726 int cg_get_attribute(const char *controller, const char *path, const char *attribute, char **ret) {
1727         _cleanup_free_ char *p = NULL;
1728         int r;
1729 
1730         r = cg_get_path(controller, path, attribute, &p);
1731         if (r < 0)
1732                 return r;
1733 
1734         return read_one_line_file(p, ret);
1735 }
1736 
cg_get_attribute_as_uint64(const char * controller,const char * path,const char * attribute,uint64_t * ret)1737 int cg_get_attribute_as_uint64(const char *controller, const char *path, const char *attribute, uint64_t *ret) {
1738         _cleanup_free_ char *value = NULL;
1739         uint64_t v;
1740         int r;
1741 
1742         assert(ret);
1743 
1744         r = cg_get_attribute(controller, path, attribute, &value);
1745         if (r == -ENOENT)
1746                 return -ENODATA;
1747         if (r < 0)
1748                 return r;
1749 
1750         if (streq(value, "max")) {
1751                 *ret = CGROUP_LIMIT_MAX;
1752                 return 0;
1753         }
1754 
1755         r = safe_atou64(value, &v);
1756         if (r < 0)
1757                 return r;
1758 
1759         *ret = v;
1760         return 0;
1761 }
1762 
cg_get_attribute_as_bool(const char * controller,const char * path,const char * attribute,bool * ret)1763 int cg_get_attribute_as_bool(const char *controller, const char *path, const char *attribute, bool *ret) {
1764         _cleanup_free_ char *value = NULL;
1765         int r;
1766 
1767         assert(ret);
1768 
1769         r = cg_get_attribute(controller, path, attribute, &value);
1770         if (r == -ENOENT)
1771                 return -ENODATA;
1772         if (r < 0)
1773                 return r;
1774 
1775         r = parse_boolean(value);
1776         if (r < 0)
1777                 return r;
1778 
1779         *ret = r;
1780         return 0;
1781 }
1782 
cg_get_owner(const char * controller,const char * path,uid_t * ret_uid)1783 int cg_get_owner(const char *controller, const char *path, uid_t *ret_uid) {
1784         _cleanup_free_ char *f = NULL;
1785         struct stat stats;
1786         int r;
1787 
1788         assert(ret_uid);
1789 
1790         r = cg_get_path(controller, path, NULL, &f);
1791         if (r < 0)
1792                 return r;
1793 
1794         r = stat(f, &stats);
1795         if (r < 0)
1796                 return -errno;
1797 
1798         *ret_uid = stats.st_uid;
1799         return 0;
1800 }
1801 
int cg_get_keyed_attribute_full(
                const char *controller,
                const char *path,
                const char *attribute,
                char **keys,
                char **ret_values,
                CGroupKeyMode mode) {

        _cleanup_free_ char *filename = NULL, *contents = NULL;
        const char *p;
        size_t n, i, n_done = 0;
        char **v;
        int r;

        /* Reads one or more fields of a cgroup v2 keyed attribute file. The 'keys' parameter should be an strv with
         * all keys to retrieve. The 'ret_values' parameter should be passed as an array of strings with the same
         * number of entries as 'keys'. On success each entry will be set to the value of the matching key.
         *
         * If the attribute file doesn't exist at all returns ENOENT, if any key is not found returns ENXIO. If mode
         * is set to CG_KEY_MODE_GRACEFUL we ignore missing keys and return those that were parsed successfully. In
         * that case the return value is the number of keys found, and the entries of missing keys are left as
         * initialized by newa0() (presumably NULL). Without that flag 0 is returned on success. */

        r = cg_get_path(controller, path, attribute, &filename);
        if (r < 0)
                return r;

        r = read_full_file(filename, &contents, NULL);
        if (r < 0)
                return r;

        n = strv_length(keys);
        if (n == 0) /* No keys to retrieve? That's easy, we are done then */
                return 0;

        /* Let's build this up in a temporary array for now in order not to clobber the return parameter on failure */
        v = newa0(char*, n);

        /* Process the file line by line */
        for (p = contents; *p;) {
                const char *w = NULL;

                /* Check whether this line starts with one of the keys we haven't found yet */
                for (i = 0; i < n; i++)
                        if (!v[i]) {
                                w = first_word(p, keys[i]);
                                if (w)
                                        break;
                        }

                if (w) {
                        size_t l;

                        /* The value is everything after the key up to the end of the line */
                        l = strcspn(w, NEWLINE);
                        v[i] = strndup(w, l);
                        if (!v[i]) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        n_done++;
                        if (n_done >= n)
                                goto done;

                        p = w + l;
                } else
                        p += strcspn(p, NEWLINE);

                p += strspn(p, NEWLINE);
        }

        /* We reached the end of the file without finding all keys */
        if (mode & CG_KEY_MODE_GRACEFUL)
                goto done;

        r = -ENXIO;

fail:
        /* Release what was collected so far; 'ret_values' has not been touched */
        for (i = 0; i < n; i++)
                free(v[i]);

        return r;

done:
        /* Hand the collected strings over to the caller */
        memcpy(ret_values, v, sizeof(char*) * n);
        if (mode & CG_KEY_MODE_GRACEFUL)
                return n_done;

        return 0;
}
1887 
/* Formats a controller mask as a space-separated list of controller names, in the order of the
 * CGroupController enum. Returns 0 on success and -ENOMEM on allocation failure. For an empty mask
 * *ret is set to NULL; otherwise to a newly allocated string. */
int cg_mask_to_string(CGroupMask mask, char **ret) {
        _cleanup_free_ char *buf = NULL;
        bool need_sep = false;
        CGroupController c;
        size_t used = 0;

        assert(ret);

        if (mask == 0) {
                *ret = NULL;
                return 0;
        }

        for (c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                const char *name;
                size_t len;

                if (!FLAGS_SET(mask, CGROUP_CONTROLLER_TO_MASK(c)))
                        continue;

                name = cgroup_controller_to_string(c);
                len = strlen(name);

                /* Room for the separator (if any), the name itself and the trailing NUL */
                if (!GREEDY_REALLOC(buf, used + need_sep + len + 1))
                        return -ENOMEM;

                if (need_sep)
                        buf[used++] = ' ';
                memcpy(buf + used, name, len);
                used += len;

                need_sep = true;
        }

        assert(buf);

        buf[used] = 0;
        *ret = TAKE_PTR(buf);

        return 0;
}
1929 
/* Parses a list of controller names into a mask. The input is split into words by
 * extract_first_word() with its default separators and no flags; words that
 * cgroup_controller_from_string() does not recognize are skipped silently.
 *
 * Stores the resulting mask in *ret and returns 0. If splitting fails, the negative error is passed
 * through and *ret is not modified. */
int cg_mask_from_string(const char *value, CGroupMask *ret) {
        CGroupMask result = 0;
        int r;

        assert(ret);
        assert(value);

        do {
                _cleanup_free_ char *word = NULL;

                r = extract_first_word(&value, &word, NULL, 0);
                if (r < 0)
                        return r;

                if (r > 0) {
                        CGroupController c = cgroup_controller_from_string(word);

                        if (c >= 0)
                                result |= CGROUP_CONTROLLER_TO_MASK(c);
                }
        } while (r > 0); /* r == 0 means the input is exhausted */

        *ret = result;
        return 0;
}
1957 
/* Computes the mask of cgroup controllers that are supported and accessible below 'root'. Only real
 * controllers we know about are reported, i.e. the CGROUP_CONTROLLER_BPF_xyz pseudo-controllers are
 * never part of the result.
 *
 * Stores the mask in *ret and returns 0 on success, or a negative error if the hierarchy type cannot
 * be determined or (on the unified hierarchy) "cgroup.controllers" cannot be located, read or parsed. */
int cg_mask_supported_subtree(const char *root, CGroupMask *ret) {
        _cleanup_free_ char *controllers = NULL, *path = NULL;
        CGroupMask supported = 0;
        int r;

        r = cg_all_unified();
        if (r < 0)
                return r;

        if (r == 0) {
                /* Legacy hierarchy: probe every v1 controller individually and keep those that are
                 * accessible. */
                for (CGroupController c = 0; c < _CGROUP_CONTROLLER_MAX; c++) {
                        CGroupMask bit = CGROUP_CONTROLLER_TO_MASK(c);

                        if (FLAGS_SET(CGROUP_MASK_V1, bit) &&
                            controller_is_v1_accessible(root, cgroup_controller_to_string(c)) >= 0)
                                supported |= bit;
                }

                *ret = supported;
                return 0;
        }

        /* Unified hierarchy: the supported and accessible controllers are listed in the
         * "cgroup.controllers" attribute of the cgroup. */
        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, root, "cgroup.controllers", &path);
        if (r < 0)
                return r;

        r = read_one_line_file(path, &controllers);
        if (r < 0)
                return r;

        r = cg_mask_from_string(controllers, &supported);
        if (r < 0)
                return r;

        /* Drop anything that is not a controller of the unified hierarchy */
        *ret = supported & CGROUP_MASK_V2;
        return 0;
}
2012 
/* Like cg_mask_supported_subtree(), but operates on the path returned by cg_get_root_path(). Returns
 * a negative error if that path cannot be determined. */
int cg_mask_supported(CGroupMask *ret) {
        _cleanup_free_ char *root_path = NULL;
        int r = cg_get_root_path(&root_path);

        return r < 0 ? r : cg_mask_supported_subtree(root_path, ret);
}
2023 
/* Collects the names of all enabled controllers listed in /proc/cgroups. The result may contain
 * controllers we do not actually support, as well as controllers that are not currently accessible
 * (because not mounted). "name=" pseudo-controllers are not included.
 *
 * On success returns 0 and stores a set of newly allocated strings in *ret; if /proc/cgroups does not
 * exist, *ret is set to NULL. Returns -EBADMSG if a line cannot be parsed or carries an invalid
 * controller name, and a negative errno-style error for I/O or allocation failures. */
int cg_kernel_controllers(Set **ret) {
        _cleanup_set_free_ Set *found = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        int r;

        assert(ret);

        r = fopen_unlocked("/proc/cgroups", "re", &f);
        if (r < 0) {
                if (r != -ENOENT)
                        return r;

                /* No such file: report "nothing" rather than failing */
                *ret = NULL;
                return 0;
        }

        /* The first line is a header, skip it */
        (void) read_line(f, SIZE_MAX, NULL);

        for (;;) {
                _cleanup_free_ char *name = NULL;
                int enabled = 0, matched;

                /* Each record is: name, two integers we don't care about, and the enabled flag */
                errno = 0;
                matched = fscanf(f, "%ms %*i %*i %i", &name, &enabled);
                if (matched != 2) {
                        if (feof(f))
                                break;

                        return ferror(f) ? errno_or_else(EIO) : -EBADMSG;
                }

                if (enabled == 0)
                        continue;

                if (!cg_controller_is_valid(name))
                        return -EBADMSG;

                /* Ownership of the string is handed to the set */
                r = set_ensure_consume(&found, &string_hash_ops_free, TAKE_PTR(name));
                if (r < 0)
                        return r;
        }

        *ret = TAKE_PTR(found);
        return 0;
}
2077 
/* The hybrid mode was initially implemented in v232 and simply mounted cgroup2 on
 * /sys/fs/cgroup/systemd. This unfortunately broke other tools (such as docker) which expected the v1
 * "name=systemd" hierarchy on /sys/fs/cgroup/systemd. From v233 and on, the hybrid mode mounts v2 on
 * /sys/fs/cgroup/unified and maintains "name=systemd" hierarchy on /sys/fs/cgroup/systemd for compatibility
 * with other tools.
 *
 * To keep live upgrade working, we detect and support v232 layout. When v232 layout is detected, to keep
 * cgroup v2 process management but disable the compat dual layout, we return true on
 * cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER) and false on cg_hybrid_unified().
 *
 * The flag below is per-thread. cg_unified_cached() sets it to true when it finds cgroup2 mounted on
 * /sys/fs/cgroup/systemd (the v232 layout) and to false when it finds cgroup2 on
 * /sys/fs/cgroup/unified; cg_hybrid_unified() reads it.
 */
static thread_local bool unified_systemd_v232;
2089 
cg_unified_cached(bool flush)2090 int cg_unified_cached(bool flush) {
2091         static thread_local CGroupUnified unified_cache = CGROUP_UNIFIED_UNKNOWN;
2092 
2093         struct statfs fs;
2094 
2095         /* Checks if we support the unified hierarchy. Returns an
2096          * error when the cgroup hierarchies aren't mounted yet or we
2097          * have any other trouble determining if the unified hierarchy
2098          * is supported. */
2099 
2100         if (flush)
2101                 unified_cache = CGROUP_UNIFIED_UNKNOWN;
2102         else if (unified_cache >= CGROUP_UNIFIED_NONE)
2103                 return unified_cache;
2104 
2105         if (statfs("/sys/fs/cgroup/", &fs) < 0)
2106                 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/\") failed: %m");
2107 
2108         if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2109                 log_debug("Found cgroup2 on /sys/fs/cgroup/, full unified hierarchy");
2110                 unified_cache = CGROUP_UNIFIED_ALL;
2111         } else if (F_TYPE_EQUAL(fs.f_type, TMPFS_MAGIC)) {
2112                 if (statfs("/sys/fs/cgroup/unified/", &fs) == 0 &&
2113                     F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2114                         log_debug("Found cgroup2 on /sys/fs/cgroup/unified, unified hierarchy for systemd controller");
2115                         unified_cache = CGROUP_UNIFIED_SYSTEMD;
2116                         unified_systemd_v232 = false;
2117                 } else {
2118                         if (statfs("/sys/fs/cgroup/systemd/", &fs) < 0) {
2119                                 if (errno == ENOENT) {
2120                                         /* Some other software may have set up /sys/fs/cgroup in a configuration we do not recognize. */
2121                                         log_debug_errno(errno, "Unsupported cgroupsv1 setup detected: name=systemd hierarchy not found.");
2122                                         return -ENOMEDIUM;
2123                                 }
2124                                 return log_debug_errno(errno, "statfs(\"/sys/fs/cgroup/systemd\" failed: %m");
2125                         }
2126 
2127                         if (F_TYPE_EQUAL(fs.f_type, CGROUP2_SUPER_MAGIC)) {
2128                                 log_debug("Found cgroup2 on /sys/fs/cgroup/systemd, unified hierarchy for systemd controller (v232 variant)");
2129                                 unified_cache = CGROUP_UNIFIED_SYSTEMD;
2130                                 unified_systemd_v232 = true;
2131                         } else if (F_TYPE_EQUAL(fs.f_type, CGROUP_SUPER_MAGIC)) {
2132                                 log_debug("Found cgroup on /sys/fs/cgroup/systemd, legacy hierarchy");
2133                                 unified_cache = CGROUP_UNIFIED_NONE;
2134                         } else {
2135                                 log_debug("Unexpected filesystem type %llx mounted on /sys/fs/cgroup/systemd, assuming legacy hierarchy",
2136                                           (unsigned long long) fs.f_type);
2137                                 unified_cache = CGROUP_UNIFIED_NONE;
2138                         }
2139                 }
2140         } else if (F_TYPE_EQUAL(fs.f_type, SYSFS_MAGIC)) {
2141                 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
2142                                        "No filesystem is currently mounted on /sys/fs/cgroup.");
2143         } else
2144                 return log_debug_errno(SYNTHETIC_ERRNO(ENOMEDIUM),
2145                                        "Unknown filesystem type %llx mounted on /sys/fs/cgroup.",
2146                                        (unsigned long long)fs.f_type);
2147 
2148         return unified_cache;
2149 }
2150 
cg_unified_controller(const char * controller)2151 int cg_unified_controller(const char *controller) {
2152         int r;
2153 
2154         r = cg_unified_cached(false);
2155         if (r < 0)
2156                 return r;
2157 
2158         if (r == CGROUP_UNIFIED_NONE)
2159                 return false;
2160 
2161         if (r >= CGROUP_UNIFIED_ALL)
2162                 return true;
2163 
2164         return streq_ptr(controller, SYSTEMD_CGROUP_CONTROLLER);
2165 }
2166 
cg_all_unified(void)2167 int cg_all_unified(void) {
2168         int r;
2169 
2170         r = cg_unified_cached(false);
2171         if (r < 0)
2172                 return r;
2173 
2174         return r >= CGROUP_UNIFIED_ALL;
2175 }
2176 
cg_hybrid_unified(void)2177 int cg_hybrid_unified(void) {
2178         int r;
2179 
2180         r = cg_unified_cached(false);
2181         if (r < 0)
2182                 return r;
2183 
2184         return r == CGROUP_UNIFIED_SYSTEMD && !unified_systemd_v232;
2185 }
2186 
/* Default value for each IO limit type, indexed by the limit type constant. All four limits default
 * to CGROUP_LIMIT_MAX. */
const uint64_t cgroup_io_limit_defaults[_CGROUP_IO_LIMIT_TYPE_MAX] = {
        [CGROUP_IO_RBPS_MAX]    = CGROUP_LIMIT_MAX,
        [CGROUP_IO_WBPS_MAX]    = CGROUP_LIMIT_MAX,
        [CGROUP_IO_RIOPS_MAX]   = CGROUP_LIMIT_MAX,
        [CGROUP_IO_WIOPS_MAX]   = CGROUP_LIMIT_MAX,
};
2193 
/* String name of each IO limit type, indexed by the limit type constant. */
static const char* const cgroup_io_limit_type_table[_CGROUP_IO_LIMIT_TYPE_MAX] = {
        [CGROUP_IO_RBPS_MAX]    = "IOReadBandwidthMax",
        [CGROUP_IO_WBPS_MAX]    = "IOWriteBandwidthMax",
        [CGROUP_IO_RIOPS_MAX]   = "IOReadIOPSMax",
        [CGROUP_IO_WIOPS_MAX]   = "IOWriteIOPSMax",
};

/* NOTE(review): presumably generates cgroup_io_limit_type_to_string() and
 * cgroup_io_limit_type_from_string() on top of the table above - confirm against string-table.h */
DEFINE_STRING_TABLE_LOOKUP(cgroup_io_limit_type, CGroupIOLimitType);
2202 
is_cgroup_fs(const struct statfs * s)2203 bool is_cgroup_fs(const struct statfs *s) {
2204         return is_fs_type(s, CGROUP_SUPER_MAGIC) ||
2205                is_fs_type(s, CGROUP2_SUPER_MAGIC);
2206 }
2207 
/* Returns true if the file descriptor refers to an object on a cgroup file system (legacy or
 * cgroup2), false otherwise.
 *
 * If the file system cannot be queried, false is returned. The return type is bool, so an error code
 * cannot be propagated to the caller: returning "-errno" here would be converted to true and
 * wrongly report the fd as living on a cgroup file system. */
bool fd_is_cgroup_fs(int fd) {
        struct statfs s;

        if (fstatfs(fd, &s) < 0)
                return false;

        return is_cgroup_fs(&s);
}
2216 
/* String name of each cgroup controller, indexed by CGroupController value. The "bpf-*" entries are
 * the CGROUP_CONTROLLER_BPF_xyz pseudo-controllers rather than real kernel controllers. */
static const char *const cgroup_controller_table[_CGROUP_CONTROLLER_MAX] = {
        [CGROUP_CONTROLLER_CPU] = "cpu",
        [CGROUP_CONTROLLER_CPUACCT] = "cpuacct",
        [CGROUP_CONTROLLER_CPUSET] = "cpuset",
        [CGROUP_CONTROLLER_IO] = "io",
        [CGROUP_CONTROLLER_BLKIO] = "blkio",
        [CGROUP_CONTROLLER_MEMORY] = "memory",
        [CGROUP_CONTROLLER_DEVICES] = "devices",
        [CGROUP_CONTROLLER_PIDS] = "pids",
        [CGROUP_CONTROLLER_BPF_FIREWALL] = "bpf-firewall",
        [CGROUP_CONTROLLER_BPF_DEVICES] = "bpf-devices",
        [CGROUP_CONTROLLER_BPF_FOREIGN] = "bpf-foreign",
        [CGROUP_CONTROLLER_BPF_SOCKET_BIND] = "bpf-socket-bind",
        [CGROUP_CONTROLLER_BPF_RESTRICT_NETWORK_INTERFACES] = "bpf-restrict-network-interfaces",
};

/* NOTE(review): presumably this is what provides cgroup_controller_to_string() and
 * cgroup_controller_from_string(), which are used earlier in this file - confirm against
 * string-table.h */
DEFINE_STRING_TABLE_LOOKUP(cgroup_controller, CGroupController);
2234 
get_cpu_accounting_mask(void)2235 CGroupMask get_cpu_accounting_mask(void) {
2236         static CGroupMask needed_mask = (CGroupMask) -1;
2237 
2238         /* On kernel ≥4.15 with unified hierarchy, cpu.stat's usage_usec is
2239          * provided externally from the CPU controller, which means we don't
2240          * need to enable the CPU controller just to get metrics. This is good,
2241          * because enabling the CPU controller comes at a minor performance
2242          * hit, especially when it's propagated deep into large hierarchies.
2243          * There's also no separate CPU accounting controller available within
2244          * a unified hierarchy.
2245          *
2246          * This combination of factors results in the desired cgroup mask to
2247          * enable for CPU accounting varying as follows:
2248          *
2249          *                   ╔═════════════════════╤═════════════════════╗
2250          *                   ║     Linux ≥4.15     │     Linux <4.15     ║
2251          *   ╔═══════════════╬═════════════════════╪═════════════════════╣
2252          *   ║ Unified       ║ nothing             │ CGROUP_MASK_CPU     ║
2253          *   ╟───────────────╫─────────────────────┼─────────────────────╢
2254          *   ║ Hybrid/Legacy ║ CGROUP_MASK_CPUACCT │ CGROUP_MASK_CPUACCT ║
2255          *   ╚═══════════════╩═════════════════════╧═════════════════════╝
2256          *
2257          * We check kernel version here instead of manually checking whether
2258          * cpu.stat is present for every cgroup, as that check in itself would
2259          * already be fairly expensive.
2260          *
2261          * Kernels where this patch has been backported will therefore have the
2262          * CPU controller enabled unnecessarily. This is more expensive than
2263          * necessary, but harmless. ☺️
2264          */
2265 
2266         if (needed_mask == (CGroupMask) -1) {
2267                 if (cg_all_unified()) {
2268                         struct utsname u;
2269                         assert_se(uname(&u) >= 0);
2270 
2271                         if (strverscmp_improved(u.release, "4.15") < 0)
2272                                 needed_mask = CGROUP_MASK_CPU;
2273                         else
2274                                 needed_mask = 0;
2275                 } else
2276                         needed_mask = CGROUP_MASK_CPUACCT;
2277         }
2278 
2279         return needed_mask;
2280 }
2281 
cpu_accounting_is_cheap(void)2282 bool cpu_accounting_is_cheap(void) {
2283         return get_cpu_accounting_mask() == 0;
2284 }
2285 
/* String name of each ManagedOOMMode value, indexed by the enum constant. */
static const char* const managed_oom_mode_table[_MANAGED_OOM_MODE_MAX] = {
        [MANAGED_OOM_AUTO] = "auto",
        [MANAGED_OOM_KILL] = "kill",
};

/* NOTE(review): presumably generates managed_oom_mode_to_string() and
 * managed_oom_mode_from_string() - confirm against string-table.h */
DEFINE_STRING_TABLE_LOOKUP(managed_oom_mode, ManagedOOMMode);
2292 
/* String name of each ManagedOOMPreference value, indexed by the enum constant. */
static const char* const managed_oom_preference_table[_MANAGED_OOM_PREFERENCE_MAX] = {
        [MANAGED_OOM_PREFERENCE_NONE] = "none",
        [MANAGED_OOM_PREFERENCE_AVOID] = "avoid",
        [MANAGED_OOM_PREFERENCE_OMIT] = "omit",
};

/* NOTE(review): presumably generates managed_oom_preference_to_string() and
 * managed_oom_preference_from_string() - confirm against string-table.h */
DEFINE_STRING_TABLE_LOOKUP(managed_oom_preference, ManagedOOMPreference);
2300