1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2 
3 #include "sd-daemon.h"
4 
5 #include "bus-log-control-api.h"
6 #include "bus-util.h"
7 #include "bus-polkit.h"
8 #include "cgroup-util.h"
9 #include "fd-util.h"
10 #include "fileio.h"
11 #include "format-util.h"
12 #include "memory-util.h"
13 #include "oomd-manager-bus.h"
14 #include "oomd-manager.h"
15 #include "path-util.h"
16 #include "percent-util.h"
17 
18 typedef struct ManagedOOMMessage {
19         ManagedOOMMode mode;
20         char *path;
21         char *property;
22         uint32_t limit;
23 } ManagedOOMMessage;
24 
/* Cleanup handler for a ManagedOOMMessage: releases the strings the message owns, but not the struct
 * itself (it is used on stack-allocated messages via _cleanup_). */
static void managed_oom_message_destroy(ManagedOOMMessage *message) {
        assert(message);

        /* free(NULL) is a no-op, hence a message that was never filled in is fine here too. */
        free(message->property);
        free(message->path);
}
30 
/* JSON dispatch callback for the "mode" field: converts the string variant 'v' into a ManagedOOMMode and
 * stores it in the location 'userdata' points to. Returns 0 on success, or whatever json_log() returns
 * for the negative value produced by managed_oom_mode_from_string() if the string is not recognized. */
static int managed_oom_mode(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
        ManagedOOMMode *ret_mode = userdata;
        ManagedOOMMode parsed;
        const char *text;

        assert(ret_mode);
        assert_se(text = json_variant_string(v));

        parsed = managed_oom_mode_from_string(text);
        if (parsed >= 0) {
                *ret_mode = parsed;
                return 0;
        }

        return json_log(v, flags, parsed, "%s is not a valid ManagedOOMMode", text);
}
45 
/* Applies a ManagedOOM update to the manager's hashmaps of monitored cgroups.
 *
 * 'parameters' must carry a "cgroups" array. Each element is expected to be an object with the keys
 * "mode", "path" and "property" (all mandatory) and an optional "limit". Elements that are not objects or
 * that fail to parse are skipped, so that the remaining entries are still applied.
 *
 * 'uid' is the UID of the sender. UID 0 may update any cgroup; every other sender may only update cgroups
 * whose owner UID matches its own.
 *
 * Returns 0 on success, -EINVAL if there is no "cgroups" key, -EPERM if a non-root sender reports a cgroup
 * owned by a different UID (processing stops at that entry), and -ENOMEM on allocation failure. */
static int process_managed_oom_message(Manager *m, uid_t uid, JsonVariant *parameters) {
        JsonVariant *c, *cgroups;
        int r;

        static const JsonDispatch dispatch_table[] = {
                { "mode",     JSON_VARIANT_STRING,   managed_oom_mode,     offsetof(ManagedOOMMessage, mode),     JSON_MANDATORY },
                { "path",     JSON_VARIANT_STRING,   json_dispatch_string, offsetof(ManagedOOMMessage, path),     JSON_MANDATORY },
                { "property", JSON_VARIANT_STRING,   json_dispatch_string, offsetof(ManagedOOMMessage, property), JSON_MANDATORY },
                { "limit",    JSON_VARIANT_UNSIGNED, json_dispatch_uint32, offsetof(ManagedOOMMessage, limit),    0 },
                {},
        };

        assert(m);
        assert(parameters);

        cgroups = json_variant_by_key(parameters, "cgroups");
        if (!cgroups)
                return -EINVAL;

        /* Skip malformed elements and keep processing in case the others are good */
        JSON_VARIANT_ARRAY_FOREACH(c, cgroups) {
                _cleanup_(managed_oom_message_destroy) ManagedOOMMessage message = {};
                OomdCGroupContext *ctx;
                Hashmap *monitor_hm;
                loadavg_t limit;

                if (!json_variant_is_object(c))
                        continue;

                r = json_dispatch(c, dispatch_table, NULL, 0, &message);
                if (r == -ENOMEM)
                        return r;
                if (r < 0)
                        continue;

                if (uid != 0) {
                        uid_t cg_uid;

                        r = cg_path_get_owner_uid(message.path, &cg_uid);
                        if (r < 0) {
                                /* The error is carried in 'r', not in errno, hence use the _errno() variant
                                 * so that %m expands to the actual failure reason. */
                                log_debug_errno(r, "Failed to get cgroup %s owner uid: %m", message.path);
                                continue;
                        }

                        /* Let's not be lenient for permission errors and skip processing if we receive an
                         * update for a cgroup that doesn't belong to the user. */
                        if (uid != cg_uid)
                                return log_error_errno(SYNTHETIC_ERRNO(EPERM),
                                                       "cgroup path owner UID does not match sender uid "
                                                       "(" UID_FMT " != " UID_FMT ")", uid, cg_uid);
                }

                /* Everything that is not a swap request is tracked in the memory pressure hashmap. */
                monitor_hm = streq(message.property, "ManagedOOMSwap") ?
                                m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts;

                /* Mode "auto": drop the cgroup from the monitored set and release its context. */
                if (message.mode == MANAGED_OOM_AUTO) {
                        (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, empty_to_root(message.path)));
                        continue;
                }

                limit = m->default_mem_pressure_limit;

                /* A non-zero limit on a memory pressure entry overrides the manager-wide default. */
                if (streq(message.property, "ManagedOOMMemoryPressure") && message.limit > 0) {
                        int permyriad = UINT32_SCALE_TO_PERMYRIAD(message.limit);

                        r = store_loadavg_fixed_point(permyriad / 100LU, permyriad % 100LU, &limit);
                        if (r < 0)
                                continue;
                }

                r = oomd_insert_cgroup_context(NULL, monitor_hm, message.path);
                if (r == -ENOMEM)
                        return r;
                if (r < 0 && r != -EEXIST)
                        log_debug_errno(r, "Failed to insert message, ignoring: %m");

                /* Always update the limit in case it was changed. For non-memory pressure detection the value is
                 * ignored so always updating it here is not a problem. */
                ctx = hashmap_get(monitor_hm, empty_to_root(message.path));
                if (ctx)
                        ctx->mem_pressure_limit = limit;
        }

        return 0;
}
131 
/* Varlink method callback: looks up the UID of the calling peer and hands the request parameters to
 * process_managed_oom_message() together with that UID. 'userdata' is the Manager. */
static int process_managed_oom_request(
                Varlink *link,
                JsonVariant *parameters,
                VarlinkMethodFlags flags,
                void *userdata) {

        Manager *manager = userdata;
        uid_t peer_uid;
        int r;

        assert(manager);

        r = varlink_get_peer_uid(link, &peer_uid);
        if (r >= 0)
                return process_managed_oom_message(manager, peer_uid, parameters);

        return log_error_errno(r, "Failed to get varlink peer uid: %m");
}
149 
/* Varlink reply callback for the client connection. On an error reply the result is -EIO; otherwise the
 * peer UID is determined and the reply parameters are passed to process_managed_oom_message().
 *
 * Regardless of the outcome, the client connection is closed and m->varlink_client reset whenever the
 * reply is not flagged VARLINK_REPLY_CONTINUES. 'userdata' is the Manager. */
static int process_managed_oom_reply(
                Varlink *link,
                JsonVariant *parameters,
                const char *error_id,
                VarlinkReplyFlags flags,
                void *userdata) {

        Manager *manager = userdata;
        uid_t peer_uid;
        int r;

        assert(manager);

        if (error_id) {
                log_debug("Error getting ManagedOOM cgroups: %s", error_id);
                r = -EIO;
        } else {
                r = varlink_get_peer_uid(link, &peer_uid);
                if (r < 0) {
                        log_error_errno(r, "Failed to get varlink peer uid: %m");
                } else {
                        r = process_managed_oom_message(manager, peer_uid, parameters);
                }
        }

        /* No further replies expected on this link? Then let go of it. */
        if (!FLAGS_SET(flags, VARLINK_REPLY_CONTINUES))
                manager->varlink_client = varlink_close_unref(link);

        return r;
}
182 
183 /* Fill 'new_h' with 'path's descendant OomdCGroupContexts. Only include descendant cgroups that are possible
184  * candidates for action. That is, only leaf cgroups or cgroups with memory.oom.group set to "1".
185  *
186  * This function ignores most errors in order to handle cgroups that may have been cleaned up while
187  * populating the hashmap.
188  *
189  * 'new_h' is of the form { key: cgroup paths -> value: OomdCGroupContext } */
recursively_get_cgroup_context(Hashmap * new_h,const char * path)190 static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) {
191         _cleanup_free_ char *subpath = NULL;
192         _cleanup_closedir_ DIR *d = NULL;
193         int r;
194 
195         assert(new_h);
196         assert(path);
197 
198         r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
199         if (r < 0)
200                 return r;
201 
202         r = cg_read_subgroup(d, &subpath);
203         if (r < 0)
204                 return r;
205         else if (r == 0) { /* No subgroups? We're a leaf node */
206                 r = oomd_insert_cgroup_context(NULL, new_h, path);
207                 if (r == -ENOMEM)
208                         return r;
209                 if (r < 0)
210                         log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", path);
211                 return 0;
212         }
213 
214         do {
215                 _cleanup_free_ char *cg_path = NULL;
216                 bool oom_group;
217 
218                 cg_path = path_join(empty_to_root(path), subpath);
219                 if (!cg_path)
220                         return -ENOMEM;
221 
222                 subpath = mfree(subpath);
223 
224                 r = cg_get_attribute_as_bool("memory", cg_path, "memory.oom.group", &oom_group);
225                 /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */
226                 if (r == -ENOMEM)
227                         return r;
228                 if (r < 0) {
229                         log_debug_errno(r, "Failed to read memory.oom.group from %s, ignoring: %m", cg_path);
230                         return 0;
231                 }
232 
233                 if (oom_group)
234                         r = oomd_insert_cgroup_context(NULL, new_h, cg_path);
235                 else
236                         r = recursively_get_cgroup_context(new_h, cg_path);
237                 if (r == -ENOMEM)
238                         return r;
239                 if (r < 0)
240                         log_debug_errno(r, "Failed to insert or recursively get from %s, ignoring: %m", cg_path);
241         } while ((r = cg_read_subgroup(d, &subpath)) > 0);
242 
243         return 0;
244 }
245 
/* Rebuilds the hashmap '*monitored_cgroups' points to: for every context currently in it, a context for
 * the same path is inserted into a freshly allocated hashmap (with the old hashmap passed along to
 * oomd_insert_cgroup_context()), then the old hashmap is freed and replaced by the new one.
 *
 * Returns 0 on success and -ENOMEM on allocation failure; all other insertion errors are ignored. */
static int update_monitored_cgroup_contexts(Hashmap **monitored_cgroups) {
        _cleanup_hashmap_free_ Hashmap *refreshed = NULL;
        OomdCGroupContext *old_ctx;
        int r;

        assert(monitored_cgroups);

        refreshed = hashmap_new(&oomd_cgroup_ctx_hash_ops);
        if (!refreshed)
                return -ENOMEM;

        HASHMAP_FOREACH(old_ctx, *monitored_cgroups) {
                r = oomd_insert_cgroup_context(*monitored_cgroups, refreshed, old_ctx->path);
                if (r >= 0)
                        continue;
                if (r == -ENOMEM)
                        return r;

                /* The cgroup we're trying to update might not exist anymore, hence stay quiet about
                 * -EEXIST/-ENOENT and merely log everything else. */
                if (IN_SET(r, -EEXIST, -ENOENT))
                        continue;

                log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", old_ctx->path);
        }

        hashmap_free(*monitored_cgroups);
        *monitored_cgroups = TAKE_PTR(refreshed);

        return 0;
}
271 
/* Allocates a new hashmap and fills it via recursively_get_cgroup_context() for each context in
 * 'monitored_cgroups'. On success the hashmap is handed to the caller in '*ret_candidates'.
 *
 * Returns 0 on success and -ENOMEM on allocation failure; other per-cgroup errors are logged and
 * ignored. */
static int get_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **ret_candidates) {
        _cleanup_hashmap_free_ Hashmap *collected = NULL;
        OomdCGroupContext *monitored;
        int r;

        assert(monitored_cgroups);
        assert(ret_candidates);

        collected = hashmap_new(&oomd_cgroup_ctx_hash_ops);
        if (!collected)
                return -ENOMEM;

        HASHMAP_FOREACH(monitored, monitored_cgroups) {
                r = recursively_get_cgroup_context(collected, monitored->path);
                if (r >= 0)
                        continue;
                if (r == -ENOMEM)
                        return r;

                log_debug_errno(r, "Failed to recursively get contexts for %s, ignoring: %m", monitored->path);
        }

        *ret_candidates = TAKE_PTR(collected);

        return 0;
}
296 
/* Replaces '*candidates' with a newly collected candidate hashmap for 'monitored_cgroups'. Before the old
 * hashmap is freed, oomd_update_cgroup_contexts_between_hashmaps() is invoked with the old and the new
 * hashmap. On failure '*candidates' is left untouched and the (logged) error is returned. */
static int update_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **candidates) {
        _cleanup_hashmap_free_ Hashmap *fresh = NULL;
        int r;

        assert(monitored_cgroups);
        assert(candidates);
        assert(*candidates);

        r = get_monitored_cgroup_contexts_candidates(monitored_cgroups, &fresh);
        if (r < 0)
                return log_debug_errno(r, "Failed to get candidate contexts: %m");

        oomd_update_cgroup_contexts_between_hashmaps(*candidates, fresh);

        /* Swap in the new hashmap; the old one is no longer needed. */
        hashmap_free(*candidates);
        *candidates = TAKE_PTR(fresh);

        return 0;
}
316 
/* Opens the varlink client connection to VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM, attaches it to the
 * manager's event loop, binds process_managed_oom_reply() as reply callback and issues the
 * SubscribeManagedOOMCGroups call via varlink_observe(). On success the connection is stored in
 * m->varlink_client; on failure it is closed again and the logged error is returned. */
static int acquire_managed_oom_connect(Manager *m) {
        _cleanup_(varlink_close_unrefp) Varlink *vl = NULL;
        int r;

        assert(m);
        assert(m->event);

        r = varlink_connect_address(&vl, VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM);
        if (r < 0)
                return log_error_errno(r, "Failed to connect to " VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM ": %m");

        /* Failures of these setters are deliberately ignored. */
        (void) varlink_set_userdata(vl, m);
        (void) varlink_set_description(vl, "oomd");
        (void) varlink_set_relative_timeout(vl, USEC_INFINITY);

        r = varlink_attach_event(vl, m->event, SD_EVENT_PRIORITY_NORMAL);
        if (r < 0)
                return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");

        r = varlink_bind_reply(vl, process_managed_oom_reply);
        if (r < 0)
                return log_error_errno(r, "Failed to bind reply callback: %m");

        r = varlink_observe(vl, "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to observe varlink call: %m");

        m->varlink_client = TAKE_PTR(vl);
        return 0;
}
347 
/* Timer callback of the swap monitor. Re-arms the timer for SWAP_INTERVAL_USEC, re-establishes the varlink
 * client connection if it is gone, refreshes the system context from /proc/meminfo and, if both available
 * memory and free swap are below the configured limit, tries to kill a candidate cgroup by swap usage.
 *
 * Returns 0 normally and a negative errno-style value on failures that are not ignored. */
static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
        _cleanup_hashmap_free_ Hashmap *candidates = NULL;
        _cleanup_free_ char *selected = NULL;
        Manager *m = userdata;
        uint64_t threshold;
        usec_t usec_now;
        int r;

        assert(s);
        assert(userdata);

        /* Reset timer */
        r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
        if (r < 0)
                return log_error_errno(r, "Failed to reset event timer: %m");

        r = sd_event_source_set_time_relative(s, SWAP_INTERVAL_USEC);
        if (r < 0)
                return log_error_errno(r, "Failed to set relative time for timer: %m");

        /* Reconnect if our connection dropped */
        if (!m->varlink_client) {
                r = acquire_managed_oom_connect(m);
                if (r < 0)
                        return log_error_errno(r, "Failed to acquire varlink connection: %m");
        }

        /* We still try to acquire system information for oomctl even if no units want swap monitoring.
         * If there are no units depending on swap actions, the only error we exit on is ENOMEM. */
        r = oomd_system_context_acquire("/proc/meminfo", &m->system_context);
        if (r == -ENOMEM || (r < 0 && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
                return log_error_errno(r, "Failed to acquire system context: %m");

        /* Return early if nothing is requesting swap monitoring */
        if (hashmap_isempty(m->monitored_swap_cgroup_contexts))
                return 0;

        /* Note that m->monitored_swap_cgroup_contexts does not need to be updated every interval because only the
         * system context is used for deciding whether the swap threshold is hit. m->monitored_swap_cgroup_contexts
         * is only used to decide which cgroups to kill (and even then only the resource usages of its descendent
         * nodes are the ones that matter). */

        /* Check amount of memory available and swap free so we don't free up swap when memory is still
         * available. Nothing to do unless both are below the limit. */
        if (!oomd_mem_available_below(&m->system_context, 10000 - m->swap_used_limit_permyriad) ||
            !oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad))
                return 0;

        log_debug("Memory used (%"PRIu64") / total (%"PRIu64") and "
                  "swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR,
                  m->system_context.mem_used, m->system_context.mem_total,
                  m->system_context.swap_used, m->system_context.swap_total,
                  PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));

        r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
        if (r == -ENOMEM)
                return log_oom();
        if (r < 0)
                log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");

        threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100;
        r = oomd_kill_by_swap_usage(candidates, threshold, m->dry_run, &selected);
        if (r == -ENOMEM)
                return log_oom();
        if (r < 0) {
                log_notice_errno(r, "Failed to kill any cgroup(s) based on swap: %m");
                return 0;
        }

        if (selected && r > 0)
                log_notice("Killed %s due to memory used (%"PRIu64") / total (%"PRIu64") and "
                           "swap used (%"PRIu64") / total (%"PRIu64") being more than "
                           PERMYRIAD_AS_PERCENT_FORMAT_STR,
                           selected,
                           m->system_context.mem_used, m->system_context.mem_total,
                           m->system_context.swap_used, m->system_context.swap_total,
                           PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));

        return 0;
}
427 
/* Cleanup helper: if '*m' is set, empties that manager's memory pressure candidate hashmap. A NULL '*m'
 * makes this a no-op. */
static void clear_candidate_hashmapp(Manager **m) {
        if (!*m)
                return;

        hashmap_clear((*m)->monitored_mem_pressure_cgroup_contexts_candidates);
}
432 
/* Timer callback of the memory pressure monitor. Re-arms the timer for MEM_PRESSURE_INTERVAL_USEC,
 * re-establishes the varlink client connection if it is gone, refreshes the monitored contexts and then
 * either
 *   - tries to kill one candidate below a cgroup whose pressure is above its limit (at most one kill per
 *     invocation, and none while the post-action delay is running), or
 *   - refreshes the candidate data if any monitored cgroup has started hitting its limit.
 *
 * Returns 0 normally and a negative errno-style value on failures that are not ignored. */
static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
        /* Stale candidate data must not be reused: unless this invocation refreshes the candidate hashmap
         * (and resets clear_candidates to NULL to record that), the hashmap is emptied when we return. */
        _unused_ _cleanup_(clear_candidate_hashmapp) Manager *clear_candidates = userdata;
        _cleanup_set_free_ Set *targets = NULL;
        bool in_post_action_delay = false;
        Manager *m = userdata;
        usec_t usec_now;
        int r;

        assert(s);
        assert(userdata);

        /* Reset timer */
        r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
        if (r < 0)
                return log_error_errno(r, "Failed to reset event timer: %m");

        r = sd_event_source_set_time_relative(s, MEM_PRESSURE_INTERVAL_USEC);
        if (r < 0)
                return log_error_errno(r, "Failed to set relative time for timer: %m");

        /* Reconnect if our connection dropped */
        if (!m->varlink_client) {
                r = acquire_managed_oom_connect(m);
                if (r < 0)
                        return log_error_errno(r, "Failed to acquire varlink connection: %m");
        }

        /* Return early if nothing is requesting memory pressure monitoring */
        if (hashmap_isempty(m->monitored_mem_pressure_cgroup_contexts))
                return 0;

        /* Update the cgroups used for detection/action */
        r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts);
        if (r == -ENOMEM)
                return log_oom();
        if (r < 0)
                log_debug_errno(r, "Failed to update monitored memory pressure cgroup contexts, ignoring: %m");

        /* Since pressure counters are lagging, we need to wait a bit after a kill to ensure we don't read stale
         * values and go on a kill storm. Once the delay has elapsed, forget about it. */
        if (m->mem_pressure_post_action_delay_start > 0) {
                in_post_action_delay = m->mem_pressure_post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now;
                if (!in_post_action_delay)
                        m->mem_pressure_post_action_delay_start = 0;
        }

        r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
        if (r == -ENOMEM)
                return log_oom();
        if (r < 0) {
                log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m");
                return 0;
        }

        if (r != 1 || in_post_action_delay) {
                /* If any monitored cgroup is over their pressure limit, get all the kill candidates for every
                 * monitored cgroup. This saves CPU cycles from doing it every interval by only doing it when a kill
                 * might happen.
                 * Candidate cgroup data will continue to get updated during the post-action delay period in case
                 * pressure continues to be high after a kill. */
                OomdCGroupContext *monitored;

                HASHMAP_FOREACH(monitored, m->monitored_mem_pressure_cgroup_contexts) {
                        if (monitored->mem_pressure_limit_hit_start == 0)
                                continue;

                        r = update_monitored_cgroup_contexts_candidates(
                                        m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
                        if (r == -ENOMEM)
                                return log_oom();
                        if (r < 0) {
                                log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
                                continue;
                        }

                        /* One successful refresh covers all monitored cgroups. */
                        clear_candidates = NULL;
                        break;
                }

                return 0;
        }

        /* Pressure is above the limit somewhere and we are not in the post-action delay: try to act. */
        OomdCGroupContext *target;

        SET_FOREACH(target, targets) {
                _cleanup_free_ char *selected = NULL;

                /* Check if there was reclaim activity in the given interval. The concern is the following case:
                 * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
                 * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
                 * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
                 * to kill something (it won't help anyways). */
                if ((now(CLOCK_MONOTONIC) - target->last_had_mem_reclaim) > RECLAIM_DURATION_USEC)
                        continue;

                log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity",
                          target->path,
                          LOADAVG_INT_SIDE(target->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(target->memory_pressure.avg10),
                          LOADAVG_INT_SIDE(target->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(target->mem_pressure_limit),
                          FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC));

                r = update_monitored_cgroup_contexts_candidates(
                                m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
                if (r == -ENOMEM)
                        return log_oom();
                if (r < 0)
                        log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
                else
                        clear_candidates = NULL;

                r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, target->path, m->dry_run, &selected);
                if (r == -ENOMEM)
                        return log_oom();
                if (r < 0) {
                        /* Couldn't act on this target, move on to the next one. */
                        log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", target->path);
                        continue;
                }

                /* Don't act on all the high pressure cgroups at once; return as soon as we kill one.
                 * If r == 0 then it means there were not eligible candidates, the candidate cgroup
                 * disappeared, or the candidate cgroup has no processes by the time we tried to kill
                 * it. In either case, go through the event loop again and select a new candidate if
                 * pressure is still high. */
                m->mem_pressure_post_action_delay_start = usec_now;
                if (selected && r > 0)
                        log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
                                   " for > %s with reclaim activity",
                                   selected, target->path,
                                   LOADAVG_INT_SIDE(target->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(target->memory_pressure.avg10),
                                   LOADAVG_INT_SIDE(target->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(target->mem_pressure_limit),
                                   FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC));
                return 0;
        }

        return 0;
}
563 
/* Registers the swap monitoring timer (CLOCK_MONOTONIC, callback monitor_swap_contexts_handler()) on the
 * manager's event loop, marks it exit-on-failure, enables it with SD_EVENT_ON and stores the source in
 * m->swap_context_event_source. Returns 0 on success, a negative error otherwise. */
static int monitor_swap_contexts(Manager *m) {
        _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
        int r;

        assert(m);
        assert(m->event);

        r = sd_event_add_time(m->event, &timer, CLOCK_MONOTONIC, 0, 0, monitor_swap_contexts_handler, m);
        if (r < 0)
                return r;

        r = sd_event_source_set_exit_on_failure(timer, true);
        if (r < 0)
                return r;

        r = sd_event_source_set_enabled(timer, SD_EVENT_ON);
        if (r < 0)
                return r;

        /* The description is informational only, failing to set it is not an error. */
        (void) sd_event_source_set_description(timer, "oomd-swap-timer");

        m->swap_context_event_source = TAKE_PTR(timer);
        return 0;
}
588 
/* Registers the memory pressure monitoring timer (CLOCK_MONOTONIC, callback
 * monitor_memory_pressure_contexts_handler()) on the manager's event loop, marks it exit-on-failure,
 * enables it with SD_EVENT_ON and stores the source in m->mem_pressure_context_event_source. Returns 0 on
 * success, a negative error otherwise. */
static int monitor_memory_pressure_contexts(Manager *m) {
        _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
        int r;

        assert(m);
        assert(m->event);

        r = sd_event_add_time(m->event, &timer, CLOCK_MONOTONIC, 0, 0, monitor_memory_pressure_contexts_handler, m);
        if (r < 0)
                return r;

        r = sd_event_source_set_exit_on_failure(timer, true);
        if (r < 0)
                return r;

        r = sd_event_source_set_enabled(timer, SD_EVENT_ON);
        if (r < 0)
                return r;

        /* The description is informational only, failing to set it is not an error. */
        (void) sd_event_source_set_description(timer, "oomd-memory-pressure-timer");

        m->mem_pressure_context_event_source = TAKE_PTR(timer);
        return 0;
}
613 
/* Releases everything the Manager references (varlink server and client, event sources, event loop,
 * polkit registry, bus connection, the three cgroup context hashmaps) and then the Manager itself.
 * 'm' must not be NULL. Always returns NULL. */
Manager* manager_free(Manager *m) {
        assert(m);

        /* Varlink endpoints, timers and the event loop */
        varlink_server_unref(m->varlink_server);
        varlink_close_unref(m->varlink_client);
        sd_event_source_unref(m->swap_context_event_source);
        sd_event_source_unref(m->mem_pressure_context_event_source);
        sd_event_unref(m->event);

        /* D-Bus side */
        bus_verify_polkit_async_registry_free(m->polkit_registry);
        sd_bus_flush_close_unref(m->bus);

        /* Monitored cgroups and kill candidates */
        hashmap_free(m->monitored_swap_cgroup_contexts);
        hashmap_free(m->monitored_mem_pressure_cgroup_contexts);
        hashmap_free(m->monitored_mem_pressure_cgroup_contexts_candidates);

        free(m);
        return NULL;
}
632 
/* Allocates a zero-initialized Manager, acquires the default event loop (with watchdog support requested
 * and SIGINT/SIGTERM signal sources added with a NULL handler) and creates the three cgroup context
 * hashmaps. On success the new object is returned in '*ret'; on failure everything allocated so far is
 * released again.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the error of the failing sd-event call. */
int manager_new(Manager **ret) {
        _cleanup_(manager_freep) Manager *mgr = NULL;
        int r;

        assert(ret);

        mgr = new0(Manager, 1);
        if (!mgr)
                return -ENOMEM;

        r = sd_event_default(&mgr->event);
        if (r < 0)
                return r;

        (void) sd_event_set_watchdog(mgr->event, true);

        r = sd_event_add_signal(mgr->event, NULL, SIGINT, NULL, NULL);
        if (r < 0)
                return r;

        r = sd_event_add_signal(mgr->event, NULL, SIGTERM, NULL, NULL);
        if (r < 0)
                return r;

        mgr->monitored_swap_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
        if (!mgr->monitored_swap_cgroup_contexts)
                return -ENOMEM;

        mgr->monitored_mem_pressure_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
        if (!mgr->monitored_mem_pressure_cgroup_contexts)
                return -ENOMEM;

        mgr->monitored_mem_pressure_cgroup_contexts_candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops);
        if (!mgr->monitored_mem_pressure_cgroup_contexts_candidates)
                return -ENOMEM;

        /* Fully set up: pass ownership to the caller. */
        *ret = TAKE_PTR(mgr);
        return 0;
}
672 
/* Sets up the manager's bus connection: opens it (description "bus-api-oom"), registers the manager_object
 * implementation and the log control API, asynchronously requests the name "org.freedesktop.oom1" and
 * attaches the connection to the manager's event loop. Must only be called while m->bus is still unset.
 *
 * Returns 0 on success, a negative error otherwise. */
static int manager_connect_bus(Manager *m) {
        int r;

        assert(m);
        assert(!m->bus);

        r = bus_open_system_watch_bind_with_description(&m->bus, "bus-api-oom");
        if (r < 0)
                return log_error_errno(r, "Failed to connect to bus: %m");

        /* Note: these two failures are passed up without a log message of our own. */
        r = bus_add_implementation(m->bus, &manager_object, m);
        if (r < 0)
                return r;

        r = bus_log_control_api_register(m->bus);
        if (r < 0)
                return r;

        r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.oom1", 0, NULL, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to request name: %m");

        r = sd_bus_attach_event(m->bus, m->event, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to attach bus to event loop: %m");

        return 0;
}
701 
/* Creates the manager's varlink server. Must only be called while m->varlink_server is unset.
 *
 * The server gets the manager as userdata and binds the method
 * "io.systemd.oom.ReportManagedOOMCGroups" to process_managed_oom_request(). If fd is negative the
 * server listens on VARLINK_ADDR_PATH_MANAGED_OOM_USER (mode 0666), otherwise on the passed fd.
 *
 * Returns 0 on success, or a negative errno-style value (already logged) on failure. The server is
 * only stored in the manager once every step has succeeded; until then the cleanup handler on the
 * local reference releases it. */
static int manager_varlink_init(Manager *m, int fd) {
        _cleanup_(varlink_server_unrefp) VarlinkServer *server = NULL;
        int r;

        assert(m);
        assert(!m->varlink_server);

        r = varlink_server_new(&server, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA);
        if (r < 0)
                return log_error_errno(r, "Failed to allocate varlink server object: %m");

        varlink_server_set_userdata(server, m);

        r = varlink_server_bind_method(server, "io.systemd.oom.ReportManagedOOMCGroups", process_managed_oom_request);
        if (r < 0)
                return log_error_errno(r, "Failed to register varlink method: %m");

        /* A negative fd means no socket was handed to us, so bind the well-known path ourselves. */
        r = fd < 0
                ? varlink_server_listen_address(server, VARLINK_ADDR_PATH_MANAGED_OOM_USER, 0666)
                : varlink_server_listen_fd(server, fd);
        if (r < 0)
                return log_error_errno(r, "Failed to bind to varlink socket: %m");

        r = varlink_server_attach_event(server, m->event, SD_EVENT_PRIORITY_NORMAL);
        if (r < 0)
                return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");

        log_debug("Initialized systemd-oomd varlink server");

        m->varlink_server = TAKE_PTR(server);
        return 0;
}
735 
/* Applies the runtime configuration to the manager and brings up all of its components.
 *
 * m                            - manager to start
 * dry_run                      - stored in m->dry_run
 * swap_used_limit_permyriad    - swap usage limit in 1/10000ths; a negative value selects
 *                                DEFAULT_SWAP_USED_LIMIT_PERCENT
 * mem_pressure_limit_permyriad - default memory pressure limit in 1/10000ths; a negative value
 *                                selects DEFAULT_MEM_PRESSURE_LIMIT_PERCENT
 * mem_pressure_usec            - default memory pressure duration; 0 selects
 *                                DEFAULT_MEM_PRESSURE_DURATION_USEC
 * fd                           - passed on to manager_varlink_init()
 *
 * Limits above 10000 permyriad trip an assertion. Returns 0 on success or the negative errno-style
 * value of the first step that failed. */
int manager_start(
                Manager *m,
                bool dry_run,
                int swap_used_limit_permyriad,
                int mem_pressure_limit_permyriad,
                usec_t mem_pressure_usec,
                int fd) {

        unsigned long percent_whole, percent_frac;
        int r;

        assert(m);

        m->dry_run = dry_run;

        if (swap_used_limit_permyriad >= 0)
                m->swap_used_limit_permyriad = swap_used_limit_permyriad;
        else
                m->swap_used_limit_permyriad = DEFAULT_SWAP_USED_LIMIT_PERCENT * 100;
        assert(m->swap_used_limit_permyriad <= 10000);

        /* Split the permyriad value into whole percent and hundredths of a percent. */
        if (mem_pressure_limit_permyriad < 0) {
                percent_whole = DEFAULT_MEM_PRESSURE_LIMIT_PERCENT;
                percent_frac = 0;
        } else {
                assert(mem_pressure_limit_permyriad <= 10000);

                percent_whole = mem_pressure_limit_permyriad / 100;
                percent_frac = mem_pressure_limit_permyriad % 100;
        }
        r = store_loadavg_fixed_point(percent_whole, percent_frac, &m->default_mem_pressure_limit);
        if (r < 0)
                return r;

        m->default_mem_pressure_duration_usec =
                mem_pressure_usec != 0 ? mem_pressure_usec : DEFAULT_MEM_PRESSURE_DURATION_USEC;

        /* Bring up the components in order; the first failure aborts the start. */
        r = manager_connect_bus(m);
        if (r < 0)
                return r;

        r = acquire_managed_oom_connect(m);
        if (r < 0)
                return r;

        r = manager_varlink_init(m, fd);
        if (r < 0)
                return r;

        r = monitor_memory_pressure_contexts(m);
        if (r < 0)
                return r;

        r = monitor_swap_contexts(m);
        if (r < 0)
                return r;

        return 0;
}
791 
/* Renders a human-readable dump of the manager's state into a newly allocated string.
 *
 * The dump contains the configured settings (dry run, swap used limit, default memory pressure
 * limit and duration), the system context, and one entry for every cgroup context in the swap and
 * memory pressure monitoring hashmaps.
 *
 * On success stores the string in *ret (ownership passes to the caller) and returns 0. On failure
 * returns a negative errno-style value and leaves *ret untouched. */
int manager_get_dump_string(Manager *m, char **ret) {
        _cleanup_free_ char *buf = NULL;
        _cleanup_fclose_ FILE *stream = NULL;
        OomdCGroupContext *ctx;
        size_t buf_size;
        char *key;
        int r;

        assert(m);
        assert(ret);

        stream = open_memstream_unlocked(&buf, &buf_size);
        if (!stream)
                return -errno;

        fprintf(stream,
                "Dry Run: %s\n"
                "Swap Used Limit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n"
                "Default Memory Pressure Limit: %lu.%02lu%%\n"
                "Default Memory Pressure Duration: %s\n"
                "System Context:\n",
                yes_no(m->dry_run),
                PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad),
                LOADAVG_INT_SIDE(m->default_mem_pressure_limit), LOADAVG_DECIMAL_SIDE(m->default_mem_pressure_limit),
                FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC));
        oomd_dump_system_context(&m->system_context, stream, "\t");

        /* The key is required by the iteration macro but not needed for the dump itself. */
        fputs("Swap Monitored CGroups:\n", stream);
        HASHMAP_FOREACH_KEY(ctx, key, m->monitored_swap_cgroup_contexts)
                oomd_dump_swap_cgroup_context(ctx, stream, "\t");

        fputs("Memory Pressure Monitored CGroups:\n", stream);
        HASHMAP_FOREACH_KEY(ctx, key, m->monitored_mem_pressure_cgroup_contexts)
                oomd_dump_memory_pressure_cgroup_context(ctx, stream, "\t");

        /* Surface any write error collected by the stream before trusting the buffer. */
        r = fflush_and_check(stream);
        if (r < 0)
                return r;

        /* Close the stream before handing out the buffer it was writing into. */
        stream = safe_fclose(stream);

        *ret = TAKE_PTR(buf);
        return 0;
}
836