/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include "sd-daemon.h"

#include "bus-log-control-api.h"
#include "bus-util.h"
#include "bus-polkit.h"
#include "cgroup-util.h"
#include "fd-util.h"
#include "fileio.h"
#include "format-util.h"
#include "memory-util.h"
#include "oomd-manager-bus.h"
#include "oomd-manager.h"
#include "path-util.h"
#include "percent-util.h"

typedef struct ManagedOOMMessage {
        ManagedOOMMode mode;
        char *path;
        char *property;
        uint32_t limit;
} ManagedOOMMessage;

static void managed_oom_message_destroy(ManagedOOMMessage *message) {
        assert(message);
        free(message->path);
        free(message->property);
}

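/* JSON dispatch callback: translate the "mode" string field into a ManagedOOMMode enum value. */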
static int managed_oom_mode(const char *name, JsonVariant *v, JsonDispatchFlags flags, void *userdata) {
        ManagedOOMMode *mode = userdata, m;
        const char *s;

        assert(mode);
        assert_se(s = json_variant_string(v));

        m = managed_oom_mode_from_string(s);
        if (m < 0)
                return json_log(v, flags, m, "%s is not a valid ManagedOOMMode", s);

        *mode = m;
        return 0;
}

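/* Process one ManagedOOM update: dispatch each element of the "cgroups" array into a ManagedOOMMessage and
 * add or remove the referenced cgroup in the relevant monitoring hashmap. */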
static int process_managed_oom_message(Manager *m, uid_t uid, JsonVariant *parameters) {
        JsonVariant *c, *cgroups;
        int r;

        static const JsonDispatch dispatch_table[] = {
                { "mode",     JSON_VARIANT_STRING,   managed_oom_mode,     offsetof(ManagedOOMMessage, mode),     JSON_MANDATORY },
                { "path",     JSON_VARIANT_STRING,   json_dispatch_string, offsetof(ManagedOOMMessage, path),     JSON_MANDATORY },
                { "property", JSON_VARIANT_STRING,   json_dispatch_string, offsetof(ManagedOOMMessage, property), JSON_MANDATORY },
                { "limit",    JSON_VARIANT_UNSIGNED, json_dispatch_uint32, offsetof(ManagedOOMMessage, limit),    0              },
                {},
        };

        assert(m);
        assert(parameters);

        cgroups = json_variant_by_key(parameters, "cgroups");
        if (!cgroups)
                return -EINVAL;

        /* Skip malformed elements and keep processing in case the others are good */
        JSON_VARIANT_ARRAY_FOREACH(c, cgroups) {
                _cleanup_(managed_oom_message_destroy) ManagedOOMMessage message = {};
                OomdCGroupContext *ctx;
                Hashmap *monitor_hm;
                loadavg_t limit;

                if (!json_variant_is_object(c))
                        continue;

                r = json_dispatch(c, dispatch_table, NULL, 0, &message);
                if (r == -ENOMEM)
                        return r;
                if (r < 0)
                        continue;

                if (uid != 0) {
                        uid_t cg_uid;

                        r = cg_path_get_owner_uid(message.path, &cg_uid);
                        if (r < 0) {
                                log_debug_errno(r, "Failed to get cgroup %s owner uid: %m", message.path);
                                continue;
                        }

                        /* Let's not be lenient for permission errors and skip processing if we receive an
                         * update for a cgroup that doesn't belong to the user. */
                        if (uid != cg_uid)
                                return log_error_errno(SYNTHETIC_ERRNO(EPERM),
                                                       "cgroup path owner UID does not match sender uid "
                                                       "(" UID_FMT " != " UID_FMT ")", uid, cg_uid);
                }

                monitor_hm = streq(message.property, "ManagedOOMSwap") ?
                                m->monitored_swap_cgroup_contexts : m->monitored_mem_pressure_cgroup_contexts;

                if (message.mode == MANAGED_OOM_AUTO) {
                        (void) oomd_cgroup_context_free(hashmap_remove(monitor_hm, empty_to_root(message.path)));
                        continue;
                }

                limit = m->default_mem_pressure_limit;

                if (streq(message.property, "ManagedOOMMemoryPressure") && message.limit > 0) {
                        int permyriad = UINT32_SCALE_TO_PERMYRIAD(message.limit);

                        r = store_loadavg_fixed_point(permyriad / 100LU, permyriad % 100LU, &limit);
                        if (r < 0)
                                continue;
                }

                r = oomd_insert_cgroup_context(NULL, monitor_hm, message.path);
                if (r == -ENOMEM)
                        return r;
                if (r < 0 && r != -EEXIST)
                        log_debug_errno(r, "Failed to insert message, ignoring: %m");

                /* Always update the limit in case it was changed. For non-memory pressure detection the value is
                 * ignored so always updating it here is not a problem. */
                ctx = hashmap_get(monitor_hm, empty_to_root(message.path));
                if (ctx)
                        ctx->mem_pressure_limit = limit;
        }

        return 0;
}

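/* Varlink method handler for io.systemd.oom.ReportManagedOOMCGroups: callers push their ManagedOOM cgroup
 * updates here, and process_managed_oom_message() validates the peer UID against the cgroup ownership. */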
static int process_managed_oom_request(
                Varlink *link,
                JsonVariant *parameters,
                VarlinkMethodFlags flags,
                void *userdata) {
        Manager *m = userdata;
        uid_t uid;
        int r;

        assert(m);

        r = varlink_get_peer_uid(link, &uid);
        if (r < 0)
                return log_error_errno(r, "Failed to get varlink peer uid: %m");

        return process_managed_oom_message(m, uid, parameters);
}

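/* Reply callback for the SubscribeManagedOOMCGroups subscription: each reply carries the current set of
 * ManagedOOM cgroups. On error or end of stream the connection is dropped and later re-established by the
 * timer handlers below. */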
static int process_managed_oom_reply(
                Varlink *link,
                JsonVariant *parameters,
                const char *error_id,
                VarlinkReplyFlags flags,
                void *userdata) {
        Manager *m = userdata;
        uid_t uid;
        int r;

        assert(m);

        if (error_id) {
                r = -EIO;
                log_debug("Error getting ManagedOOM cgroups: %s", error_id);
                goto finish;
        }

        r = varlink_get_peer_uid(link, &uid);
        if (r < 0) {
                log_error_errno(r, "Failed to get varlink peer uid: %m");
                goto finish;
        }

        r = process_managed_oom_message(m, uid, parameters);

finish:
        if (!FLAGS_SET(flags, VARLINK_REPLY_CONTINUES))
                m->varlink_client = varlink_close_unref(link);

        return r;
}

/* Fill 'new_h' with 'path's descendant OomdCGroupContexts. Only include descendant cgroups that are possible
 * candidates for action. That is, only leaf cgroups or cgroups with memory.oom.group set to "1".
 *
 * This function ignores most errors in order to handle cgroups that may have been cleaned up while
 * populating the hashmap.
 *
 * 'new_h' is of the form { key: cgroup paths -> value: OomdCGroupContext } */
static int recursively_get_cgroup_context(Hashmap *new_h, const char *path) {
        _cleanup_free_ char *subpath = NULL;
        _cleanup_closedir_ DIR *d = NULL;
        int r;

        assert(new_h);
        assert(path);

        r = cg_enumerate_subgroups(SYSTEMD_CGROUP_CONTROLLER, path, &d);
        if (r < 0)
                return r;

        r = cg_read_subgroup(d, &subpath);
        if (r < 0)
                return r;
        else if (r == 0) { /* No subgroups? We're a leaf node */
                r = oomd_insert_cgroup_context(NULL, new_h, path);
                if (r == -ENOMEM)
                        return r;
                if (r < 0)
                        log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", path);
                return 0;
        }

        do {
                _cleanup_free_ char *cg_path = NULL;
                bool oom_group;

                cg_path = path_join(empty_to_root(path), subpath);
                if (!cg_path)
                        return -ENOMEM;

                subpath = mfree(subpath);

                r = cg_get_attribute_as_bool("memory", cg_path, "memory.oom.group", &oom_group);
                /* The cgroup might be gone. Skip it as a candidate since we can't get information on it. */
                if (r == -ENOMEM)
                        return r;
                if (r < 0) {
                        log_debug_errno(r, "Failed to read memory.oom.group from %s, ignoring: %m", cg_path);
                        return 0;
                }

                if (oom_group)
                        r = oomd_insert_cgroup_context(NULL, new_h, cg_path);
                else
                        r = recursively_get_cgroup_context(new_h, cg_path);
                if (r == -ENOMEM)
                        return r;
                if (r < 0)
                        log_debug_errno(r, "Failed to insert or recursively get from %s, ignoring: %m", cg_path);
        } while ((r = cg_read_subgroup(d, &subpath)) > 0);

        return 0;
}

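/* Refresh the monitored cgroup contexts by re-reading each cgroup's accounting data into a fresh hashmap,
 * handing the old hashmap to oomd_insert_cgroup_context() so previously tracked state can be carried over. */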
static int update_monitored_cgroup_contexts(Hashmap **monitored_cgroups) {
        _cleanup_hashmap_free_ Hashmap *new_base = NULL;
        OomdCGroupContext *ctx;
        int r;

        assert(monitored_cgroups);

        new_base = hashmap_new(&oomd_cgroup_ctx_hash_ops);
        if (!new_base)
                return -ENOMEM;

        HASHMAP_FOREACH(ctx, *monitored_cgroups) {
                /* Skip most errors since the cgroup we're trying to update might not exist anymore. */
                r = oomd_insert_cgroup_context(*monitored_cgroups, new_base, ctx->path);
                if (r == -ENOMEM)
                        return r;
                if (r < 0 && !IN_SET(r, -EEXIST, -ENOENT))
                        log_debug_errno(r, "Failed to insert context for %s, ignoring: %m", ctx->path);
        }

        hashmap_free(*monitored_cgroups);
        *monitored_cgroups = TAKE_PTR(new_base);

        return 0;
}

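/* Build the kill-candidate hashmap: for every monitored cgroup, collect its descendant contexts that are
 * eligible for action (see recursively_get_cgroup_context() above) into 'ret_candidates'. */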
static int get_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **ret_candidates) {
        _cleanup_hashmap_free_ Hashmap *candidates = NULL;
        OomdCGroupContext *ctx;
        int r;

        assert(monitored_cgroups);
        assert(ret_candidates);

        candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops);
        if (!candidates)
                return -ENOMEM;

        HASHMAP_FOREACH(ctx, monitored_cgroups) {
                r = recursively_get_cgroup_context(candidates, ctx->path);
                if (r == -ENOMEM)
                        return r;
                if (r < 0)
                        log_debug_errno(r, "Failed to recursively get contexts for %s, ignoring: %m", ctx->path);
        }

        *ret_candidates = TAKE_PTR(candidates);

        return 0;
}

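/* Like get_monitored_cgroup_contexts_candidates(), but replaces an existing candidate hashmap in place and
 * copies state from the old candidates into the new ones so per-interval deltas remain usable. */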
static int update_monitored_cgroup_contexts_candidates(Hashmap *monitored_cgroups, Hashmap **candidates) {
        _cleanup_hashmap_free_ Hashmap *new_candidates = NULL;
        int r;

        assert(monitored_cgroups);
        assert(candidates);
        assert(*candidates);

        r = get_monitored_cgroup_contexts_candidates(monitored_cgroups, &new_candidates);
        if (r < 0)
                return log_debug_errno(r, "Failed to get candidate contexts: %m");

        oomd_update_cgroup_contexts_between_hashmaps(*candidates, new_candidates);

        hashmap_free(*candidates);
        *candidates = TAKE_PTR(new_candidates);

        return 0;
}

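/* Connect to the system manager's ManagedOOM varlink socket and subscribe to cgroup updates; replies are
 * delivered asynchronously to process_managed_oom_reply() via the event loop. */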
static int acquire_managed_oom_connect(Manager *m) {
        _cleanup_(varlink_close_unrefp) Varlink *link = NULL;
        int r;

        assert(m);
        assert(m->event);

        r = varlink_connect_address(&link, VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM);
        if (r < 0)
                return log_error_errno(r, "Failed to connect to " VARLINK_ADDR_PATH_MANAGED_OOM_SYSTEM ": %m");

        (void) varlink_set_userdata(link, m);
        (void) varlink_set_description(link, "oomd");
        (void) varlink_set_relative_timeout(link, USEC_INFINITY);

        r = varlink_attach_event(link, m->event, SD_EVENT_PRIORITY_NORMAL);
        if (r < 0)
                return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");

        r = varlink_bind_reply(link, process_managed_oom_reply);
        if (r < 0)
                return log_error_errno(r, "Failed to bind reply callback: %m");

        r = varlink_observe(link, "io.systemd.ManagedOOM.SubscribeManagedOOMCGroups", NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to observe varlink call: %m");

        m->varlink_client = TAKE_PTR(link);
        return 0;
}

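/* Periodic timer handler for swap monitoring: refreshes the system context and, when both available memory
 * and free swap fall below the configured limit, kills a candidate cgroup based on its swap usage. */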
static int monitor_swap_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
        Manager *m = userdata;
        usec_t usec_now;
        int r;

        assert(s);
        assert(userdata);

        /* Reset timer */
        r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
        if (r < 0)
                return log_error_errno(r, "Failed to reset event timer: %m");

        r = sd_event_source_set_time_relative(s, SWAP_INTERVAL_USEC);
        if (r < 0)
                return log_error_errno(r, "Failed to set relative time for timer: %m");

        /* Reconnect if our connection dropped */
        if (!m->varlink_client) {
                r = acquire_managed_oom_connect(m);
                if (r < 0)
                        return log_error_errno(r, "Failed to acquire varlink connection: %m");
        }

        /* We still try to acquire system information for oomctl even if no units want swap monitoring */
        r = oomd_system_context_acquire("/proc/meminfo", &m->system_context);
        /* If there are no units depending on swap actions, the only error we exit on is ENOMEM. */
        if (r == -ENOMEM || (r < 0 && !hashmap_isempty(m->monitored_swap_cgroup_contexts)))
                return log_error_errno(r, "Failed to acquire system context: %m");

        /* Return early if nothing is requesting swap monitoring */
        if (hashmap_isempty(m->monitored_swap_cgroup_contexts))
                return 0;

        /* Note that m->monitored_swap_cgroup_contexts does not need to be updated every interval because only the
         * system context is used for deciding whether the swap threshold is hit. m->monitored_swap_cgroup_contexts
         * is only used to decide which cgroups to kill (and even then only the resource usages of its descendant
         * nodes are the ones that matter). */

        /* Check amount of memory available and swap free so we don't free up swap when memory is still available. */
        if (oomd_mem_available_below(&m->system_context, 10000 - m->swap_used_limit_permyriad) &&
            oomd_swap_free_below(&m->system_context, 10000 - m->swap_used_limit_permyriad)) {
                _cleanup_hashmap_free_ Hashmap *candidates = NULL;
                _cleanup_free_ char *selected = NULL;
                uint64_t threshold;

                log_debug("Memory used (%"PRIu64") / total (%"PRIu64") and "
                          "swap used (%"PRIu64") / total (%"PRIu64") is more than " PERMYRIAD_AS_PERCENT_FORMAT_STR,
                          m->system_context.mem_used, m->system_context.mem_total,
                          m->system_context.swap_used, m->system_context.swap_total,
                          PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));

                r = get_monitored_cgroup_contexts_candidates(m->monitored_swap_cgroup_contexts, &candidates);
                if (r == -ENOMEM)
                        return log_oom();
                if (r < 0)
                        log_debug_errno(r, "Failed to get monitored swap cgroup candidates, ignoring: %m");

                threshold = m->system_context.swap_total * THRESHOLD_SWAP_USED_PERCENT / 100;
                r = oomd_kill_by_swap_usage(candidates, threshold, m->dry_run, &selected);
                if (r == -ENOMEM)
                        return log_oom();
                if (r < 0)
                        log_notice_errno(r, "Failed to kill any cgroup(s) based on swap: %m");
                else {
                        if (selected && r > 0)
                                log_notice("Killed %s due to memory used (%"PRIu64") / total (%"PRIu64") and "
                                           "swap used (%"PRIu64") / total (%"PRIu64") being more than "
                                           PERMYRIAD_AS_PERCENT_FORMAT_STR,
                                           selected,
                                           m->system_context.mem_used, m->system_context.mem_total,
                                           m->system_context.swap_used, m->system_context.swap_total,
                                           PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad));
                        return 0;
                }
        }

        return 0;
}

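/* _cleanup_ helper used by the memory pressure handler: clears the candidate hashmap unless the handler
 * explicitly disarmed it by setting the pointer to NULL after refreshing the candidate data. */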
static void clear_candidate_hashmapp(Manager **m) {
        if (*m)
                hashmap_clear((*m)->monitored_mem_pressure_cgroup_contexts_candidates);
}

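/* Periodic timer handler for memory pressure monitoring: checks each monitored cgroup's pressure data
 * against its limit and, if the limit has been exceeded for long enough and there is recent reclaim
 * activity, kills one candidate cgroup under it based on its pgscan rate. */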
static int monitor_memory_pressure_contexts_handler(sd_event_source *s, uint64_t usec, void *userdata) {
        /* Don't want to use stale candidate data. Setting this will clear the candidate hashmap on return unless we
         * update the candidate data (in which case clear_candidates will be NULL). */
        _unused_ _cleanup_(clear_candidate_hashmapp) Manager *clear_candidates = userdata;
        _cleanup_set_free_ Set *targets = NULL;
        bool in_post_action_delay = false;
        Manager *m = userdata;
        usec_t usec_now;
        int r;

        assert(s);
        assert(userdata);

        /* Reset timer */
        r = sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &usec_now);
        if (r < 0)
                return log_error_errno(r, "Failed to reset event timer: %m");

        r = sd_event_source_set_time_relative(s, MEM_PRESSURE_INTERVAL_USEC);
        if (r < 0)
                return log_error_errno(r, "Failed to set relative time for timer: %m");

        /* Reconnect if our connection dropped */
        if (!m->varlink_client) {
                r = acquire_managed_oom_connect(m);
                if (r < 0)
                        return log_error_errno(r, "Failed to acquire varlink connection: %m");
        }

        /* Return early if nothing is requesting memory pressure monitoring */
        if (hashmap_isempty(m->monitored_mem_pressure_cgroup_contexts))
                return 0;

        /* Update the cgroups used for detection/action */
        r = update_monitored_cgroup_contexts(&m->monitored_mem_pressure_cgroup_contexts);
        if (r == -ENOMEM)
                return log_oom();
        if (r < 0)
                log_debug_errno(r, "Failed to update monitored memory pressure cgroup contexts, ignoring: %m");

        /* Since pressure counters are lagging, we need to wait a bit after a kill to ensure we don't read stale
         * values and go on a kill storm. */
        if (m->mem_pressure_post_action_delay_start > 0) {
                if (m->mem_pressure_post_action_delay_start + POST_ACTION_DELAY_USEC > usec_now)
                        in_post_action_delay = true;
                else
                        m->mem_pressure_post_action_delay_start = 0;
        }

        r = oomd_pressure_above(m->monitored_mem_pressure_cgroup_contexts, m->default_mem_pressure_duration_usec, &targets);
        if (r == -ENOMEM)
                return log_oom();
        if (r < 0)
                log_debug_errno(r, "Failed to check if memory pressure exceeded limits, ignoring: %m");
        else if (r == 1 && !in_post_action_delay) {
                OomdCGroupContext *t;
                SET_FOREACH(t, targets) {
                        _cleanup_free_ char *selected = NULL;

                        /* Check if there was reclaim activity in the given interval. The concern is the following case:
                         * Pressure climbed, a lot of high-frequency pages were reclaimed, and we killed the offending
                         * cgroup. Even after this, well-behaved processes will fault in recently resident pages and
                         * this will cause pressure to remain high. Thus if there isn't any reclaim pressure, no need
                         * to kill something (it won't help anyways). */
                        if ((now(CLOCK_MONOTONIC) - t->last_had_mem_reclaim) > RECLAIM_DURATION_USEC)
                                continue;

                        log_debug("Memory pressure for %s is %lu.%02lu%% > %lu.%02lu%% for > %s with reclaim activity",
                                  t->path,
                                  LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10),
                                  LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit),
                                  FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC));

                        r = update_monitored_cgroup_contexts_candidates(
                                        m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
                        if (r == -ENOMEM)
                                return log_oom();
                        if (r < 0)
                                log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
                        else
                                clear_candidates = NULL;

                        r = oomd_kill_by_pgscan_rate(m->monitored_mem_pressure_cgroup_contexts_candidates, t->path, m->dry_run, &selected);
                        if (r == -ENOMEM)
                                return log_oom();
                        if (r < 0)
                                log_notice_errno(r, "Failed to kill any cgroup(s) under %s based on pressure: %m", t->path);
                        else {
                                /* Don't act on all the high pressure cgroups at once; return as soon as we kill one.
                                 * If r == 0 then it means there were no eligible candidates, the candidate cgroup
                                 * disappeared, or the candidate cgroup had no processes by the time we tried to kill
                                 * it. In any of these cases, go through the event loop again and select a new
                                 * candidate if pressure is still high. */
                                m->mem_pressure_post_action_delay_start = usec_now;
                                if (selected && r > 0)
                                        log_notice("Killed %s due to memory pressure for %s being %lu.%02lu%% > %lu.%02lu%%"
                                                   " for > %s with reclaim activity",
                                                   selected, t->path,
                                                   LOADAVG_INT_SIDE(t->memory_pressure.avg10), LOADAVG_DECIMAL_SIDE(t->memory_pressure.avg10),
                                                   LOADAVG_INT_SIDE(t->mem_pressure_limit), LOADAVG_DECIMAL_SIDE(t->mem_pressure_limit),
                                                   FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC));
                                return 0;
                        }
                }
        } else {
                /* If any monitored cgroup is over its pressure limit, get all the kill candidates for every
                 * monitored cgroup. This saves CPU cycles from doing it every interval by only doing it when a kill
                 * might happen.
                 * Candidate cgroup data will continue to get updated during the post-action delay period in case
                 * pressure continues to be high after a kill. */
                OomdCGroupContext *c;
                HASHMAP_FOREACH(c, m->monitored_mem_pressure_cgroup_contexts) {
                        if (c->mem_pressure_limit_hit_start == 0)
                                continue;

                        r = update_monitored_cgroup_contexts_candidates(
                                        m->monitored_mem_pressure_cgroup_contexts, &m->monitored_mem_pressure_cgroup_contexts_candidates);
                        if (r == -ENOMEM)
                                return log_oom();
                        if (r < 0)
                                log_debug_errno(r, "Failed to update monitored memory pressure candidate cgroup contexts, ignoring: %m");
                        else {
                                clear_candidates = NULL;
                                break;
                        }
                }
        }

        return 0;
}

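/* Arm the recurring swap monitoring timer; the handler re-arms itself every SWAP_INTERVAL_USEC. */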
static int monitor_swap_contexts(Manager *m) {
        _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
        int r;

        assert(m);
        assert(m->event);

        r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_swap_contexts_handler, m);
        if (r < 0)
                return r;

        r = sd_event_source_set_exit_on_failure(s, true);
        if (r < 0)
                return r;

        r = sd_event_source_set_enabled(s, SD_EVENT_ON);
        if (r < 0)
                return r;

        (void) sd_event_source_set_description(s, "oomd-swap-timer");

        m->swap_context_event_source = TAKE_PTR(s);
        return 0;
}

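/* Arm the recurring memory pressure monitoring timer; the handler re-arms itself every MEM_PRESSURE_INTERVAL_USEC. */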
static int monitor_memory_pressure_contexts(Manager *m) {
        _cleanup_(sd_event_source_unrefp) sd_event_source *s = NULL;
        int r;

        assert(m);
        assert(m->event);

        r = sd_event_add_time(m->event, &s, CLOCK_MONOTONIC, 0, 0, monitor_memory_pressure_contexts_handler, m);
        if (r < 0)
                return r;

        r = sd_event_source_set_exit_on_failure(s, true);
        if (r < 0)
                return r;

        r = sd_event_source_set_enabled(s, SD_EVENT_ON);
        if (r < 0)
                return r;

        (void) sd_event_source_set_description(s, "oomd-memory-pressure-timer");

        m->mem_pressure_context_event_source = TAKE_PTR(s);
        return 0;
}

Manager* manager_free(Manager *m) {
        assert(m);

        varlink_server_unref(m->varlink_server);
        varlink_close_unref(m->varlink_client);
        sd_event_source_unref(m->swap_context_event_source);
        sd_event_source_unref(m->mem_pressure_context_event_source);
        sd_event_unref(m->event);

        bus_verify_polkit_async_registry_free(m->polkit_registry);
        sd_bus_flush_close_unref(m->bus);

        hashmap_free(m->monitored_swap_cgroup_contexts);
        hashmap_free(m->monitored_mem_pressure_cgroup_contexts);
        hashmap_free(m->monitored_mem_pressure_cgroup_contexts_candidates);

        return mfree(m);
}

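/* Allocate a Manager, set up its default event loop with signal handling and watchdog support, and create
 * the (initially empty) monitoring hashmaps. */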
int manager_new(Manager **ret) {
        _cleanup_(manager_freep) Manager *m = NULL;
        int r;

        assert(ret);

        m = new0(Manager, 1);
        if (!m)
                return -ENOMEM;

        r = sd_event_default(&m->event);
        if (r < 0)
                return r;

        (void) sd_event_set_watchdog(m->event, true);

        r = sd_event_add_signal(m->event, NULL, SIGINT, NULL, NULL);
        if (r < 0)
                return r;

        r = sd_event_add_signal(m->event, NULL, SIGTERM, NULL, NULL);
        if (r < 0)
                return r;

        m->monitored_swap_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
        if (!m->monitored_swap_cgroup_contexts)
                return -ENOMEM;

        m->monitored_mem_pressure_cgroup_contexts = hashmap_new(&oomd_cgroup_ctx_hash_ops);
        if (!m->monitored_mem_pressure_cgroup_contexts)
                return -ENOMEM;

        m->monitored_mem_pressure_cgroup_contexts_candidates = hashmap_new(&oomd_cgroup_ctx_hash_ops);
        if (!m->monitored_mem_pressure_cgroup_contexts_candidates)
                return -ENOMEM;

        *ret = TAKE_PTR(m);
        return 0;
}

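/* Connect to the system bus, publish the manager D-Bus object and the log control API, and request the
 * org.freedesktop.oom1 well-known name. */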
static int manager_connect_bus(Manager *m) {
        int r;

        assert(m);
        assert(!m->bus);

        r = bus_open_system_watch_bind_with_description(&m->bus, "bus-api-oom");
        if (r < 0)
                return log_error_errno(r, "Failed to connect to bus: %m");

        r = bus_add_implementation(m->bus, &manager_object, m);
        if (r < 0)
                return r;

        r = bus_log_control_api_register(m->bus);
        if (r < 0)
                return r;

        r = sd_bus_request_name_async(m->bus, NULL, "org.freedesktop.oom1", 0, NULL, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to request name: %m");

        r = sd_bus_attach_event(m->bus, m->event, 0);
        if (r < 0)
                return log_error_errno(r, "Failed to attach bus to event loop: %m");

        return 0;
}

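/* Set up the varlink server on which ManagedOOM cgroups are reported to us, either binding the well-known
 * socket path or reusing a socket fd passed in (e.g. via socket activation). */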
static int manager_varlink_init(Manager *m, int fd) {
        _cleanup_(varlink_server_unrefp) VarlinkServer *s = NULL;
        int r;

        assert(m);
        assert(!m->varlink_server);

        r = varlink_server_new(&s, VARLINK_SERVER_ACCOUNT_UID|VARLINK_SERVER_INHERIT_USERDATA);
        if (r < 0)
                return log_error_errno(r, "Failed to allocate varlink server object: %m");

        varlink_server_set_userdata(s, m);

        r = varlink_server_bind_method(s, "io.systemd.oom.ReportManagedOOMCGroups", process_managed_oom_request);
        if (r < 0)
                return log_error_errno(r, "Failed to register varlink method: %m");

        if (fd < 0)
                r = varlink_server_listen_address(s, VARLINK_ADDR_PATH_MANAGED_OOM_USER, 0666);
        else
                r = varlink_server_listen_fd(s, fd);
        if (r < 0)
                return log_error_errno(r, "Failed to bind to varlink socket: %m");

        r = varlink_server_attach_event(s, m->event, SD_EVENT_PRIORITY_NORMAL);
        if (r < 0)
                return log_error_errno(r, "Failed to attach varlink connection to event loop: %m");

        log_debug("Initialized systemd-oomd varlink server");

        m->varlink_server = TAKE_PTR(s);
        return 0;
}

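/* Apply the configured limits (falling back to defaults where unset), then bring up the bus connection,
 * the ManagedOOM varlink client and server, and the two monitoring timers. */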
int manager_start(
                Manager *m,
                bool dry_run,
                int swap_used_limit_permyriad,
                int mem_pressure_limit_permyriad,
                usec_t mem_pressure_usec,
                int fd) {

        unsigned long l, f;
        int r;

        assert(m);

        m->dry_run = dry_run;

        m->swap_used_limit_permyriad = swap_used_limit_permyriad >= 0 ? swap_used_limit_permyriad : DEFAULT_SWAP_USED_LIMIT_PERCENT * 100;
        assert(m->swap_used_limit_permyriad <= 10000);

        if (mem_pressure_limit_permyriad >= 0) {
                assert(mem_pressure_limit_permyriad <= 10000);

                l = mem_pressure_limit_permyriad / 100;
                f = mem_pressure_limit_permyriad % 100;
        } else {
                l = DEFAULT_MEM_PRESSURE_LIMIT_PERCENT;
                f = 0;
        }
        r = store_loadavg_fixed_point(l, f, &m->default_mem_pressure_limit);
        if (r < 0)
                return r;

        m->default_mem_pressure_duration_usec = mem_pressure_usec ?: DEFAULT_MEM_PRESSURE_DURATION_USEC;

        r = manager_connect_bus(m);
        if (r < 0)
                return r;

        r = acquire_managed_oom_connect(m);
        if (r < 0)
                return r;

        r = manager_varlink_init(m, fd);
        if (r < 0)
                return r;

        r = monitor_memory_pressure_contexts(m);
        if (r < 0)
                return r;

        r = monitor_swap_contexts(m);
        if (r < 0)
                return r;

        return 0;
}

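/* Render the current configuration, system context and monitored cgroup contexts as a human-readable
 * string, for consumption by the manager's D-Bus dump method (as shown by oomctl). */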
int manager_get_dump_string(Manager *m, char **ret) {
        _cleanup_free_ char *dump = NULL;
        _cleanup_fclose_ FILE *f = NULL;
        OomdCGroupContext *c;
        size_t size;
        char *key;
        int r;

        assert(m);
        assert(ret);

        f = open_memstream_unlocked(&dump, &size);
        if (!f)
                return -errno;

        fprintf(f,
                "Dry Run: %s\n"
                "Swap Used Limit: " PERMYRIAD_AS_PERCENT_FORMAT_STR "\n"
                "Default Memory Pressure Limit: %lu.%02lu%%\n"
                "Default Memory Pressure Duration: %s\n"
                "System Context:\n",
                yes_no(m->dry_run),
                PERMYRIAD_AS_PERCENT_FORMAT_VAL(m->swap_used_limit_permyriad),
                LOADAVG_INT_SIDE(m->default_mem_pressure_limit), LOADAVG_DECIMAL_SIDE(m->default_mem_pressure_limit),
                FORMAT_TIMESPAN(m->default_mem_pressure_duration_usec, USEC_PER_SEC));
        oomd_dump_system_context(&m->system_context, f, "\t");

        fprintf(f, "Swap Monitored CGroups:\n");
        HASHMAP_FOREACH_KEY(c, key, m->monitored_swap_cgroup_contexts)
                oomd_dump_swap_cgroup_context(c, f, "\t");

        fprintf(f, "Memory Pressure Monitored CGroups:\n");
        HASHMAP_FOREACH_KEY(c, key, m->monitored_mem_pressure_cgroup_contexts)
                oomd_dump_memory_pressure_cgroup_context(c, f, "\t");

        r = fflush_and_check(f);
        if (r < 0)
                return r;

        f = safe_fclose(f);

        *ret = TAKE_PTR(dump);
        return 0;
}