1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <sys/epoll.h>
4 #include <sys/timerfd.h>
5 #include <sys/wait.h>
6
7 #include "sd-daemon.h"
8 #include "sd-event.h"
9 #include "sd-id128.h"
10
11 #include "alloc-util.h"
12 #include "env-util.h"
13 #include "event-source.h"
14 #include "fd-util.h"
15 #include "fs-util.h"
16 #include "hashmap.h"
17 #include "list.h"
18 #include "macro.h"
19 #include "memory-util.h"
20 #include "missing_syscall.h"
21 #include "prioq.h"
22 #include "process-util.h"
23 #include "set.h"
24 #include "signal-util.h"
25 #include "string-table.h"
26 #include "string-util.h"
27 #include "strxcpyx.h"
28 #include "time-util.h"
29
30 #define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)
31
32 static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
33 /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */
34 return s &&
35 s->type == SOURCE_CHILD &&
36 s->child.pidfd >= 0 &&
37 s->child.options == WEXITED;
38 }
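/* Note: a child source created with WSTOPPED or WCONTINUED never qualifies here, since a pidfd
 * only becomes readable (EPOLLIN) when the process exits; such sources fall back to the
 * SIGCHLD/waitid() path below. */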
39
40 static bool event_source_is_online(sd_event_source *s) {
41 assert(s);
42 return s->enabled != SD_EVENT_OFF && !s->ratelimited;
43 }
44
45 static bool event_source_is_offline(sd_event_source *s) {
46 assert(s);
47 return s->enabled == SD_EVENT_OFF || s->ratelimited;
48 }
49
50 static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
51 [SOURCE_IO] = "io",
52 [SOURCE_TIME_REALTIME] = "realtime",
53 [SOURCE_TIME_BOOTTIME] = "boottime",
54 [SOURCE_TIME_MONOTONIC] = "monotonic",
55 [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
56 [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
57 [SOURCE_SIGNAL] = "signal",
58 [SOURCE_CHILD] = "child",
59 [SOURCE_DEFER] = "defer",
60 [SOURCE_POST] = "post",
61 [SOURCE_EXIT] = "exit",
62 [SOURCE_WATCHDOG] = "watchdog",
63 [SOURCE_INOTIFY] = "inotify",
64 };
65
66 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);
67
68 #define EVENT_SOURCE_IS_TIME(t) \
69 IN_SET((t), \
70 SOURCE_TIME_REALTIME, \
71 SOURCE_TIME_BOOTTIME, \
72 SOURCE_TIME_MONOTONIC, \
73 SOURCE_TIME_REALTIME_ALARM, \
74 SOURCE_TIME_BOOTTIME_ALARM)
75
76 #define EVENT_SOURCE_CAN_RATE_LIMIT(t) \
77 IN_SET((t), \
78 SOURCE_IO, \
79 SOURCE_TIME_REALTIME, \
80 SOURCE_TIME_BOOTTIME, \
81 SOURCE_TIME_MONOTONIC, \
82 SOURCE_TIME_REALTIME_ALARM, \
83 SOURCE_TIME_BOOTTIME_ALARM, \
84 SOURCE_SIGNAL, \
85 SOURCE_DEFER, \
86 SOURCE_INOTIFY)
87
88 /* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
89 * Time sources and ratelimited sources can be passed, so effectively this is the same as the
90 * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
91 #define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)
92
93 struct sd_event {
94 unsigned n_ref;
95
96 int epoll_fd;
97 int watchdog_fd;
98
99 Prioq *pending;
100 Prioq *prepare;
101
102 /* timerfd_create() only supports these five clocks so far. We
103 * can add support for more clocks when the kernel learns to
104 * deal with them, too. */
105 struct clock_data realtime;
106 struct clock_data boottime;
107 struct clock_data monotonic;
108 struct clock_data realtime_alarm;
109 struct clock_data boottime_alarm;
110
111 usec_t perturb;
112
113 sd_event_source **signal_sources; /* indexed by signal number */
114 Hashmap *signal_data; /* indexed by priority */
115
116 Hashmap *child_sources;
117 unsigned n_online_child_sources;
118
119 Set *post_sources;
120
121 Prioq *exit;
122
123 Hashmap *inotify_data; /* indexed by priority */
124
125 /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
126 LIST_HEAD(struct inode_data, inode_data_to_close);
127
128 /* A list of inotify objects that already have events buffered which aren't processed yet */
129 LIST_HEAD(struct inotify_data, inotify_data_buffered);
130
131 pid_t original_pid;
132
133 uint64_t iteration;
134 triple_timestamp timestamp;
135 int state;
136
137 bool exit_requested:1;
138 bool need_process_child:1;
139 bool watchdog:1;
140 bool profile_delays:1;
141
142 int exit_code;
143
144 pid_t tid;
145 sd_event **default_event_ptr;
146
147 usec_t watchdog_last, watchdog_period;
148
149 unsigned n_sources;
150
151 struct epoll_event *event_queue;
152
153 LIST_HEAD(sd_event_source, sources);
154
155 usec_t last_run_usec, last_log_usec;
156 unsigned delays[sizeof(usec_t) * 8];
157 };
158
159 static thread_local sd_event *default_event = NULL;
160
161 static void source_disconnect(sd_event_source *s);
162 static void event_gc_inode_data(sd_event *e, struct inode_data *d);
163
164 static sd_event *event_resolve(sd_event *e) {
165 return e == SD_EVENT_DEFAULT ? default_event : e;
166 }
167
168 static int pending_prioq_compare(const void *a, const void *b) {
169 const sd_event_source *x = a, *y = b;
170 int r;
171
172 assert(x->pending);
173 assert(y->pending);
174
175 /* Enabled ones first */
176 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
177 if (r != 0)
178 return r;
179
180 /* Non rate-limited ones first. */
181 r = CMP(!!x->ratelimited, !!y->ratelimited);
182 if (r != 0)
183 return r;
184
185 /* Lower priority values first */
186 r = CMP(x->priority, y->priority);
187 if (r != 0)
188 return r;
189
190 /* Older entries first */
191 return CMP(x->pending_iteration, y->pending_iteration);
192 }
193
194 static int prepare_prioq_compare(const void *a, const void *b) {
195 const sd_event_source *x = a, *y = b;
196 int r;
197
198 assert(x->prepare);
199 assert(y->prepare);
200
201 /* Enabled ones first */
202 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
203 if (r != 0)
204 return r;
205
206 /* Non rate-limited ones first. */
207 r = CMP(!!x->ratelimited, !!y->ratelimited);
208 if (r != 0)
209 return r;
210
211 /* Move most recently prepared ones last, so that we can stop
212 * preparing as soon as we hit one that has already been
213 * prepared in the current iteration */
214 r = CMP(x->prepare_iteration, y->prepare_iteration);
215 if (r != 0)
216 return r;
217
218 /* Lower priority values first */
219 return CMP(x->priority, y->priority);
220 }
221
222 static usec_t time_event_source_next(const sd_event_source *s) {
223 assert(s);
224
225 /* We have two kinds of event sources that have elapsation times associated with them: the actual
226 * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
227 * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
228 * looking at here. */
229
230 if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
231 assert(s->rate_limit.begin != 0);
232 assert(s->rate_limit.interval != 0);
233 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
234 }
235
236 /* Otherwise this must be a time event source, if not ratelimited */
237 if (EVENT_SOURCE_IS_TIME(s->type))
238 return s->time.next;
239
240 return USEC_INFINITY;
241 }
242
243 static usec_t time_event_source_latest(const sd_event_source *s) {
244 assert(s);
245
246 if (s->ratelimited) { /* For ratelimited stuff the earliest and the latest time shall actually be the
247 * same, as we should avoid adding additional inaccuracy on an inaccuracy time
248 * window */
249 assert(s->rate_limit.begin != 0);
250 assert(s->rate_limit.interval != 0);
251 return usec_add(s->rate_limit.begin, s->rate_limit.interval);
252 }
253
254 /* Must be a time event source, if not ratelimited */
255 if (EVENT_SOURCE_IS_TIME(s->type))
256 return usec_add(s->time.next, s->time.accuracy);
257
258 return USEC_INFINITY;
259 }
260
261 static bool event_source_timer_candidate(const sd_event_source *s) {
262 assert(s);
263
264 /* Returns true for event sources that either are not pending yet (i.e. where it's worth to mark them pending)
265 * or which are currently ratelimited (i.e. where it's worth leaving the ratelimited state) */
266 return !s->pending || s->ratelimited;
267 }
268
269 static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
270 const sd_event_source *x = a, *y = b;
271 int r;
272
273 /* Enabled ones first */
274 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
275 if (r != 0)
276 return r;
277
278 /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
279 r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
280 if (r != 0)
281 return r;
282
283 /* Order by time */
284 return CMP(time_func(x), time_func(y));
285 }
286
287 static int earliest_time_prioq_compare(const void *a, const void *b) {
288 return time_prioq_compare(a, b, time_event_source_next);
289 }
290
291 static int latest_time_prioq_compare(const void *a, const void *b) {
292 return time_prioq_compare(a, b, time_event_source_latest);
293 }
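/* Taken together, the two comparators above keep each clock's "earliest" and "latest" prioqs
 * ordered: the head of "earliest" is the first moment any timer wants to fire, while the head of
 * "latest" is the earliest hard deadline (next + accuracy) among them. The timerfd is armed
 * somewhere inside that window (by the arming code later in this file), which is what allows
 * nearby timers to be coalesced into a single wakeup. */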
294
295 static int exit_prioq_compare(const void *a, const void *b) {
296 const sd_event_source *x = a, *y = b;
297 int r;
298
299 assert(x->type == SOURCE_EXIT);
300 assert(y->type == SOURCE_EXIT);
301
302 /* Enabled ones first */
303 r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
304 if (r != 0)
305 return r;
306
307 /* Lower priority values first */
308 return CMP(x->priority, y->priority);
309 }
310
311 static void free_clock_data(struct clock_data *d) {
312 assert(d);
313 assert(d->wakeup == WAKEUP_CLOCK_DATA);
314
315 safe_close(d->fd);
316 prioq_free(d->earliest);
317 prioq_free(d->latest);
318 }
319
320 static sd_event *event_free(sd_event *e) {
321 sd_event_source *s;
322
323 assert(e);
324
325 while ((s = e->sources)) {
326 assert(s->floating);
327 source_disconnect(s);
328 sd_event_source_unref(s);
329 }
330
331 assert(e->n_sources == 0);
332
333 if (e->default_event_ptr)
334 *(e->default_event_ptr) = NULL;
335
336 safe_close(e->epoll_fd);
337 safe_close(e->watchdog_fd);
338
339 free_clock_data(&e->realtime);
340 free_clock_data(&e->boottime);
341 free_clock_data(&e->monotonic);
342 free_clock_data(&e->realtime_alarm);
343 free_clock_data(&e->boottime_alarm);
344
345 prioq_free(e->pending);
346 prioq_free(e->prepare);
347 prioq_free(e->exit);
348
349 free(e->signal_sources);
350 hashmap_free(e->signal_data);
351
352 hashmap_free(e->inotify_data);
353
354 hashmap_free(e->child_sources);
355 set_free(e->post_sources);
356
357 free(e->event_queue);
358
359 return mfree(e);
360 }
361
362 _public_ int sd_event_new(sd_event** ret) {
363 sd_event *e;
364 int r;
365
366 assert_return(ret, -EINVAL);
367
368 e = new(sd_event, 1);
369 if (!e)
370 return -ENOMEM;
371
372 *e = (sd_event) {
373 .n_ref = 1,
374 .epoll_fd = -1,
375 .watchdog_fd = -1,
376 .realtime.wakeup = WAKEUP_CLOCK_DATA,
377 .realtime.fd = -1,
378 .realtime.next = USEC_INFINITY,
379 .boottime.wakeup = WAKEUP_CLOCK_DATA,
380 .boottime.fd = -1,
381 .boottime.next = USEC_INFINITY,
382 .monotonic.wakeup = WAKEUP_CLOCK_DATA,
383 .monotonic.fd = -1,
384 .monotonic.next = USEC_INFINITY,
385 .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
386 .realtime_alarm.fd = -1,
387 .realtime_alarm.next = USEC_INFINITY,
388 .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
389 .boottime_alarm.fd = -1,
390 .boottime_alarm.next = USEC_INFINITY,
391 .perturb = USEC_INFINITY,
392 .original_pid = getpid_cached(),
393 };
394
395 r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
396 if (r < 0)
397 goto fail;
398
399 e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
400 if (e->epoll_fd < 0) {
401 r = -errno;
402 goto fail;
403 }
404
405 e->epoll_fd = fd_move_above_stdio(e->epoll_fd);
406
407 if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
408 log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 … 2^63 us will be logged every 5s.");
409 e->profile_delays = true;
410 }
411
412 *ret = e;
413 return 0;
414
415 fail:
416 event_free(e);
417 return r;
418 }
419
420 DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event, sd_event, event_free);
421
422 _public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
423 if (s)
424 (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
425 return sd_event_source_unref(s);
426 }
427
428 static bool event_pid_changed(sd_event *e) {
429 assert(e);
430
431 /* We don't support people creating an event loop and keeping
432 * it around over a fork(). Let's complain. */
433
434 return e->original_pid != getpid_cached();
435 }
436
437 static void source_io_unregister(sd_event_source *s) {
438 assert(s);
439 assert(s->type == SOURCE_IO);
440
441 if (event_pid_changed(s->event))
442 return;
443
444 if (!s->io.registered)
445 return;
446
447 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
448 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
449 strna(s->description), event_source_type_to_string(s->type));
450
451 s->io.registered = false;
452 }
453
454 static int source_io_register(
455 sd_event_source *s,
456 int enabled,
457 uint32_t events) {
458
459 assert(s);
460 assert(s->type == SOURCE_IO);
461 assert(enabled != SD_EVENT_OFF);
462
463 struct epoll_event ev = {
464 .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
465 .data.ptr = s,
466 };
467
468 if (epoll_ctl(s->event->epoll_fd,
469 s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
470 s->io.fd, &ev) < 0)
471 return -errno;
472
473 s->io.registered = true;
474
475 return 0;
476 }
477
478 static void source_child_pidfd_unregister(sd_event_source *s) {
479 assert(s);
480 assert(s->type == SOURCE_CHILD);
481
482 if (event_pid_changed(s->event))
483 return;
484
485 if (!s->child.registered)
486 return;
487
488 if (EVENT_SOURCE_WATCH_PIDFD(s))
489 if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
490 log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
491 strna(s->description), event_source_type_to_string(s->type));
492
493 s->child.registered = false;
494 }
495
496 static int source_child_pidfd_register(sd_event_source *s, int enabled) {
497 assert(s);
498 assert(s->type == SOURCE_CHILD);
499 assert(enabled != SD_EVENT_OFF);
500
501 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
502 struct epoll_event ev = {
503 .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
504 .data.ptr = s,
505 };
506
507 if (epoll_ctl(s->event->epoll_fd,
508 s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
509 s->child.pidfd, &ev) < 0)
510 return -errno;
511 }
512
513 s->child.registered = true;
514 return 0;
515 }
516
517 static clockid_t event_source_type_to_clock(EventSourceType t) {
518
519 switch (t) {
520
521 case SOURCE_TIME_REALTIME:
522 return CLOCK_REALTIME;
523
524 case SOURCE_TIME_BOOTTIME:
525 return CLOCK_BOOTTIME;
526
527 case SOURCE_TIME_MONOTONIC:
528 return CLOCK_MONOTONIC;
529
530 case SOURCE_TIME_REALTIME_ALARM:
531 return CLOCK_REALTIME_ALARM;
532
533 case SOURCE_TIME_BOOTTIME_ALARM:
534 return CLOCK_BOOTTIME_ALARM;
535
536 default:
537 return (clockid_t) -1;
538 }
539 }
540
541 static EventSourceType clock_to_event_source_type(clockid_t clock) {
542
543 switch (clock) {
544
545 case CLOCK_REALTIME:
546 return SOURCE_TIME_REALTIME;
547
548 case CLOCK_BOOTTIME:
549 return SOURCE_TIME_BOOTTIME;
550
551 case CLOCK_MONOTONIC:
552 return SOURCE_TIME_MONOTONIC;
553
554 case CLOCK_REALTIME_ALARM:
555 return SOURCE_TIME_REALTIME_ALARM;
556
557 case CLOCK_BOOTTIME_ALARM:
558 return SOURCE_TIME_BOOTTIME_ALARM;
559
560 default:
561 return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
562 }
563 }
564
565 static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
566 assert(e);
567
568 switch (t) {
569
570 case SOURCE_TIME_REALTIME:
571 return &e->realtime;
572
573 case SOURCE_TIME_BOOTTIME:
574 return &e->boottime;
575
576 case SOURCE_TIME_MONOTONIC:
577 return &e->monotonic;
578
579 case SOURCE_TIME_REALTIME_ALARM:
580 return &e->realtime_alarm;
581
582 case SOURCE_TIME_BOOTTIME_ALARM:
583 return &e->boottime_alarm;
584
585 default:
586 return NULL;
587 }
588 }
589
590 static void event_free_signal_data(sd_event *e, struct signal_data *d) {
591 assert(e);
592
593 if (!d)
594 return;
595
596 hashmap_remove(e->signal_data, &d->priority);
597 safe_close(d->fd);
598 free(d);
599 }
600
601 static int event_make_signal_data(
602 sd_event *e,
603 int sig,
604 struct signal_data **ret) {
605
606 struct signal_data *d;
607 bool added = false;
608 sigset_t ss_copy;
609 int64_t priority;
610 int r;
611
612 assert(e);
613
614 if (event_pid_changed(e))
615 return -ECHILD;
616
617 if (e->signal_sources && e->signal_sources[sig])
618 priority = e->signal_sources[sig]->priority;
619 else
620 priority = SD_EVENT_PRIORITY_NORMAL;
621
622 d = hashmap_get(e->signal_data, &priority);
623 if (d) {
624 if (sigismember(&d->sigset, sig) > 0) {
625 if (ret)
626 *ret = d;
627 return 0;
628 }
629 } else {
630 d = new(struct signal_data, 1);
631 if (!d)
632 return -ENOMEM;
633
634 *d = (struct signal_data) {
635 .wakeup = WAKEUP_SIGNAL_DATA,
636 .fd = -1,
637 .priority = priority,
638 };
639
640 r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
641 if (r < 0) {
642 free(d);
643 return r;
644 }
645
646 added = true;
647 }
648
649 ss_copy = d->sigset;
650 assert_se(sigaddset(&ss_copy, sig) >= 0);
651
652 r = signalfd(d->fd, &ss_copy, SFD_NONBLOCK|SFD_CLOEXEC);
653 if (r < 0) {
654 r = -errno;
655 goto fail;
656 }
657
658 d->sigset = ss_copy;
659
660 if (d->fd >= 0) {
661 if (ret)
662 *ret = d;
663 return 0;
664 }
665
666 d->fd = fd_move_above_stdio(r);
667
668 struct epoll_event ev = {
669 .events = EPOLLIN,
670 .data.ptr = d,
671 };
672
673 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
674 r = -errno;
675 goto fail;
676 }
677
678 if (ret)
679 *ret = d;
680
681 return 0;
682
683 fail:
684 if (added)
685 event_free_signal_data(e, d);
686
687 return r;
688 }
689
690 static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
691 assert(e);
692 assert(d);
693
694 /* Turns off the specified signal in the signal data
695 * object. If the signal mask of the object becomes empty
696 * that way, the object is removed. */
697
698 if (sigismember(&d->sigset, sig) == 0)
699 return;
700
701 assert_se(sigdelset(&d->sigset, sig) >= 0);
702
703 if (sigisemptyset(&d->sigset)) {
704 /* If the mask is all-zero we can get rid of the structure */
705 event_free_signal_data(e, d);
706 return;
707 }
708
709 if (event_pid_changed(e))
710 return;
711
712 assert(d->fd >= 0);
713
714 if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
715 log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
716 }
717
718 static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
719 struct signal_data *d;
720 static const int64_t zero_priority = 0;
721
722 assert(e);
723
724 /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
725 * and possibly drop the signalfd for it. */
726
727 if (sig == SIGCHLD &&
728 e->n_online_child_sources > 0)
729 return;
730
731 if (e->signal_sources &&
732 e->signal_sources[sig] &&
733 event_source_is_online(e->signal_sources[sig]))
734 return;
735
736 /*
737 * The specified signal might be enabled in three different queues:
738 *
739 * 1) the one that belongs to the priority passed (if it is non-NULL)
740 * 2) the one that belongs to the priority of the event source of the signal (if there is one)
741 * 3) the 0 priority (to cover the SIGCHLD case)
742 *
743 * Hence, let's remove it from all three here.
744 */
745
746 if (priority) {
747 d = hashmap_get(e->signal_data, priority);
748 if (d)
749 event_unmask_signal_data(e, d, sig);
750 }
751
752 if (e->signal_sources && e->signal_sources[sig]) {
753 d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
754 if (d)
755 event_unmask_signal_data(e, d, sig);
756 }
757
758 d = hashmap_get(e->signal_data, &zero_priority);
759 if (d)
760 event_unmask_signal_data(e, d, sig);
761 }
762
763 static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
764 assert(s);
765
766 /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
767 * they are enabled/disabled or marked pending and such. */
768
769 if (s->pending)
770 prioq_reshuffle(s->event->pending, s, &s->pending_index);
771
772 if (s->prepare)
773 prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
774 }
775
776 static void event_source_time_prioq_reshuffle(sd_event_source *s) {
777 struct clock_data *d;
778
779 assert(s);
780
781 /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
782 * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
783 * properly again. */
784
785 if (s->ratelimited)
786 d = &s->event->monotonic;
787 else if (EVENT_SOURCE_IS_TIME(s->type))
788 assert_se(d = event_get_clock_data(s->event, s->type));
789 else
790 return; /* no-op for an event source which is neither a timer nor ratelimited. */
791
792 prioq_reshuffle(d->earliest, s, &s->earliest_index);
793 prioq_reshuffle(d->latest, s, &s->latest_index);
794 d->needs_rearm = true;
795 }
796
797 static void event_source_time_prioq_remove(
798 sd_event_source *s,
799 struct clock_data *d) {
800
801 assert(s);
802 assert(d);
803
804 prioq_remove(d->earliest, s, &s->earliest_index);
805 prioq_remove(d->latest, s, &s->latest_index);
806 s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
807 d->needs_rearm = true;
808 }
809
810 static void source_disconnect(sd_event_source *s) {
811 sd_event *event;
812
813 assert(s);
814
815 if (!s->event)
816 return;
817
818 assert(s->event->n_sources > 0);
819
820 switch (s->type) {
821
822 case SOURCE_IO:
823 if (s->io.fd >= 0)
824 source_io_unregister(s);
825
826 break;
827
828 case SOURCE_TIME_REALTIME:
829 case SOURCE_TIME_BOOTTIME:
830 case SOURCE_TIME_MONOTONIC:
831 case SOURCE_TIME_REALTIME_ALARM:
832 case SOURCE_TIME_BOOTTIME_ALARM:
833 /* Only remove this event source from the time event source here if it is not ratelimited. If
834 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
835 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */
836
837 if (!s->ratelimited) {
838 struct clock_data *d;
839 assert_se(d = event_get_clock_data(s->event, s->type));
840 event_source_time_prioq_remove(s, d);
841 }
842
843 break;
844
845 case SOURCE_SIGNAL:
846 if (s->signal.sig > 0) {
847
848 if (s->event->signal_sources)
849 s->event->signal_sources[s->signal.sig] = NULL;
850
851 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
852 }
853
854 break;
855
856 case SOURCE_CHILD:
857 if (event_pid_changed(s->event))
858 s->child.process_owned = false;
859
860 if (s->child.pid > 0) {
861 if (event_source_is_online(s)) {
862 assert(s->event->n_online_child_sources > 0);
863 s->event->n_online_child_sources--;
864 }
865
866 (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
867 }
868
869 if (EVENT_SOURCE_WATCH_PIDFD(s))
870 source_child_pidfd_unregister(s);
871 else
872 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
873
874 break;
875
876 case SOURCE_DEFER:
877 /* nothing */
878 break;
879
880 case SOURCE_POST:
881 set_remove(s->event->post_sources, s);
882 break;
883
884 case SOURCE_EXIT:
885 prioq_remove(s->event->exit, s, &s->exit.prioq_index);
886 break;
887
888 case SOURCE_INOTIFY: {
889 struct inode_data *inode_data;
890
891 inode_data = s->inotify.inode_data;
892 if (inode_data) {
893 struct inotify_data *inotify_data;
894 assert_se(inotify_data = inode_data->inotify_data);
895
896 /* Detach this event source from the inode object */
897 LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
898 s->inotify.inode_data = NULL;
899
900 if (s->pending) {
901 assert(inotify_data->n_pending > 0);
902 inotify_data->n_pending--;
903 }
904
905 /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode is
906 * continues to be watched. That's because inotify doesn't really have an API for that: we
907 * can only change watch masks with access to the original inode either by fd or by path. But
908 * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
909 * continuously and keeping the mount busy which we can't really do. We could reconstruct the
910 * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
911 * there), but given the need for open_by_handle_at() which is privileged and not universally
912 * available this would be quite an incomplete solution. Hence we go the other way, leave the
913 * mask set, even if it is not minimized now, and ignore all events we aren't interested in
914 * anymore after reception. Yes, this sucks, but … Linux … */
915
916 /* Maybe release the inode data (and its inotify) */
917 event_gc_inode_data(s->event, inode_data);
918 }
919
920 break;
921 }
922
923 default:
924 assert_not_reached();
925 }
926
927 if (s->pending)
928 prioq_remove(s->event->pending, s, &s->pending_index);
929
930 if (s->prepare)
931 prioq_remove(s->event->prepare, s, &s->prepare_index);
932
933 if (s->ratelimited)
934 event_source_time_prioq_remove(s, &s->event->monotonic);
935
936 event = TAKE_PTR(s->event);
937 LIST_REMOVE(sources, event->sources, s);
938 event->n_sources--;
939
940 /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
941 * pidfd associated with this event source, which we'll do only on source_free(). */
942
943 if (!s->floating)
944 sd_event_unref(event);
945 }
946
947 static sd_event_source* source_free(sd_event_source *s) {
948 assert(s);
949
950 source_disconnect(s);
951
952 if (s->type == SOURCE_IO && s->io.owned)
953 s->io.fd = safe_close(s->io.fd);
954
955 if (s->type == SOURCE_CHILD) {
956 /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */
957
958 if (s->child.process_owned) {
959
960 if (!s->child.exited) {
961 bool sent = false;
962
963 if (s->child.pidfd >= 0) {
964 if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
965 if (errno == ESRCH) /* Already dead */
966 sent = true;
967 else if (!ERRNO_IS_NOT_SUPPORTED(errno))
968 log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
969 s->child.pid);
970 } else
971 sent = true;
972 }
973
974 if (!sent)
975 if (kill(s->child.pid, SIGKILL) < 0)
976 if (errno != ESRCH) /* Already dead */
977 log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
978 s->child.pid);
979 }
980
981 if (!s->child.waited) {
982 siginfo_t si = {};
983
984 /* Reap the child if we can */
985 (void) waitid(P_PID, s->child.pid, &si, WEXITED);
986 }
987 }
988
989 if (s->child.pidfd_owned)
990 s->child.pidfd = safe_close(s->child.pidfd);
991 }
992
993 if (s->destroy_callback)
994 s->destroy_callback(s->userdata);
995
996 free(s->description);
997 return mfree(s);
998 }
999 DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);
1000
1001 static int source_set_pending(sd_event_source *s, bool b) {
1002 int r;
1003
1004 assert(s);
1005 assert(s->type != SOURCE_EXIT);
1006
1007 if (s->pending == b)
1008 return 0;
1009
1010 s->pending = b;
1011
1012 if (b) {
1013 s->pending_iteration = s->event->iteration;
1014
1015 r = prioq_put(s->event->pending, s, &s->pending_index);
1016 if (r < 0) {
1017 s->pending = false;
1018 return r;
1019 }
1020 } else
1021 assert_se(prioq_remove(s->event->pending, s, &s->pending_index));
1022
1023 if (EVENT_SOURCE_IS_TIME(s->type))
1024 event_source_time_prioq_reshuffle(s);
1025
1026 if (s->type == SOURCE_SIGNAL && !b) {
1027 struct signal_data *d;
1028
1029 d = hashmap_get(s->event->signal_data, &s->priority);
1030 if (d && d->current == s)
1031 d->current = NULL;
1032 }
1033
1034 if (s->type == SOURCE_INOTIFY) {
1035
1036 assert(s->inotify.inode_data);
1037 assert(s->inotify.inode_data->inotify_data);
1038
1039 if (b)
1040 s->inotify.inode_data->inotify_data->n_pending ++;
1041 else {
1042 assert(s->inotify.inode_data->inotify_data->n_pending > 0);
1043 s->inotify.inode_data->inotify_data->n_pending --;
1044 }
1045 }
1046
1047 return 1;
1048 }
1049
1050 static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
1051 sd_event_source *s;
1052
1053 assert(e);
1054
1055 s = new(sd_event_source, 1);
1056 if (!s)
1057 return NULL;
1058
1059 *s = (struct sd_event_source) {
1060 .n_ref = 1,
1061 .event = e,
1062 .floating = floating,
1063 .type = type,
1064 .pending_index = PRIOQ_IDX_NULL,
1065 .prepare_index = PRIOQ_IDX_NULL,
1066 };
1067
1068 if (!floating)
1069 sd_event_ref(e);
1070
1071 LIST_PREPEND(sources, e->sources, s);
1072 e->n_sources++;
1073
1074 return s;
1075 }
1076
1077 static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
1078 assert(s);
1079
1080 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1081 }
1082
1083 _public_ int sd_event_add_io(
1084 sd_event *e,
1085 sd_event_source **ret,
1086 int fd,
1087 uint32_t events,
1088 sd_event_io_handler_t callback,
1089 void *userdata) {
1090
1091 _cleanup_(source_freep) sd_event_source *s = NULL;
1092 int r;
1093
1094 assert_return(e, -EINVAL);
1095 assert_return(e = event_resolve(e), -ENOPKG);
1096 assert_return(fd >= 0, -EBADF);
1097 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
1098 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1099 assert_return(!event_pid_changed(e), -ECHILD);
1100
1101 if (!callback)
1102 callback = io_exit_callback;
1103
1104 s = source_new(e, !ret, SOURCE_IO);
1105 if (!s)
1106 return -ENOMEM;
1107
1108 s->wakeup = WAKEUP_EVENT_SOURCE;
1109 s->io.fd = fd;
1110 s->io.events = events;
1111 s->io.callback = callback;
1112 s->userdata = userdata;
1113 s->enabled = SD_EVENT_ON;
1114
1115 r = source_io_register(s, s->enabled, events);
1116 if (r < 0)
1117 return r;
1118
1119 if (ret)
1120 *ret = s;
1121 TAKE_PTR(s);
1122
1123 return 0;
1124 }
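/* Illustrative usage sketch (not part of this translation unit): a minimal caller that watches a
 * file descriptor for input with the API above. The fd, handler name and exit condition are
 * hypothetical; error handling is abbreviated.
 *
 *     static int on_input(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
 *             char buf[256];
 *             ssize_t n = read(fd, buf, sizeof(buf));     // <unistd.h>
 *             if (n <= 0)                                 // EOF or error: stop the loop
 *                     return sd_event_exit(sd_event_source_get_event(s), 0);
 *             return 0;                                   // keep watching
 *     }
 *
 *     int run(int fd) {
 *             _cleanup_(sd_event_unrefp) sd_event *e = NULL;
 *             int r;
 *
 *             r = sd_event_default(&e);
 *             if (r < 0)
 *                     return r;
 *             r = sd_event_add_io(e, NULL, fd, EPOLLIN, on_input, NULL);
 *             if (r < 0)
 *                     return r;
 *             return sd_event_loop(e);
 *     }
 *
 * Passing NULL for the source pointer makes the source "floating", i.e. owned by the event loop
 * itself. */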
1125
1126 static void initialize_perturb(sd_event *e) {
1127 sd_id128_t bootid = {};
1128
1129 /* When we sleep for longer, we try to realign the wakeup to
1130 the same time within each minute/second/250ms, so that
1131 events all across the system can be coalesced into a single
1132 CPU wakeup. However, let's take some system-specific
1133 randomness for this value, so that in a network of systems
1134 with synced clocks timer events are distributed a
1135 bit. Here, we calculate a perturbation usec offset from the
1136 boot ID. */
1137
1138 if (_likely_(e->perturb != USEC_INFINITY))
1139 return;
1140
1141 if (sd_id128_get_boot(&bootid) >= 0)
1142 e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
1143 }
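/* Informal example of the effect: with a per-boot perturb of, say, 0.123456 s, a timer whose
 * permitted wakeup window spans a whole minute gets scheduled at second 0.123456 of that minute,
 * so all event loops on this machine (which share the boot ID) coalesce onto the same wakeup,
 * while machines with a different boot ID pick a different offset. The actual alignment is done
 * by the timer arming code later in this file. */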
1144
1145 static int event_setup_timer_fd(
1146 sd_event *e,
1147 struct clock_data *d,
1148 clockid_t clock) {
1149
1150 assert(e);
1151 assert(d);
1152
1153 if (_likely_(d->fd >= 0))
1154 return 0;
1155
1156 _cleanup_close_ int fd = -1;
1157
1158 fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
1159 if (fd < 0)
1160 return -errno;
1161
1162 fd = fd_move_above_stdio(fd);
1163
1164 struct epoll_event ev = {
1165 .events = EPOLLIN,
1166 .data.ptr = d,
1167 };
1168
1169 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
1170 return -errno;
1171
1172 d->fd = TAKE_FD(fd);
1173 return 0;
1174 }
1175
1176 static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
1177 assert(s);
1178
1179 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1180 }
1181
1182 static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
1183 int r;
1184
1185 assert(d);
1186
1187 if (d->fd < 0) {
1188 r = event_setup_timer_fd(e, d, clock);
1189 if (r < 0)
1190 return r;
1191 }
1192
1193 r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
1194 if (r < 0)
1195 return r;
1196
1197 r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
1198 if (r < 0)
1199 return r;
1200
1201 return 0;
1202 }
1203
1204 static int event_source_time_prioq_put(
1205 sd_event_source *s,
1206 struct clock_data *d) {
1207
1208 int r;
1209
1210 assert(s);
1211 assert(d);
1212 assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
1213
1214 r = prioq_put(d->earliest, s, &s->earliest_index);
1215 if (r < 0)
1216 return r;
1217
1218 r = prioq_put(d->latest, s, &s->latest_index);
1219 if (r < 0) {
1220 assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
1221 s->earliest_index = PRIOQ_IDX_NULL;
1222 return r;
1223 }
1224
1225 d->needs_rearm = true;
1226 return 0;
1227 }
1228
1229 _public_ int sd_event_add_time(
1230 sd_event *e,
1231 sd_event_source **ret,
1232 clockid_t clock,
1233 uint64_t usec,
1234 uint64_t accuracy,
1235 sd_event_time_handler_t callback,
1236 void *userdata) {
1237
1238 EventSourceType type;
1239 _cleanup_(source_freep) sd_event_source *s = NULL;
1240 struct clock_data *d;
1241 int r;
1242
1243 assert_return(e, -EINVAL);
1244 assert_return(e = event_resolve(e), -ENOPKG);
1245 assert_return(accuracy != UINT64_MAX, -EINVAL);
1246 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1247 assert_return(!event_pid_changed(e), -ECHILD);
1248
1249 if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
1250 return -EOPNOTSUPP;
1251
1252 type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
1253 if (type < 0)
1254 return -EOPNOTSUPP;
1255
1256 if (!callback)
1257 callback = time_exit_callback;
1258
1259 assert_se(d = event_get_clock_data(e, type));
1260
1261 r = setup_clock_data(e, d, clock);
1262 if (r < 0)
1263 return r;
1264
1265 s = source_new(e, !ret, type);
1266 if (!s)
1267 return -ENOMEM;
1268
1269 s->time.next = usec;
1270 s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
1271 s->time.callback = callback;
1272 s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
1273 s->userdata = userdata;
1274 s->enabled = SD_EVENT_ONESHOT;
1275
1276 r = event_source_time_prioq_put(s, d);
1277 if (r < 0)
1278 return r;
1279
1280 if (ret)
1281 *ret = s;
1282 TAKE_PTR(s);
1283
1284 return 0;
1285 }
1286
1287 _public_ int sd_event_add_time_relative(
1288 sd_event *e,
1289 sd_event_source **ret,
1290 clockid_t clock,
1291 uint64_t usec,
1292 uint64_t accuracy,
1293 sd_event_time_handler_t callback,
1294 void *userdata) {
1295
1296 usec_t t;
1297 int r;
1298
1299 /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
1300 * checks for overflow. */
1301
1302 r = sd_event_now(e, clock, &t);
1303 if (r < 0)
1304 return r;
1305
1306 if (usec >= USEC_INFINITY - t)
1307 return -EOVERFLOW;
1308
1309 return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
1310 }
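/* Illustrative usage sketch (not part of this translation unit): arm a one-shot timer 5 s from
 * now on CLOCK_MONOTONIC with the default accuracy. The handler name is hypothetical; error
 * handling is abbreviated.
 *
 *     static int on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
 *             log_info("timer elapsed at %" PRIu64 " usec", usec);
 *             return 0;
 *     }
 *
 *     r = sd_event_add_time_relative(e, NULL, CLOCK_MONOTONIC, 5 * USEC_PER_SEC, 0, on_timer, NULL);
 *
 * Timer sources start out as SD_EVENT_ONESHOT (see above); to re-arm, call
 * sd_event_source_set_time() (or its _relative() variant) and
 * sd_event_source_set_enabled(s, SD_EVENT_ONESHOT) from inside the handler. */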
1311
1312 static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
1313 assert(s);
1314
1315 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1316 }
1317
1318 _public_ int sd_event_add_signal(
1319 sd_event *e,
1320 sd_event_source **ret,
1321 int sig,
1322 sd_event_signal_handler_t callback,
1323 void *userdata) {
1324
1325 _cleanup_(source_freep) sd_event_source *s = NULL;
1326 struct signal_data *d;
1327 int r;
1328
1329 assert_return(e, -EINVAL);
1330 assert_return(e = event_resolve(e), -ENOPKG);
1331 assert_return(SIGNAL_VALID(sig), -EINVAL);
1332 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1333 assert_return(!event_pid_changed(e), -ECHILD);
1334
1335 if (!callback)
1336 callback = signal_exit_callback;
1337
1338 r = signal_is_blocked(sig);
1339 if (r < 0)
1340 return r;
1341 if (r == 0)
1342 return -EBUSY;
1343
1344 if (!e->signal_sources) {
1345 e->signal_sources = new0(sd_event_source*, _NSIG);
1346 if (!e->signal_sources)
1347 return -ENOMEM;
1348 } else if (e->signal_sources[sig])
1349 return -EBUSY;
1350
1351 s = source_new(e, !ret, SOURCE_SIGNAL);
1352 if (!s)
1353 return -ENOMEM;
1354
1355 s->signal.sig = sig;
1356 s->signal.callback = callback;
1357 s->userdata = userdata;
1358 s->enabled = SD_EVENT_ON;
1359
1360 e->signal_sources[sig] = s;
1361
1362 r = event_make_signal_data(e, sig, &d);
1363 if (r < 0)
1364 return r;
1365
1366 /* Use the signal name as description for the event source by default */
1367 (void) sd_event_source_set_description(s, signal_to_string(sig));
1368
1369 if (ret)
1370 *ret = s;
1371 TAKE_PTR(s);
1372
1373 return 0;
1374 }
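/* Illustrative usage sketch (not part of this translation unit): terminate the loop cleanly on
 * SIGTERM. The signal must already be blocked in all threads, otherwise -EBUSY is returned above.
 * Error handling is abbreviated.
 *
 *     sigset_t ss;
 *     assert_se(sigemptyset(&ss) >= 0);
 *     assert_se(sigaddset(&ss, SIGTERM) >= 0);
 *     assert_se(sigprocmask(SIG_BLOCK, &ss, NULL) >= 0);
 *
 *     r = sd_event_add_signal(e, NULL, SIGTERM, NULL, NULL);
 *
 * Passing a NULL callback installs signal_exit_callback() above, i.e. receipt of SIGTERM simply
 * exits the event loop, with the userdata pointer (here NULL, hence 0) as exit code. */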
1375
1376 static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
1377 assert(s);
1378
1379 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1380 }
1381
1382 static bool shall_use_pidfd(void) {
1383 /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
1384 return getenv_bool_secure("SYSTEMD_PIDFD") != 0;
1385 }
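/* E.g. running a test with SYSTEMD_PIDFD=0 (or any other false value) in the environment forces
 * the legacy SIGCHLD-based child handling below; anything else, including an unset variable,
 * keeps pidfd support enabled. */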
1386
1387 _public_ int sd_event_add_child(
1388 sd_event *e,
1389 sd_event_source **ret,
1390 pid_t pid,
1391 int options,
1392 sd_event_child_handler_t callback,
1393 void *userdata) {
1394
1395 _cleanup_(source_freep) sd_event_source *s = NULL;
1396 int r;
1397
1398 assert_return(e, -EINVAL);
1399 assert_return(e = event_resolve(e), -ENOPKG);
1400 assert_return(pid > 1, -EINVAL);
1401 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1402 assert_return(options != 0, -EINVAL);
1403 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1404 assert_return(!event_pid_changed(e), -ECHILD);
1405
1406 if (!callback)
1407 callback = child_exit_callback;
1408
1409 if (e->n_online_child_sources == 0) {
1410 /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
1411 * for compatibility with pre-pidfd and because we don't want to reap the child processes
1412 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
1413 * take effect.
1414 *
1415 * (As an optimization we only do this check on the first child event source created.) */
1416 r = signal_is_blocked(SIGCHLD);
1417 if (r < 0)
1418 return r;
1419 if (r == 0)
1420 return -EBUSY;
1421 }
1422
1423 r = hashmap_ensure_allocated(&e->child_sources, NULL);
1424 if (r < 0)
1425 return r;
1426
1427 if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1428 return -EBUSY;
1429
1430 s = source_new(e, !ret, SOURCE_CHILD);
1431 if (!s)
1432 return -ENOMEM;
1433
1434 s->wakeup = WAKEUP_EVENT_SOURCE;
1435 s->child.options = options;
1436 s->child.callback = callback;
1437 s->userdata = userdata;
1438 s->enabled = SD_EVENT_ONESHOT;
1439
1440 /* We always take a pidfd here if we can, even if we wait for anything else than WEXITED, so that we
1441 * pin the PID, and make regular waitid() handling race-free. */
1442
1443 if (shall_use_pidfd()) {
1444 s->child.pidfd = pidfd_open(pid, 0);
1445 if (s->child.pidfd < 0) {
1446 /* Propagate errors unless the syscall is not supported or blocked */
1447 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
1448 return -errno;
1449 } else
1450 s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
1451 } else
1452 s->child.pidfd = -1;
1453
1454 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1455 /* We have a pidfd and we only want to watch for exit */
1456 r = source_child_pidfd_register(s, s->enabled);
1457 if (r < 0)
1458 return r;
1459
1460 } else {
1461 /* We have no pidfd or we shall wait for some other event than WEXITED */
1462 r = event_make_signal_data(e, SIGCHLD, NULL);
1463 if (r < 0)
1464 return r;
1465
1466 e->need_process_child = true;
1467 }
1468
1469 r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1470 if (r < 0)
1471 return r;
1472
1473 /* These must be done after everything succeeds. */
1474 s->child.pid = pid;
1475 e->n_online_child_sources++;
1476
1477 if (ret)
1478 *ret = s;
1479 TAKE_PTR(s);
1480 return 0;
1481 }
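/* Illustrative usage sketch (not part of this translation unit): fork off a child and get
 * notified when it exits. SIGCHLD must be blocked before the child can possibly exit, as checked
 * above; the handler name and the executed binary are hypothetical, error handling is
 * abbreviated.
 *
 *     static int on_child(sd_event_source *s, const siginfo_t *si, void *userdata) {
 *             log_info("child " PID_FMT " exited with status %i", si->si_pid, si->si_status);
 *             return sd_event_exit(sd_event_source_get_event(s), 0);
 *     }
 *
 *     sigset_t ss;
 *     assert_se(sigemptyset(&ss) >= 0);
 *     assert_se(sigaddset(&ss, SIGCHLD) >= 0);
 *     assert_se(sigprocmask(SIG_BLOCK, &ss, NULL) >= 0);
 *
 *     pid_t pid = fork();
 *     if (pid == 0) {
 *             execl("/bin/true", "true", (char*) NULL);
 *             _exit(EXIT_FAILURE);
 *     }
 *
 *     r = sd_event_add_child(e, NULL, pid, WEXITED, on_child, NULL);
 */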
1482
1483 _public_ int sd_event_add_child_pidfd(
1484 sd_event *e,
1485 sd_event_source **ret,
1486 int pidfd,
1487 int options,
1488 sd_event_child_handler_t callback,
1489 void *userdata) {
1490
1491
1492 _cleanup_(source_freep) sd_event_source *s = NULL;
1493 pid_t pid;
1494 int r;
1495
1496 assert_return(e, -EINVAL);
1497 assert_return(e = event_resolve(e), -ENOPKG);
1498 assert_return(pidfd >= 0, -EBADF);
1499 assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
1500 assert_return(options != 0, -EINVAL);
1501 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1502 assert_return(!event_pid_changed(e), -ECHILD);
1503
1504 if (!callback)
1505 callback = child_exit_callback;
1506
1507 if (e->n_online_child_sources == 0) {
1508 r = signal_is_blocked(SIGCHLD);
1509 if (r < 0)
1510 return r;
1511 if (r == 0)
1512 return -EBUSY;
1513 }
1514
1515 r = hashmap_ensure_allocated(&e->child_sources, NULL);
1516 if (r < 0)
1517 return r;
1518
1519 r = pidfd_get_pid(pidfd, &pid);
1520 if (r < 0)
1521 return r;
1522
1523 if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
1524 return -EBUSY;
1525
1526 s = source_new(e, !ret, SOURCE_CHILD);
1527 if (!s)
1528 return -ENOMEM;
1529
1530 s->wakeup = WAKEUP_EVENT_SOURCE;
1531 s->child.pidfd = pidfd;
1532 s->child.pid = pid;
1533 s->child.options = options;
1534 s->child.callback = callback;
1535 s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
1536 s->userdata = userdata;
1537 s->enabled = SD_EVENT_ONESHOT;
1538
1539 r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
1540 if (r < 0)
1541 return r;
1542
1543 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
1544 /* We only want to watch for WEXITED */
1545 r = source_child_pidfd_register(s, s->enabled);
1546 if (r < 0)
1547 return r;
1548 } else {
1549 /* We shall wait for some other event than WEXITED */
1550 r = event_make_signal_data(e, SIGCHLD, NULL);
1551 if (r < 0)
1552 return r;
1553
1554 e->need_process_child = true;
1555 }
1556
1557 e->n_online_child_sources++;
1558
1559 if (ret)
1560 *ret = s;
1561 TAKE_PTR(s);
1562 return 0;
1563 }
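/* Unlike in sd_event_add_child() above, the pidfd passed in here remains owned by the caller
 * (mirroring the IO fd convention), so source_free() will not close it unless ownership is
 * transferred with sd_event_source_set_child_pidfd_own(). */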
1564
1565 static int generic_exit_callback(sd_event_source *s, void *userdata) {
1566 assert(s);
1567
1568 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1569 }
1570
1571 _public_ int sd_event_add_defer(
1572 sd_event *e,
1573 sd_event_source **ret,
1574 sd_event_handler_t callback,
1575 void *userdata) {
1576
1577 _cleanup_(source_freep) sd_event_source *s = NULL;
1578 int r;
1579
1580 assert_return(e, -EINVAL);
1581 assert_return(e = event_resolve(e), -ENOPKG);
1582 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1583 assert_return(!event_pid_changed(e), -ECHILD);
1584
1585 if (!callback)
1586 callback = generic_exit_callback;
1587
1588 s = source_new(e, !ret, SOURCE_DEFER);
1589 if (!s)
1590 return -ENOMEM;
1591
1592 s->defer.callback = callback;
1593 s->userdata = userdata;
1594 s->enabled = SD_EVENT_ONESHOT;
1595
1596 r = source_set_pending(s, true);
1597 if (r < 0)
1598 return r;
1599
1600 if (ret)
1601 *ret = s;
1602 TAKE_PTR(s);
1603
1604 return 0;
1605 }
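/* Illustrative usage sketch (not part of this translation unit): run a function once from the
 * next loop iteration, e.g. to move work out of the current stack frame. The handler name is
 * hypothetical.
 *
 *     static int do_deferred(sd_event_source *s, void *userdata) {
 *             // ... the deferred work ...
 *             return 0;
 *     }
 *
 *     r = sd_event_add_defer(e, NULL, do_deferred, NULL);
 *
 * The source is created as SD_EVENT_ONESHOT and immediately marked pending, so it fires exactly
 * once unless it is re-enabled from within the handler. */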
1606
1607 _public_ int sd_event_add_post(
1608 sd_event *e,
1609 sd_event_source **ret,
1610 sd_event_handler_t callback,
1611 void *userdata) {
1612
1613 _cleanup_(source_freep) sd_event_source *s = NULL;
1614 int r;
1615
1616 assert_return(e, -EINVAL);
1617 assert_return(e = event_resolve(e), -ENOPKG);
1618 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1619 assert_return(!event_pid_changed(e), -ECHILD);
1620
1621 if (!callback)
1622 callback = generic_exit_callback;
1623
1624 s = source_new(e, !ret, SOURCE_POST);
1625 if (!s)
1626 return -ENOMEM;
1627
1628 s->post.callback = callback;
1629 s->userdata = userdata;
1630 s->enabled = SD_EVENT_ON;
1631
1632 r = set_ensure_put(&e->post_sources, NULL, s);
1633 if (r < 0)
1634 return r;
1635 assert(r > 0);
1636
1637 if (ret)
1638 *ret = s;
1639 TAKE_PTR(s);
1640
1641 return 0;
1642 }
1643
1644 _public_ int sd_event_add_exit(
1645 sd_event *e,
1646 sd_event_source **ret,
1647 sd_event_handler_t callback,
1648 void *userdata) {
1649
1650 _cleanup_(source_freep) sd_event_source *s = NULL;
1651 int r;
1652
1653 assert_return(e, -EINVAL);
1654 assert_return(e = event_resolve(e), -ENOPKG);
1655 assert_return(callback, -EINVAL);
1656 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1657 assert_return(!event_pid_changed(e), -ECHILD);
1658
1659 r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1660 if (r < 0)
1661 return r;
1662
1663 s = source_new(e, !ret, SOURCE_EXIT);
1664 if (!s)
1665 return -ENOMEM;
1666
1667 s->exit.callback = callback;
1668 s->userdata = userdata;
1669 s->exit.prioq_index = PRIOQ_IDX_NULL;
1670 s->enabled = SD_EVENT_ONESHOT;
1671
1672 r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1673 if (r < 0)
1674 return r;
1675
1676 if (ret)
1677 *ret = s;
1678 TAKE_PTR(s);
1679
1680 return 0;
1681 }
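/* Exit sources are not dispatched during normal operation: they only run once sd_event_exit()
 * has been requested, in priority order, just before the loop finishes. That makes them a
 * natural place for cleanup work that still needs a functional event loop object. */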
1682
1683 static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
1684 assert(e);
1685
1686 if (!d)
1687 return;
1688
1689 assert(hashmap_isempty(d->inodes));
1690 assert(hashmap_isempty(d->wd));
1691
1692 if (d->buffer_filled > 0)
1693 LIST_REMOVE(buffered, e->inotify_data_buffered, d);
1694
1695 hashmap_free(d->inodes);
1696 hashmap_free(d->wd);
1697
1698 assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
1699
1700 if (d->fd >= 0) {
1701 if (!event_pid_changed(e) &&
1702 epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
1703 log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
1704
1705 safe_close(d->fd);
1706 }
1707 free(d);
1708 }
1709
1710 static int event_make_inotify_data(
1711 sd_event *e,
1712 int64_t priority,
1713 struct inotify_data **ret) {
1714
1715 _cleanup_close_ int fd = -1;
1716 struct inotify_data *d;
1717 int r;
1718
1719 assert(e);
1720
1721 d = hashmap_get(e->inotify_data, &priority);
1722 if (d) {
1723 if (ret)
1724 *ret = d;
1725 return 0;
1726 }
1727
1728 fd = inotify_init1(IN_NONBLOCK|O_CLOEXEC);
1729 if (fd < 0)
1730 return -errno;
1731
1732 fd = fd_move_above_stdio(fd);
1733
1734 d = new(struct inotify_data, 1);
1735 if (!d)
1736 return -ENOMEM;
1737
1738 *d = (struct inotify_data) {
1739 .wakeup = WAKEUP_INOTIFY_DATA,
1740 .fd = TAKE_FD(fd),
1741 .priority = priority,
1742 };
1743
1744 r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
1745 if (r < 0) {
1746 d->fd = safe_close(d->fd);
1747 free(d);
1748 return r;
1749 }
1750
1751 struct epoll_event ev = {
1752 .events = EPOLLIN,
1753 .data.ptr = d,
1754 };
1755
1756 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
1757 r = -errno;
1758 d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
1759 * remove the fd from the epoll first, which we don't want as we couldn't
1760 * add it in the first place. */
1761 event_free_inotify_data(e, d);
1762 return r;
1763 }
1764
1765 if (ret)
1766 *ret = d;
1767
1768 return 1;
1769 }
1770
1771 static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
1772 int r;
1773
1774 assert(x);
1775 assert(y);
1776
1777 r = CMP(x->dev, y->dev);
1778 if (r != 0)
1779 return r;
1780
1781 return CMP(x->ino, y->ino);
1782 }
1783
1784 static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
1785 assert(d);
1786
1787 siphash24_compress(&d->dev, sizeof(d->dev), state);
1788 siphash24_compress(&d->ino, sizeof(d->ino), state);
1789 }
1790
1791 DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
1792
1793 static void event_free_inode_data(
1794 sd_event *e,
1795 struct inode_data *d) {
1796
1797 assert(e);
1798
1799 if (!d)
1800 return;
1801
1802 assert(!d->event_sources);
1803
1804 if (d->fd >= 0) {
1805 LIST_REMOVE(to_close, e->inode_data_to_close, d);
1806 safe_close(d->fd);
1807 }
1808
1809 if (d->inotify_data) {
1810
1811 if (d->wd >= 0) {
1812 if (d->inotify_data->fd >= 0 && !event_pid_changed(e)) {
1813 /* So here's a problem. At the time this runs the watch descriptor might already be
1814 * invalidated, because an IN_IGNORED event might be queued right at the moment we enter
1815 * the syscall. Hence, whenever we get EINVAL, ignore it entirely, since it's a very
1816 * likely case to happen. */
1817
1818 if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
1819 log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
1820 }
1821
1822 assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
1823 }
1824
1825 assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
1826 }
1827
1828 free(d);
1829 }
1830
1831 static void event_gc_inotify_data(
1832 sd_event *e,
1833 struct inotify_data *d) {
1834
1835 assert(e);
1836
1837 /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
1838 * any inode with it anymore, which in turn happens if no event source of this priority is interested
1839 * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
1840 * (under the expectation that the GC is called again once the counter is decremented). */
1841
1842 if (!d)
1843 return;
1844
1845 if (!hashmap_isempty(d->inodes))
1846 return;
1847
1848 if (d->n_busy > 0)
1849 return;
1850
1851 event_free_inotify_data(e, d);
1852 }
1853
1854 static void event_gc_inode_data(
1855 sd_event *e,
1856 struct inode_data *d) {
1857
1858 struct inotify_data *inotify_data;
1859
1860 assert(e);
1861
1862 if (!d)
1863 return;
1864
1865 if (d->event_sources)
1866 return;
1867
1868 inotify_data = d->inotify_data;
1869 event_free_inode_data(e, d);
1870
1871 event_gc_inotify_data(e, inotify_data);
1872 }
1873
1874 static int event_make_inode_data(
1875 sd_event *e,
1876 struct inotify_data *inotify_data,
1877 dev_t dev,
1878 ino_t ino,
1879 struct inode_data **ret) {
1880
1881 struct inode_data *d, key;
1882 int r;
1883
1884 assert(e);
1885 assert(inotify_data);
1886
1887 key = (struct inode_data) {
1888 .ino = ino,
1889 .dev = dev,
1890 };
1891
1892 d = hashmap_get(inotify_data->inodes, &key);
1893 if (d) {
1894 if (ret)
1895 *ret = d;
1896
1897 return 0;
1898 }
1899
1900 r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
1901 if (r < 0)
1902 return r;
1903
1904 d = new(struct inode_data, 1);
1905 if (!d)
1906 return -ENOMEM;
1907
1908 *d = (struct inode_data) {
1909 .dev = dev,
1910 .ino = ino,
1911 .wd = -1,
1912 .fd = -1,
1913 .inotify_data = inotify_data,
1914 };
1915
1916 r = hashmap_put(inotify_data->inodes, d, d);
1917 if (r < 0) {
1918 free(d);
1919 return r;
1920 }
1921
1922 if (ret)
1923 *ret = d;
1924
1925 return 1;
1926 }
1927
1928 static uint32_t inode_data_determine_mask(struct inode_data *d) {
1929 bool excl_unlink = true;
1930 uint32_t combined = 0;
1931
1932 assert(d);
1933
1934 /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
1935 * the IN_EXCL_UNLINK flag is ANDed instead.
1936 *
1937 * Note that we add all sources to the mask here, regardless whether enabled, disabled or oneshot. That's
1938 * because we cannot change the mask anymore after the event source was created once, since the kernel has no
1939 * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
1940 * events we don't care for client-side. */
1941
1942 LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
1943
1944 if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
1945 excl_unlink = false;
1946
1947 combined |= s->inotify.mask;
1948 }
1949
1950 return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
1951 }
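/* Worked example (hypothetical masks): if one source watches this inode with
 * IN_CLOSE_WRITE|IN_EXCL_UNLINK and another with IN_MOVED_TO, the kernel-side mask becomes
 * IN_CLOSE_WRITE|IN_MOVED_TO without IN_EXCL_UNLINK, since only one of the two sources asked for
 * the exclusion. Each source is still woken up only for the events in its own mask; the extra
 * events are filtered out client-side when the queue is processed. */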
1952
1953 static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
1954 uint32_t combined_mask;
1955 int wd, r;
1956
1957 assert(d);
1958 assert(d->fd >= 0);
1959
1960 combined_mask = inode_data_determine_mask(d);
1961
1962 if (d->wd >= 0 && combined_mask == d->combined_mask)
1963 return 0;
1964
1965 r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
1966 if (r < 0)
1967 return r;
1968
1969 wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
1970 if (wd < 0)
1971 return -errno;
1972
1973 if (d->wd < 0) {
1974 r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
1975 if (r < 0) {
1976 (void) inotify_rm_watch(d->inotify_data->fd, wd);
1977 return r;
1978 }
1979
1980 d->wd = wd;
1981
1982 } else if (d->wd != wd) {
1983
1984 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
1985 (void) inotify_rm_watch(d->fd, wd);
1986 return -EINVAL;
1987 }
1988
1989 d->combined_mask = combined_mask;
1990 return 1;
1991 }
1992
1993 static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
1994 assert(s);
1995
1996 return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1997 }
1998
1999 static int event_add_inotify_fd_internal(
2000 sd_event *e,
2001 sd_event_source **ret,
2002 int fd,
2003 bool donate,
2004 uint32_t mask,
2005 sd_event_inotify_handler_t callback,
2006 void *userdata) {
2007
2008 _cleanup_close_ int donated_fd = donate ? fd : -1;
2009 _cleanup_(source_freep) sd_event_source *s = NULL;
2010 struct inotify_data *inotify_data = NULL;
2011 struct inode_data *inode_data = NULL;
2012 struct stat st;
2013 int r;
2014
2015 assert_return(e, -EINVAL);
2016 assert_return(e = event_resolve(e), -ENOPKG);
2017 assert_return(fd >= 0, -EBADF);
2018 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2019 assert_return(!event_pid_changed(e), -ECHILD);
2020
2021 if (!callback)
2022 callback = inotify_exit_callback;
2023
2024 /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
2025 * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD-type operations for you,
2026 * hence there's no point in the caller specifying it on top. */
2027 if (mask & IN_MASK_ADD)
2028 return -EINVAL;
2029
2030 if (fstat(fd, &st) < 0)
2031 return -errno;
2032
2033 s = source_new(e, !ret, SOURCE_INOTIFY);
2034 if (!s)
2035 return -ENOMEM;
2036
2037 s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
2038 s->inotify.mask = mask;
2039 s->inotify.callback = callback;
2040 s->userdata = userdata;
2041
2042 /* Allocate an inotify object for this priority, and an inode object within it */
2043 r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
2044 if (r < 0)
2045 return r;
2046
2047 r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
2048 if (r < 0) {
2049 event_gc_inotify_data(e, inotify_data);
2050 return r;
2051 }
2052
2053 /* Keep the O_PATH fd around until the first iteration of the loop, so that the priority of the event source
2054 * can still be changed until then, for which we need the original inode. */
2055 if (inode_data->fd < 0) {
2056 if (donated_fd >= 0)
2057 inode_data->fd = TAKE_FD(donated_fd);
2058 else {
2059 inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
2060 if (inode_data->fd < 0) {
2061 r = -errno;
2062 event_gc_inode_data(e, inode_data);
2063 return r;
2064 }
2065 }
2066
2067 LIST_PREPEND(to_close, e->inode_data_to_close, inode_data);
2068 }
2069
2070 /* Link our event source to the inode data object */
2071 LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
2072 s->inotify.inode_data = inode_data;
2073
2074 /* Actually realize the watch now */
2075 r = inode_data_realize_watch(e, inode_data);
2076 if (r < 0)
2077 return r;
2078
2079 if (ret)
2080 *ret = s;
2081 TAKE_PTR(s);
2082
2083 return 0;
2084 }
2085
2086 _public_ int sd_event_add_inotify_fd(
2087 sd_event *e,
2088 sd_event_source **ret,
2089 int fd,
2090 uint32_t mask,
2091 sd_event_inotify_handler_t callback,
2092 void *userdata) {
2093
2094 return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
2095 }
2096
2097 _public_ int sd_event_add_inotify(
2098 sd_event *e,
2099 sd_event_source **ret,
2100 const char *path,
2101 uint32_t mask,
2102 sd_event_inotify_handler_t callback,
2103 void *userdata) {
2104
2105 sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
2106 int fd, r;
2107
2108 assert_return(path, -EINVAL);
2109
2110 fd = open(path, O_PATH|O_CLOEXEC|
2111 (mask & IN_ONLYDIR ? O_DIRECTORY : 0)|
2112 (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
2113 if (fd < 0)
2114 return -errno;
2115
2116 r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
2117 if (r < 0)
2118 return r;
2119
2120 (void) sd_event_source_set_description(s, path);
2121
2122 if (ret)
2123 *ret = s;
2124
2125 return r;
2126 }
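/*
 * Illustrative usage sketch (the path and handler below are made up, not part of this file): watching a
 * directory for new files. Passing NULL for the source pointer makes the source "floating", i.e. owned by
 * the event loop.
 *
 *     static int on_inotify(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
 *             log_debug("inotify event with mask 0x%x", ev->mask);
 *             return 0;
 *     }
 *
 *     r = sd_event_add_inotify(e, NULL, "/run/hypothetical-dir", IN_CREATE|IN_MOVED_TO, on_inotify, NULL);
 *     if (r < 0)
 *             return r;
 */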
2127
2128 static sd_event_source* event_source_free(sd_event_source *s) {
2129 if (!s)
2130 return NULL;
2131
2132 /* Here's a special hack: when we are called from a
2133 * dispatch handler we won't free the event source
2134 * immediately, but we will detach the fd from the
2135 * epoll. This way it is safe for the caller to unref
2136 * the event source and immediately close the fd, but
2137 * we still retain a valid event source object after
2138 * the callback. */
2139
2140 if (s->dispatching) {
2141 if (s->type == SOURCE_IO)
2142 source_io_unregister(s);
2143
2144 source_disconnect(s);
2145 } else
2146 source_free(s);
2147
2148 return NULL;
2149 }
2150
2151 DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
2152
2153 _public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
2154 assert_return(s, -EINVAL);
2155 assert_return(!event_pid_changed(s->event), -ECHILD);
2156
2157 return free_and_strdup(&s->description, description);
2158 }
2159
2160 _public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
2161 assert_return(s, -EINVAL);
2162 assert_return(description, -EINVAL);
2163 assert_return(!event_pid_changed(s->event), -ECHILD);
2164
2165 if (!s->description)
2166 return -ENXIO;
2167
2168 *description = s->description;
2169 return 0;
2170 }
2171
2172 _public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
2173 assert_return(s, NULL);
2174
2175 return s->event;
2176 }
2177
2178 _public_ int sd_event_source_get_pending(sd_event_source *s) {
2179 assert_return(s, -EINVAL);
2180 assert_return(s->type != SOURCE_EXIT, -EDOM);
2181 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2182 assert_return(!event_pid_changed(s->event), -ECHILD);
2183
2184 return s->pending;
2185 }
2186
2187 _public_ int sd_event_source_get_io_fd(sd_event_source *s) {
2188 assert_return(s, -EINVAL);
2189 assert_return(s->type == SOURCE_IO, -EDOM);
2190 assert_return(!event_pid_changed(s->event), -ECHILD);
2191
2192 return s->io.fd;
2193 }
2194
2195 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
2196 int r;
2197
2198 assert_return(s, -EINVAL);
2199 assert_return(fd >= 0, -EBADF);
2200 assert_return(s->type == SOURCE_IO, -EDOM);
2201 assert_return(!event_pid_changed(s->event), -ECHILD);
2202
2203 if (s->io.fd == fd)
2204 return 0;
2205
2206 if (event_source_is_offline(s)) {
2207 s->io.fd = fd;
2208 s->io.registered = false;
2209 } else {
2210 int saved_fd;
2211
2212 saved_fd = s->io.fd;
2213 assert(s->io.registered);
2214
2215 s->io.fd = fd;
2216 s->io.registered = false;
2217
2218 r = source_io_register(s, s->enabled, s->io.events);
2219 if (r < 0) {
2220 s->io.fd = saved_fd;
2221 s->io.registered = true;
2222 return r;
2223 }
2224
2225 (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
2226 }
2227
2228 return 0;
2229 }
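/*
 * Illustrative sketch (assumed caller code): swapping the fd of an online IO source, e.g. after
 * reconnecting a socket. The old fd is only removed from epoll once the new one has been registered
 * successfully, so on failure the source keeps watching the old fd. Note that this call does not close
 * the previous fd; the caller remains responsible for it.
 *
 *     new_fd = reconnect_socket();                       // hypothetical helper
 *     r = sd_event_source_set_io_fd(source, new_fd);
 *     if (r < 0)
 *             return r;                                  // still watching the old fd
 *     old_fd = safe_close(old_fd);
 */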
2230
2231 _public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
2232 assert_return(s, -EINVAL);
2233 assert_return(s->type == SOURCE_IO, -EDOM);
2234
2235 return s->io.owned;
2236 }
2237
2238 _public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
2239 assert_return(s, -EINVAL);
2240 assert_return(s->type == SOURCE_IO, -EDOM);
2241
2242 s->io.owned = own;
2243 return 0;
2244 }
2245
2246 _public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
2247 assert_return(s, -EINVAL);
2248 assert_return(events, -EINVAL);
2249 assert_return(s->type == SOURCE_IO, -EDOM);
2250 assert_return(!event_pid_changed(s->event), -ECHILD);
2251
2252 *events = s->io.events;
2253 return 0;
2254 }
2255
2256 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
2257 int r;
2258
2259 assert_return(s, -EINVAL);
2260 assert_return(s->type == SOURCE_IO, -EDOM);
2261 assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
2262 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2263 assert_return(!event_pid_changed(s->event), -ECHILD);
2264
2265 /* edge-triggered updates are never skipped, so we can reset edges */
2266 if (s->io.events == events && !(events & EPOLLET))
2267 return 0;
2268
2269 r = source_set_pending(s, false);
2270 if (r < 0)
2271 return r;
2272
2273 if (event_source_is_online(s)) {
2274 r = source_io_register(s, s->enabled, events);
2275 if (r < 0)
2276 return r;
2277 }
2278
2279 s->io.events = events;
2280
2281 return 0;
2282 }
2283
2284 _public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
2285 assert_return(s, -EINVAL);
2286 assert_return(revents, -EINVAL);
2287 assert_return(s->type == SOURCE_IO, -EDOM);
2288 assert_return(s->pending, -ENODATA);
2289 assert_return(!event_pid_changed(s->event), -ECHILD);
2290
2291 *revents = s->io.revents;
2292 return 0;
2293 }
2294
2295 _public_ int sd_event_source_get_signal(sd_event_source *s) {
2296 assert_return(s, -EINVAL);
2297 assert_return(s->type == SOURCE_SIGNAL, -EDOM);
2298 assert_return(!event_pid_changed(s->event), -ECHILD);
2299
2300 return s->signal.sig;
2301 }
2302
2303 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
2304 assert_return(s, -EINVAL);
2305 assert_return(!event_pid_changed(s->event), -ECHILD);
2306
2307 *priority = s->priority;
2308 return 0;
2309 }
2310
2311 _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
2312 bool rm_inotify = false, rm_inode = false;
2313 struct inotify_data *new_inotify_data = NULL;
2314 struct inode_data *new_inode_data = NULL;
2315 int r;
2316
2317 assert_return(s, -EINVAL);
2318 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2319 assert_return(!event_pid_changed(s->event), -ECHILD);
2320
2321 if (s->priority == priority)
2322 return 0;
2323
2324 if (s->type == SOURCE_INOTIFY) {
2325 struct inode_data *old_inode_data;
2326
2327 assert(s->inotify.inode_data);
2328 old_inode_data = s->inotify.inode_data;
2329
2330 /* We need the original fd to change the priority. If we don't have it, we cannot change the priority
2331 * anymore. Note that we close these fds when entering the next event loop iteration, i.e. for inotify
2332 * events we allow priority changes only until the first following iteration. */
2333 if (old_inode_data->fd < 0)
2334 return -EOPNOTSUPP;
2335
2336 r = event_make_inotify_data(s->event, priority, &new_inotify_data);
2337 if (r < 0)
2338 return r;
2339 rm_inotify = r > 0;
2340
2341 r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
2342 if (r < 0)
2343 goto fail;
2344 rm_inode = r > 0;
2345
2346 if (new_inode_data->fd < 0) {
2347 /* Duplicate the fd for the new inode object if we don't have any yet */
2348 new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
2349 if (new_inode_data->fd < 0) {
2350 r = -errno;
2351 goto fail;
2352 }
2353
2354 LIST_PREPEND(to_close, s->event->inode_data_to_close, new_inode_data);
2355 }
2356
2357 /* Move the event source to the new inode data structure */
2358 LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
2359 LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
2360 s->inotify.inode_data = new_inode_data;
2361
2362 /* Now create the new watch */
2363 r = inode_data_realize_watch(s->event, new_inode_data);
2364 if (r < 0) {
2365 /* Move it back */
2366 LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
2367 LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
2368 s->inotify.inode_data = old_inode_data;
2369 goto fail;
2370 }
2371
2372 s->priority = priority;
2373
2374 event_gc_inode_data(s->event, old_inode_data);
2375
2376 } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
2377 struct signal_data *old, *d;
2378
2379 /* Move us from the signalfd belonging to the old
2380 * priority to the signalfd of the new priority */
2381
2382 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
2383
2384 s->priority = priority;
2385
2386 r = event_make_signal_data(s->event, s->signal.sig, &d);
2387 if (r < 0) {
2388 s->priority = old->priority;
2389 return r;
2390 }
2391
2392 event_unmask_signal_data(s->event, old, s->signal.sig);
2393 } else
2394 s->priority = priority;
2395
2396 event_source_pp_prioq_reshuffle(s);
2397
2398 if (s->type == SOURCE_EXIT)
2399 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2400
2401 return 0;
2402
2403 fail:
2404 if (rm_inode)
2405 event_free_inode_data(s->event, new_inode_data);
2406
2407 if (rm_inotify)
2408 event_free_inotify_data(s->event, new_inotify_data);
2409
2410 return r;
2411 }
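/*
 * Illustrative sketch (assumed caller code): for inotify sources the priority can only be changed while
 * the original O_PATH fd is still around, i.e. before the first event loop iteration after the source was
 * created; later attempts return -EOPNOTSUPP.
 *
 *     r = sd_event_add_inotify(e, &s, "/run/hypothetical-dir", IN_MODIFY, on_inotify, NULL);
 *     if (r < 0)
 *             return r;
 *     r = sd_event_source_set_priority(s, SD_EVENT_PRIORITY_IDLE);   // must happen before the loop runs
 *     if (r < 0)
 *             return r;
 */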
2412
2413 _public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
2414 assert_return(s, -EINVAL);
2415 assert_return(!event_pid_changed(s->event), -ECHILD);
2416
2417 if (ret)
2418 *ret = s->enabled;
2419
2420 return s->enabled != SD_EVENT_OFF;
2421 }
2422
2423 static int event_source_offline(
2424 sd_event_source *s,
2425 int enabled,
2426 bool ratelimited) {
2427
2428 bool was_offline;
2429 int r;
2430
2431 assert(s);
2432 assert(enabled == SD_EVENT_OFF || ratelimited);
2433
2434 /* Unset the pending flag when this event source is disabled */
2435 if (s->enabled != SD_EVENT_OFF &&
2436 enabled == SD_EVENT_OFF &&
2437 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2438 r = source_set_pending(s, false);
2439 if (r < 0)
2440 return r;
2441 }
2442
2443 was_offline = event_source_is_offline(s);
2444 s->enabled = enabled;
2445 s->ratelimited = ratelimited;
2446
2447 switch (s->type) {
2448
2449 case SOURCE_IO:
2450 source_io_unregister(s);
2451 break;
2452
2453 case SOURCE_SIGNAL:
2454 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2455 break;
2456
2457 case SOURCE_CHILD:
2458 if (!was_offline) {
2459 assert(s->event->n_online_child_sources > 0);
2460 s->event->n_online_child_sources--;
2461 }
2462
2463 if (EVENT_SOURCE_WATCH_PIDFD(s))
2464 source_child_pidfd_unregister(s);
2465 else
2466 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2467 break;
2468
2469 case SOURCE_EXIT:
2470 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2471 break;
2472
2473 case SOURCE_TIME_REALTIME:
2474 case SOURCE_TIME_BOOTTIME:
2475 case SOURCE_TIME_MONOTONIC:
2476 case SOURCE_TIME_REALTIME_ALARM:
2477 case SOURCE_TIME_BOOTTIME_ALARM:
2478 case SOURCE_DEFER:
2479 case SOURCE_POST:
2480 case SOURCE_INOTIFY:
2481 break;
2482
2483 default:
2484 assert_not_reached();
2485 }
2486
2487 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
2488 event_source_time_prioq_reshuffle(s);
2489
2490 return 1;
2491 }
2492
2493 static int event_source_online(
2494 sd_event_source *s,
2495 int enabled,
2496 bool ratelimited) {
2497
2498 bool was_online;
2499 int r;
2500
2501 assert(s);
2502 assert(enabled != SD_EVENT_OFF || !ratelimited);
2503
2504 /* Unset the pending flag when this event source is enabled */
2505 if (s->enabled == SD_EVENT_OFF &&
2506 enabled != SD_EVENT_OFF &&
2507 !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2508 r = source_set_pending(s, false);
2509 if (r < 0)
2510 return r;
2511 }
2512
2513 /* Are we really ready for onlining? */
2514 if (enabled == SD_EVENT_OFF || ratelimited) {
2515 /* Nope, we are not ready for onlining, so just update the precise state and exit */
2516 s->enabled = enabled;
2517 s->ratelimited = ratelimited;
2518 return 0;
2519 }
2520
2521 was_online = event_source_is_online(s);
2522
2523 switch (s->type) {
2524 case SOURCE_IO:
2525 r = source_io_register(s, enabled, s->io.events);
2526 if (r < 0)
2527 return r;
2528 break;
2529
2530 case SOURCE_SIGNAL:
2531 r = event_make_signal_data(s->event, s->signal.sig, NULL);
2532 if (r < 0) {
2533 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2534 return r;
2535 }
2536
2537 break;
2538
2539 case SOURCE_CHILD:
2540 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
2541 /* yes, we have pidfd */
2542
2543 r = source_child_pidfd_register(s, enabled);
2544 if (r < 0)
2545 return r;
2546 } else {
2547 /* no pidfd, or something other than WEXITED to watch for */
2548
2549 r = event_make_signal_data(s->event, SIGCHLD, NULL);
2550 if (r < 0) {
2551 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2552 return r;
2553 }
2554 }
2555
2556 if (!was_online)
2557 s->event->n_online_child_sources++;
2558 break;
2559
2560 case SOURCE_TIME_REALTIME:
2561 case SOURCE_TIME_BOOTTIME:
2562 case SOURCE_TIME_MONOTONIC:
2563 case SOURCE_TIME_REALTIME_ALARM:
2564 case SOURCE_TIME_BOOTTIME_ALARM:
2565 case SOURCE_EXIT:
2566 case SOURCE_DEFER:
2567 case SOURCE_POST:
2568 case SOURCE_INOTIFY:
2569 break;
2570
2571 default:
2572 assert_not_reached();
2573 }
2574
2575 s->enabled = enabled;
2576 s->ratelimited = ratelimited;
2577
2578 /* Non-failing operations below */
2579 if (s->type == SOURCE_EXIT)
2580 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2581
2582 /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
2583 event_source_time_prioq_reshuffle(s);
2584
2585 return 1;
2586 }
2587
2588 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
2589 int r;
2590
2591 assert_return(s, -EINVAL);
2592 assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
2593 assert_return(!event_pid_changed(s->event), -ECHILD);
2594
2595 /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
2596 if (s->event->state == SD_EVENT_FINISHED)
2597 return m == SD_EVENT_OFF ? 0 : -ESTALE;
2598
2599 if (s->enabled == m) /* No change? */
2600 return 0;
2601
2602 if (m == SD_EVENT_OFF)
2603 r = event_source_offline(s, m, s->ratelimited);
2604 else {
2605 if (s->enabled != SD_EVENT_OFF) {
2606 /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
2607 * event source is already enabled after all. */
2608 s->enabled = m;
2609 return 0;
2610 }
2611
2612 r = event_source_online(s, m, s->ratelimited);
2613 }
2614 if (r < 0)
2615 return r;
2616
2617 event_source_pp_prioq_reshuffle(s);
2618 return 0;
2619 }
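/*
 * Illustrative sketch (assumed caller code): pausing and resuming a source. Switching between SD_EVENT_ON
 * and SD_EVENT_ONESHOT while the source is already enabled takes the shortcut above and never touches the
 * epoll/signalfd state.
 *
 *     (void) sd_event_source_set_enabled(source, SD_EVENT_OFF);       // pause
 *     ...
 *     r = sd_event_source_set_enabled(source, SD_EVENT_ONESHOT);      // fire once, then auto-disable
 *     if (r < 0)
 *             return r;
 */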
2620
2621 _public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
2622 assert_return(s, -EINVAL);
2623 assert_return(usec, -EINVAL);
2624 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2625 assert_return(!event_pid_changed(s->event), -ECHILD);
2626
2627 *usec = s->time.next;
2628 return 0;
2629 }
2630
2631 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
2632 int r;
2633
2634 assert_return(s, -EINVAL);
2635 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2636 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2637 assert_return(!event_pid_changed(s->event), -ECHILD);
2638
2639 r = source_set_pending(s, false);
2640 if (r < 0)
2641 return r;
2642
2643 s->time.next = usec;
2644
2645 event_source_time_prioq_reshuffle(s);
2646 return 0;
2647 }
2648
2649 _public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
2650 usec_t t;
2651 int r;
2652
2653 assert_return(s, -EINVAL);
2654 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2655
2656 r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
2657 if (r < 0)
2658 return r;
2659
2660 usec = usec_add(t, usec);
2661 if (usec == USEC_INFINITY)
2662 return -EOVERFLOW;
2663
2664 return sd_event_source_set_time(s, usec);
2665 }
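/*
 * Illustrative sketch (assumed caller code): re-arming an existing timer source 5s from now. This is
 * equivalent to querying sd_event_now() on the source's clock, adding the offset with usec_add() and
 * calling sd_event_source_set_time() with the absolute value.
 *
 *     r = sd_event_source_set_time_relative(timer_source, 5 * USEC_PER_SEC);
 *     if (r < 0)
 *             return r;
 *     r = sd_event_source_set_enabled(timer_source, SD_EVENT_ONESHOT);
 *     if (r < 0)
 *             return r;
 */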
2666
2667 _public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
2668 assert_return(s, -EINVAL);
2669 assert_return(usec, -EINVAL);
2670 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2671 assert_return(!event_pid_changed(s->event), -ECHILD);
2672
2673 *usec = s->time.accuracy;
2674 return 0;
2675 }
2676
2677 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
2678 int r;
2679
2680 assert_return(s, -EINVAL);
2681 assert_return(usec != UINT64_MAX, -EINVAL);
2682 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2683 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2684 assert_return(!event_pid_changed(s->event), -ECHILD);
2685
2686 r = source_set_pending(s, false);
2687 if (r < 0)
2688 return r;
2689
2690 if (usec == 0)
2691 usec = DEFAULT_ACCURACY_USEC;
2692
2693 s->time.accuracy = usec;
2694
2695 event_source_time_prioq_reshuffle(s);
2696 return 0;
2697 }
2698
2699 _public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
2700 assert_return(s, -EINVAL);
2701 assert_return(clock, -EINVAL);
2702 assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2703 assert_return(!event_pid_changed(s->event), -ECHILD);
2704
2705 *clock = event_source_type_to_clock(s->type);
2706 return 0;
2707 }
2708
2709 _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
2710 assert_return(s, -EINVAL);
2711 assert_return(pid, -EINVAL);
2712 assert_return(s->type == SOURCE_CHILD, -EDOM);
2713 assert_return(!event_pid_changed(s->event), -ECHILD);
2714
2715 *pid = s->child.pid;
2716 return 0;
2717 }
2718
2719 _public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
2720 assert_return(s, -EINVAL);
2721 assert_return(s->type == SOURCE_CHILD, -EDOM);
2722 assert_return(!event_pid_changed(s->event), -ECHILD);
2723
2724 if (s->child.pidfd < 0)
2725 return -EOPNOTSUPP;
2726
2727 return s->child.pidfd;
2728 }
2729
2730 _public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
2731 assert_return(s, -EINVAL);
2732 assert_return(s->type == SOURCE_CHILD, -EDOM);
2733 assert_return(!event_pid_changed(s->event), -ECHILD);
2734 assert_return(SIGNAL_VALID(sig), -EINVAL);
2735
2736 /* If we have already seen an indication that the process exited, refuse sending a signal early. This way
2737 * we can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
2738 * available. */
2739 if (s->child.exited)
2740 return -ESRCH;
2741
2742 if (s->child.pidfd >= 0) {
2743 siginfo_t copy;
2744
2745 /* pidfd_send_signal() changes the siginfo_t argument. This is weird, hence let's copy the
2746 * structure here. */
2747 if (si)
2748 copy = *si;
2749
2750 if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
2751 /* Let's propagate the error only if the system call is not implemented or prohibited */
2752 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
2753 return -errno;
2754 } else
2755 return 0;
2756 }
2757
2758 /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
2759 * this here. */
2760 if (flags != 0)
2761 return -EOPNOTSUPP;
2762
2763 if (si) {
2764 /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
2765 siginfo_t copy = *si;
2766
2767 if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
2768 return -errno;
2769 } else if (kill(s->child.pid, sig) < 0)
2770 return -errno;
2771
2772 return 0;
2773 }
2774
2775 _public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
2776 assert_return(s, -EINVAL);
2777 assert_return(s->type == SOURCE_CHILD, -EDOM);
2778
2779 if (s->child.pidfd < 0)
2780 return -EOPNOTSUPP;
2781
2782 return s->child.pidfd_owned;
2783 }
2784
2785 _public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
2786 assert_return(s, -EINVAL);
2787 assert_return(s->type == SOURCE_CHILD, -EDOM);
2788
2789 if (s->child.pidfd < 0)
2790 return -EOPNOTSUPP;
2791
2792 s->child.pidfd_owned = own;
2793 return 0;
2794 }
2795
2796 _public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
2797 assert_return(s, -EINVAL);
2798 assert_return(s->type == SOURCE_CHILD, -EDOM);
2799
2800 return s->child.process_owned;
2801 }
2802
2803 _public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
2804 assert_return(s, -EINVAL);
2805 assert_return(s->type == SOURCE_CHILD, -EDOM);
2806
2807 s->child.process_owned = own;
2808 return 0;
2809 }
2810
2811 _public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) {
2812 assert_return(s, -EINVAL);
2813 assert_return(mask, -EINVAL);
2814 assert_return(s->type == SOURCE_INOTIFY, -EDOM);
2815 assert_return(!event_pid_changed(s->event), -ECHILD);
2816
2817 *mask = s->inotify.mask;
2818 return 0;
2819 }
2820
2821 _public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
2822 int r;
2823
2824 assert_return(s, -EINVAL);
2825 assert_return(s->type != SOURCE_EXIT, -EDOM);
2826 assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2827 assert_return(!event_pid_changed(s->event), -ECHILD);
2828
2829 if (s->prepare == callback)
2830 return 0;
2831
2832 if (callback && s->prepare) {
2833 s->prepare = callback;
2834 return 0;
2835 }
2836
2837 r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
2838 if (r < 0)
2839 return r;
2840
2841 s->prepare = callback;
2842
2843 if (callback) {
2844 r = prioq_put(s->event->prepare, s, &s->prepare_index);
2845 if (r < 0)
2846 return r;
2847 } else
2848 prioq_remove(s->event->prepare, s, &s->prepare_index);
2849
2850 return 0;
2851 }
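/*
 * Illustrative sketch (assumed caller code): a prepare callback runs once per event loop iteration, before
 * the loop polls, which makes it a convenient place to recompute a deadline or flush buffered output.
 *
 *     static int my_prepare(sd_event_source *s, void *userdata) {
 *             return sd_event_source_set_time_relative(s, 100 * USEC_PER_MSEC);   // hypothetical policy
 *     }
 *
 *     r = sd_event_source_set_prepare(timer_source, my_prepare);
 *     if (r < 0)
 *             return r;
 */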
2852
2853 _public_ void* sd_event_source_get_userdata(sd_event_source *s) {
2854 assert_return(s, NULL);
2855
2856 return s->userdata;
2857 }
2858
2859 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
2860 void *ret;
2861
2862 assert_return(s, NULL);
2863
2864 ret = s->userdata;
2865 s->userdata = userdata;
2866
2867 return ret;
2868 }
2869
2870 static int event_source_enter_ratelimited(sd_event_source *s) {
2871 int r;
2872
2873 assert(s);
2874
2875 /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, keyed
2876 * by the end of the rate limit time window, much as if it were a timer event source. */
2877
2878 if (s->ratelimited)
2879 return 0; /* Already ratelimited, this is a NOP hence */
2880
2881 /* Make sure we can install a CLOCK_MONOTONIC event further down. */
2882 r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
2883 if (r < 0)
2884 return r;
2885
2886 /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
2887 * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
2888 * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
2889 if (EVENT_SOURCE_IS_TIME(s->type))
2890 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
2891
2892 /* Now, let's add the event source to the monotonic clock instead */
2893 r = event_source_time_prioq_put(s, &s->event->monotonic);
2894 if (r < 0)
2895 goto fail;
2896
2897 /* And let's take the event source officially offline */
2898 r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
2899 if (r < 0) {
2900 event_source_time_prioq_remove(s, &s->event->monotonic);
2901 goto fail;
2902 }
2903
2904 event_source_pp_prioq_reshuffle(s);
2905
2906 log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
2907 return 0;
2908
2909 fail:
2910 /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
2911 * space for it should already be allocated. */
2912 if (EVENT_SOURCE_IS_TIME(s->type))
2913 assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
2914
2915 return r;
2916 }
2917
2918 static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
2919 int r;
2920
2921 assert(s);
2922
2923 if (!s->ratelimited)
2924 return 0;
2925
2926 /* Let's take the event source out of the monotonic prioq first. */
2927 event_source_time_prioq_remove(s, &s->event->monotonic);
2928
2929 /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
2930 if (EVENT_SOURCE_IS_TIME(s->type)) {
2931 r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
2932 if (r < 0)
2933 goto fail;
2934 }
2935
2936 /* Let's try to take it online again. */
2937 r = event_source_online(s, s->enabled, /* ratelimited= */ false);
2938 if (r < 0) {
2939 /* Do something roughly sensible when this failed: undo the two prioq ops above */
2940 if (EVENT_SOURCE_IS_TIME(s->type))
2941 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
2942
2943 goto fail;
2944 }
2945
2946 event_source_pp_prioq_reshuffle(s);
2947 ratelimit_reset(&s->rate_limit);
2948
2949 log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
2950
2951 if (run_callback && s->ratelimit_expire_callback) {
2952 s->dispatching = true;
2953 r = s->ratelimit_expire_callback(s, s->userdata);
2954 s->dispatching = false;
2955
2956 if (r < 0) {
2957 log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
2958 strna(s->description),
2959 event_source_type_to_string(s->type),
2960 s->exit_on_failure ? "exiting" : "disabling");
2961
2962 if (s->exit_on_failure)
2963 (void) sd_event_exit(s->event, r);
2964 }
2965
2966 if (s->n_ref == 0)
2967 source_free(s);
2968 else if (r < 0)
2969 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
2970
2971 return 1;
2972 }
2973
2974 return 0;
2975
2976 fail:
2977 /* Do something somewhat reasonable when we cannot move an event source out of ratelimited mode:
2978 * simply put it back in it, maybe we can then process it more successfully next iteration. */
2979 assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
2980
2981 return r;
2982 }
2983
2984 static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
2985 usec_t c;
2986 assert(e);
2987 assert(a <= b);
2988
2989 if (a <= 0)
2990 return 0;
2991 if (a >= USEC_INFINITY)
2992 return USEC_INFINITY;
2993
2994 if (b <= a + 1)
2995 return a;
2996
2997 initialize_perturb(e);
2998
2999 /*
3000 Find a good time to wake up again between times a and b. We
3001 have two goals here:
3002
3003 a) We want to wake up as seldom as possible, hence prefer
3004 later times over earlier times.
3005
3006 b) But if we have to wake up, then let's make sure to
3007 dispatch as much as possible on the entire system.
3008
3009 We implement this by waking up everywhere at the same time
3010 within any given minute if we can, synchronised via the
3011 perturbation value determined from the boot ID. If we can't,
3012 then we try to find the same spot in every 10s, then 1s and
3013 then 250ms step. Otherwise, we pick the last possible time
3014 to wake up.
3015 */
3016
3017 c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
3018 if (c >= b) {
3019 if (_unlikely_(c < USEC_PER_MINUTE))
3020 return b;
3021
3022 c -= USEC_PER_MINUTE;
3023 }
3024
3025 if (c >= a)
3026 return c;
3027
3028 c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
3029 if (c >= b) {
3030 if (_unlikely_(c < USEC_PER_SEC*10))
3031 return b;
3032
3033 c -= USEC_PER_SEC*10;
3034 }
3035
3036 if (c >= a)
3037 return c;
3038
3039 c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
3040 if (c >= b) {
3041 if (_unlikely_(c < USEC_PER_SEC))
3042 return b;
3043
3044 c -= USEC_PER_SEC;
3045 }
3046
3047 if (c >= a)
3048 return c;
3049
3050 c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
3051 if (c >= b) {
3052 if (_unlikely_(c < USEC_PER_MSEC*250))
3053 return b;
3054
3055 c -= USEC_PER_MSEC*250;
3056 }
3057
3058 if (c >= a)
3059 return c;
3060
3061 return b;
3062 }
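/*
 * Worked example (numbers made up for illustration): with perturb = 13s and a window of a = 08:00:05,
 * b = 08:01:20, the per-minute candidate is 08:01:13, which lies within [a, b], so every loop sharing this
 * boot ID wakes at second 13 of the minute. If b had been 08:00:10 instead, the per-minute candidate would
 * miss the window (08:00:13 >= b, and 07:59:13 < a after subtracting a minute), the 10s grid would miss it
 * too (08:00:03 < a), and the 1s grid would finally yield 08:00:09.
 */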
3063
3064 static int event_arm_timer(
3065 sd_event *e,
3066 struct clock_data *d) {
3067
3068 struct itimerspec its = {};
3069 sd_event_source *a, *b;
3070 usec_t t;
3071
3072 assert(e);
3073 assert(d);
3074
3075 if (!d->needs_rearm)
3076 return 0;
3077
3078 d->needs_rearm = false;
3079
3080 a = prioq_peek(d->earliest);
3081 assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
3082 if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
3083
3084 if (d->fd < 0)
3085 return 0;
3086
3087 if (d->next == USEC_INFINITY)
3088 return 0;
3089
3090 /* disarm */
3091 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3092 return -errno;
3093
3094 d->next = USEC_INFINITY;
3095 return 0;
3096 }
3097
3098 b = prioq_peek(d->latest);
3099 assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
3100 assert(b && b->enabled != SD_EVENT_OFF);
3101
3102 t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
3103 if (d->next == t)
3104 return 0;
3105
3106 assert_se(d->fd >= 0);
3107
3108 if (t == 0) {
3109 /* We don't want to disarm here, just set some time looooong ago. */
3110 its.it_value.tv_sec = 0;
3111 its.it_value.tv_nsec = 1;
3112 } else
3113 timespec_store(&its.it_value, t);
3114
3115 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3116 return -errno;
3117
3118 d->next = t;
3119 return 0;
3120 }
3121
3122 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
3123 assert(e);
3124 assert(s);
3125 assert(s->type == SOURCE_IO);
3126
3127 /* If the event source was already pending, we just OR in the
3128 * new revents, otherwise we reset the value. The ORing is
3129 * necessary to handle EPOLLONESHOT events properly where
3130 * readability might happen independently of writability, and
3131 * we need to keep track of both */
3132
3133 if (s->pending)
3134 s->io.revents |= revents;
3135 else
3136 s->io.revents = revents;
3137
3138 return source_set_pending(s, true);
3139 }
3140
3141 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
3142 uint64_t x;
3143 ssize_t ss;
3144
3145 assert(e);
3146 assert(fd >= 0);
3147
3148 assert_return(events == EPOLLIN, -EIO);
3149
3150 ss = read(fd, &x, sizeof(x));
3151 if (ss < 0) {
3152 if (ERRNO_IS_TRANSIENT(errno))
3153 return 0;
3154
3155 return -errno;
3156 }
3157
3158 if (_unlikely_(ss != sizeof(x)))
3159 return -EIO;
3160
3161 if (next)
3162 *next = USEC_INFINITY;
3163
3164 return 0;
3165 }
3166
3167 static int process_timer(
3168 sd_event *e,
3169 usec_t n,
3170 struct clock_data *d) {
3171
3172 sd_event_source *s;
3173 bool callback_invoked = false;
3174 int r;
3175
3176 assert(e);
3177 assert(d);
3178
3179 for (;;) {
3180 s = prioq_peek(d->earliest);
3181 assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
3182
3183 if (!s || time_event_source_next(s) > n)
3184 break;
3185
3186 if (s->ratelimited) {
3187 /* This is an event source whose ratelimit window has ended. Let's turn it on
3188 * again. */
3189 assert(s->ratelimited);
3190
3191 r = event_source_leave_ratelimit(s, /* run_callback */ true);
3192 if (r < 0)
3193 return r;
3194 else if (r == 1)
3195 callback_invoked = true;
3196
3197 continue;
3198 }
3199
3200 if (s->enabled == SD_EVENT_OFF || s->pending)
3201 break;
3202
3203 r = source_set_pending(s, true);
3204 if (r < 0)
3205 return r;
3206
3207 event_source_time_prioq_reshuffle(s);
3208 }
3209
3210 return callback_invoked;
3211 }
3212
3213 static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
3214 int64_t min_priority = threshold;
3215 bool something_new = false;
3216 sd_event_source *s;
3217 int r;
3218
3219 assert(e);
3220 assert(ret_min_priority);
3221
3222 if (!e->need_process_child) {
3223 *ret_min_priority = min_priority;
3224 return 0;
3225 }
3226
3227 e->need_process_child = false;
3228
3229 /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
3230 * for, instead of using P_ALL. This is because we only want to get child information of very
3231 * specific child processes, and not all of them. We might not have processed the SIGCHLD event
3232 * of a previous invocation and we don't want to maintain an unbounded *per-child* event queue,
3233 * hence we really don't want anything flushed out of the kernel's queue that we don't care
3234 * about. Since this is O(n) this means that if you have a lot of processes you probably want
3235 * to handle SIGCHLD yourself.
3236 *
3237 * We do not reap the children here (by using WNOWAIT), this is only done after the event
3238 * source is dispatched so that the callback still sees the process as a zombie. */
3239
3240 HASHMAP_FOREACH(s, e->child_sources) {
3241 assert(s->type == SOURCE_CHILD);
3242
3243 if (s->priority > threshold)
3244 continue;
3245
3246 if (s->pending)
3247 continue;
3248
3249 if (event_source_is_offline(s))
3250 continue;
3251
3252 if (s->child.exited)
3253 continue;
3254
3255 if (EVENT_SOURCE_WATCH_PIDFD(s))
3256 /* There's a usable pidfd known for this event source? Then don't waitid() for
3257 * it here */
3258 continue;
3259
3260 zero(s->child.siginfo);
3261 if (waitid(P_PID, s->child.pid, &s->child.siginfo,
3262 WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
3263 return negative_errno();
3264
3265 if (s->child.siginfo.si_pid != 0) {
3266 bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
3267
3268 if (zombie)
3269 s->child.exited = true;
3270
3271 if (!zombie && (s->child.options & WEXITED)) {
3272 /* If the child isn't dead then let's immediately remove the state
3273 * change from the queue, since there's no benefit in leaving it
3274 * queued. */
3275
3276 assert(s->child.options & (WSTOPPED|WCONTINUED));
3277 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
3278 }
3279
3280 r = source_set_pending(s, true);
3281 if (r < 0)
3282 return r;
3283 if (r > 0) {
3284 something_new = true;
3285 min_priority = MIN(min_priority, s->priority);
3286 }
3287 }
3288 }
3289
3290 *ret_min_priority = min_priority;
3291 return something_new;
3292 }
3293
3294 static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
3295 assert(e);
3296 assert(s);
3297 assert(s->type == SOURCE_CHILD);
3298
3299 if (s->pending)
3300 return 0;
3301
3302 if (event_source_is_offline(s))
3303 return 0;
3304
3305 if (!EVENT_SOURCE_WATCH_PIDFD(s))
3306 return 0;
3307
3308 zero(s->child.siginfo);
3309 if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
3310 return -errno;
3311
3312 if (s->child.siginfo.si_pid == 0)
3313 return 0;
3314
3315 if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
3316 s->child.exited = true;
3317
3318 return source_set_pending(s, true);
3319 }
3320
3321 static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
3322 int r;
3323
3324 assert(e);
3325 assert(d);
3326 assert_return(events == EPOLLIN, -EIO);
3327 assert(min_priority);
3328
3329 /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
3330 * sure to recheck the children we watch. This is because we only ever dequeue the first signal
3331 * per priority, and if we dequeue one while SIGCHLD is enqueued behind it we wouldn't notice,
3332 * even though we might have higher-priority children we care about, hence we need to check that
3333 * explicitly. */
3334
3335 if (sigismember(&d->sigset, SIGCHLD))
3336 e->need_process_child = true;
3337
3338 /* If there's already an event source pending for this priority we don't read another */
3339 if (d->current)
3340 return 0;
3341
3342 for (;;) {
3343 struct signalfd_siginfo si;
3344 ssize_t n;
3345 sd_event_source *s = NULL;
3346
3347 n = read(d->fd, &si, sizeof(si));
3348 if (n < 0) {
3349 if (ERRNO_IS_TRANSIENT(errno))
3350 return 0;
3351
3352 return -errno;
3353 }
3354
3355 if (_unlikely_(n != sizeof(si)))
3356 return -EIO;
3357
3358 assert(SIGNAL_VALID(si.ssi_signo));
3359
3360 if (e->signal_sources)
3361 s = e->signal_sources[si.ssi_signo];
3362 if (!s)
3363 continue;
3364 if (s->pending)
3365 continue;
3366
3367 s->signal.siginfo = si;
3368 d->current = s;
3369
3370 r = source_set_pending(s, true);
3371 if (r < 0)
3372 return r;
3373 if (r > 0 && *min_priority >= s->priority) {
3374 *min_priority = s->priority;
3375 return 1; /* an event source with smaller priority is queued. */
3376 }
3377
3378 return 0;
3379 }
3380 }
3381
3382 static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
3383 ssize_t n;
3384
3385 assert(e);
3386 assert(d);
3387
3388 assert_return(revents == EPOLLIN, -EIO);
3389
3390 /* If there's already an event source pending for this priority, don't read another */
3391 if (d->n_pending > 0)
3392 return 0;
3393
3394 /* Is the read buffer non-empty? If so, let's not read more */
3395 if (d->buffer_filled > 0)
3396 return 0;
3397
3398 if (d->priority > threshold)
3399 return 0;
3400
3401 n = read(d->fd, &d->buffer, sizeof(d->buffer));
3402 if (n < 0) {
3403 if (ERRNO_IS_TRANSIENT(errno))
3404 return 0;
3405
3406 return -errno;
3407 }
3408
3409 assert(n > 0);
3410 d->buffer_filled = (size_t) n;
3411 LIST_PREPEND(buffered, e->inotify_data_buffered, d);
3412
3413 return 1;
3414 }
3415
3416 static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
3417 assert(e);
3418 assert(d);
3419 assert(sz <= d->buffer_filled);
3420
3421 if (sz == 0)
3422 return;
3423
3424 /* Move the rest of the buffer to the front, in order to get things properly aligned again */
3425 memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
3426 d->buffer_filled -= sz;
3427
3428 if (d->buffer_filled == 0)
3429 LIST_REMOVE(buffered, e->inotify_data_buffered, d);
3430 }
3431
3432 static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
3433 int r;
3434
3435 assert(e);
3436 assert(d);
3437
3438 /* If there's already an event source pending for this priority, don't read another */
3439 if (d->n_pending > 0)
3440 return 0;
3441
3442 while (d->buffer_filled > 0) {
3443 size_t sz;
3444
3445 /* Let's validate that the event structures are complete */
3446 if (d->buffer_filled < offsetof(struct inotify_event, name))
3447 return -EIO;
3448
3449 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3450 if (d->buffer_filled < sz)
3451 return -EIO;
3452
3453 if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
3454 struct inode_data *inode_data;
3455
3456 /* The queue overran, let's pass this event to all event sources connected to this inotify
3457 * object */
3458
3459 HASHMAP_FOREACH(inode_data, d->inodes)
3460 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3461
3462 if (event_source_is_offline(s))
3463 continue;
3464
3465 r = source_set_pending(s, true);
3466 if (r < 0)
3467 return r;
3468 }
3469 } else {
3470 struct inode_data *inode_data;
3471
3472 /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
3473 * our watch descriptor table. */
3474 if (d->buffer.ev.mask & IN_IGNORED) {
3475
3476 inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3477 if (!inode_data) {
3478 event_inotify_data_drop(e, d, sz);
3479 continue;
3480 }
3481
3482 /* The watch descriptor was removed by the kernel, let's drop it here too */
3483 inode_data->wd = -1;
3484 } else {
3485 inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3486 if (!inode_data) {
3487 event_inotify_data_drop(e, d, sz);
3488 continue;
3489 }
3490 }
3491
3492 /* Trigger all event sources that are interested in these events. Also trigger all event
3493 * sources if IN_IGNORED or IN_UNMOUNT is set. */
3494 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3495
3496 if (event_source_is_offline(s))
3497 continue;
3498
3499 if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
3500 (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
3501 continue;
3502
3503 r = source_set_pending(s, true);
3504 if (r < 0)
3505 return r;
3506 }
3507 }
3508
3509 /* Something pending now? If so, let's finish, otherwise let's read more. */
3510 if (d->n_pending > 0)
3511 return 1;
3512 }
3513
3514 return 0;
3515 }
3516
3517 static int process_inotify(sd_event *e) {
3518 int r, done = 0;
3519
3520 assert(e);
3521
3522 LIST_FOREACH(buffered, d, e->inotify_data_buffered) {
3523 r = event_inotify_data_process(e, d);
3524 if (r < 0)
3525 return r;
3526 if (r > 0)
3527 done++;
3528 }
3529
3530 return done;
3531 }
3532
3533 static int source_dispatch(sd_event_source *s) {
3534 _cleanup_(sd_event_unrefp) sd_event *saved_event = NULL;
3535 EventSourceType saved_type;
3536 int r = 0;
3537
3538 assert(s);
3539 assert(s->pending || s->type == SOURCE_EXIT);
3540
3541 /* Save the event source type, here, so that we still know it after the event callback which might
3542 * invalidate the event. */
3543 saved_type = s->type;
3544
3545 /* Similarly, store a reference to the event loop object, so that we can still access it after the
3546 * callback might have invalidated/disconnected the event source. */
3547 saved_event = sd_event_ref(s->event);
3548
3549 /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
3550 assert(!s->ratelimited);
3551 if (!ratelimit_below(&s->rate_limit)) {
3552 r = event_source_enter_ratelimited(s);
3553 if (r < 0)
3554 return r;
3555
3556 return 1;
3557 }
3558
3559 if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
3560 r = source_set_pending(s, false);
3561 if (r < 0)
3562 return r;
3563 }
3564
3565 if (s->type != SOURCE_POST) {
3566 sd_event_source *z;
3567
3568 /* If we execute a non-post source, let's mark all post sources as pending. */
3569
3570 SET_FOREACH(z, s->event->post_sources) {
3571 if (event_source_is_offline(z))
3572 continue;
3573
3574 r = source_set_pending(z, true);
3575 if (r < 0)
3576 return r;
3577 }
3578 }
3579
3580 if (s->enabled == SD_EVENT_ONESHOT) {
3581 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
3582 if (r < 0)
3583 return r;
3584 }
3585
3586 s->dispatching = true;
3587
3588 switch (s->type) {
3589
3590 case SOURCE_IO:
3591 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
3592 break;
3593
3594 case SOURCE_TIME_REALTIME:
3595 case SOURCE_TIME_BOOTTIME:
3596 case SOURCE_TIME_MONOTONIC:
3597 case SOURCE_TIME_REALTIME_ALARM:
3598 case SOURCE_TIME_BOOTTIME_ALARM:
3599 r = s->time.callback(s, s->time.next, s->userdata);
3600 break;
3601
3602 case SOURCE_SIGNAL:
3603 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
3604 break;
3605
3606 case SOURCE_CHILD: {
3607 bool zombie;
3608
3609 zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
3610
3611 r = s->child.callback(s, &s->child.siginfo, s->userdata);
3612
3613 /* Now, reap the PID for good. */
3614 if (zombie) {
3615 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
3616 s->child.waited = true;
3617 }
3618
3619 break;
3620 }
3621
3622 case SOURCE_DEFER:
3623 r = s->defer.callback(s, s->userdata);
3624 break;
3625
3626 case SOURCE_POST:
3627 r = s->post.callback(s, s->userdata);
3628 break;
3629
3630 case SOURCE_EXIT:
3631 r = s->exit.callback(s, s->userdata);
3632 break;
3633
3634 case SOURCE_INOTIFY: {
3635 struct sd_event *e = s->event;
3636 struct inotify_data *d;
3637 size_t sz;
3638
3639 assert(s->inotify.inode_data);
3640 assert_se(d = s->inotify.inode_data->inotify_data);
3641
3642 assert(d->buffer_filled >= offsetof(struct inotify_event, name));
3643 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3644 assert(d->buffer_filled >= sz);
3645
3646 /* If the inotify callback destroys the event source then this likely means we don't need to
3647 * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
3648 * free it immediately, then we couldn't drop the event from the inotify event queue without
3649 * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
3650 * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
3651 * explicitly GC it after we are done dropping the inotify event from the buffer. */
3652 d->n_busy++;
3653 r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
3654 d->n_busy--;
3655
3656 /* When no event is pending anymore on this inotify object, then let's drop the event from
3657 * the inotify event queue buffer. */
3658 if (d->n_pending == 0)
3659 event_inotify_data_drop(e, d, sz);
3660
3661 /* Now we don't want to access 'd' anymore, it's OK to GC now. */
3662 event_gc_inotify_data(e, d);
3663 break;
3664 }
3665
3666 case SOURCE_WATCHDOG:
3667 case _SOURCE_EVENT_SOURCE_TYPE_MAX:
3668 case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
3669 assert_not_reached();
3670 }
3671
3672 s->dispatching = false;
3673
3674 if (r < 0) {
3675 log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
3676 strna(s->description),
3677 event_source_type_to_string(saved_type),
3678 s->exit_on_failure ? "exiting" : "disabling");
3679
3680 if (s->exit_on_failure)
3681 (void) sd_event_exit(saved_event, r);
3682 }
3683
3684 if (s->n_ref == 0)
3685 source_free(s);
3686 else if (r < 0)
3687 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
3688
3689 return 1;
3690 }
3691
3692 static int event_prepare(sd_event *e) {
3693 int r;
3694
3695 assert(e);
3696
3697 for (;;) {
3698 sd_event_source *s;
3699
3700 s = prioq_peek(e->prepare);
3701 if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
3702 break;
3703
3704 s->prepare_iteration = e->iteration;
3705 r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
3706 if (r < 0)
3707 return r;
3708
3709 assert(s->prepare);
3710
3711 s->dispatching = true;
3712 r = s->prepare(s, s->userdata);
3713 s->dispatching = false;
3714
3715 if (r < 0) {
3716 log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
3717 strna(s->description),
3718 event_source_type_to_string(s->type),
3719 s->exit_on_failure ? "exiting" : "disabling");
3720
3721 if (s->exit_on_failure)
3722 (void) sd_event_exit(e, r);
3723 }
3724
3725 if (s->n_ref == 0)
3726 source_free(s);
3727 else if (r < 0)
3728 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
3729 }
3730
3731 return 0;
3732 }
3733
3734 static int dispatch_exit(sd_event *e) {
3735 sd_event_source *p;
3736 int r;
3737
3738 assert(e);
3739
3740 p = prioq_peek(e->exit);
3741 assert(!p || p->type == SOURCE_EXIT);
3742
3743 if (!p || event_source_is_offline(p)) {
3744 e->state = SD_EVENT_FINISHED;
3745 return 0;
3746 }
3747
3748 _unused_ _cleanup_(sd_event_unrefp) sd_event *ref = sd_event_ref(e);
3749 e->iteration++;
3750 e->state = SD_EVENT_EXITING;
3751 r = source_dispatch(p);
3752 e->state = SD_EVENT_INITIAL;
3753 return r;
3754 }
3755
3756 static sd_event_source* event_next_pending(sd_event *e) {
3757 sd_event_source *p;
3758
3759 assert(e);
3760
3761 p = prioq_peek(e->pending);
3762 if (!p)
3763 return NULL;
3764
3765 if (event_source_is_offline(p))
3766 return NULL;
3767
3768 return p;
3769 }
3770
3771 static int arm_watchdog(sd_event *e) {
3772 struct itimerspec its = {};
3773 usec_t t;
3774
3775 assert(e);
3776 assert(e->watchdog_fd >= 0);
3777
3778 t = sleep_between(e,
3779 usec_add(e->watchdog_last, (e->watchdog_period / 2)),
3780 usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));
3781
3782 timespec_store(&its.it_value, t);
3783
3784 /* Make sure we never set the watchdog to 0, which tells the
3785 * kernel to disable it. */
3786 if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
3787 its.it_value.tv_nsec = 1;
3788
3789 return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
3790 }
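/*
 * Worked example (period value made up for illustration): with a watchdog period of 30s the next wakeup is
 * scheduled into the window [watchdog_last + 15s, watchdog_last + 22.5s], with sleep_between() picking a
 * coalesced spot inside it; process_watchdog() below additionally refuses to send WATCHDOG=1 more often
 * than every period/4, i.e. 7.5s here.
 */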
3791
3792 static int process_watchdog(sd_event *e) {
3793 assert(e);
3794
3795 if (!e->watchdog)
3796 return 0;
3797
3798 /* Don't notify watchdog too often */
3799 if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
3800 return 0;
3801
3802 sd_notify(false, "WATCHDOG=1");
3803 e->watchdog_last = e->timestamp.monotonic;
3804
3805 return arm_watchdog(e);
3806 }
3807
3808 static void event_close_inode_data_fds(sd_event *e) {
3809 struct inode_data *d;
3810
3811 assert(e);
3812
3813 /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
3814 * filesystems. But we can't close them right away as we need them for as long as the user still wants to make
3815 * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
3816 * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
3817 * compromise. */
3818
3819 while ((d = e->inode_data_to_close)) {
3820 assert(d->fd >= 0);
3821 d->fd = safe_close(d->fd);
3822
3823 LIST_REMOVE(to_close, e->inode_data_to_close, d);
3824 }
3825 }
3826
3827 _public_ int sd_event_prepare(sd_event *e) {
3828 int r;
3829
3830 assert_return(e, -EINVAL);
3831 assert_return(e = event_resolve(e), -ENOPKG);
3832 assert_return(!event_pid_changed(e), -ECHILD);
3833 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
3834 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
3835
3836 /* Let's check that, if we are a default event loop, we are executed in the correct thread. We only do
3837 * this check here once, since gettid() is typically not cached, and we thus want to minimize
3838 * syscalls. */
3839 assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);
3840
3841 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
3842 _unused_ _cleanup_(sd_event_unrefp) sd_event *ref = sd_event_ref(e);
3843
3844 if (e->exit_requested)
3845 goto pending;
3846
3847 e->iteration++;
3848
3849 e->state = SD_EVENT_PREPARING;
3850 r = event_prepare(e);
3851 e->state = SD_EVENT_INITIAL;
3852 if (r < 0)
3853 return r;
3854
3855 r = event_arm_timer(e, &e->realtime);
3856 if (r < 0)
3857 return r;
3858
3859 r = event_arm_timer(e, &e->boottime);
3860 if (r < 0)
3861 return r;
3862
3863 r = event_arm_timer(e, &e->monotonic);
3864 if (r < 0)
3865 return r;
3866
3867 r = event_arm_timer(e, &e->realtime_alarm);
3868 if (r < 0)
3869 return r;
3870
3871 r = event_arm_timer(e, &e->boottime_alarm);
3872 if (r < 0)
3873 return r;
3874
3875 event_close_inode_data_fds(e);
3876
3877 if (event_next_pending(e) || e->need_process_child)
3878 goto pending;
3879
3880 e->state = SD_EVENT_ARMED;
3881
3882 return 0;
3883
3884 pending:
3885 e->state = SD_EVENT_ARMED;
3886 r = sd_event_wait(e, 0);
3887 if (r == 0)
3888 e->state = SD_EVENT_ARMED;
3889
3890 return r;
3891 }
3892
3893 static int epoll_wait_usec(
3894 int fd,
3895 struct epoll_event *events,
3896 int maxevents,
3897 usec_t timeout) {
3898
3899 int msec;
3900 #if 0
3901 static bool epoll_pwait2_absent = false;
3902 int r;
3903
3904 /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not.
3905 *
3906 * FIXME: this is temporarily disabled until epoll_pwait2() becomes more widely available.
3907 * See https://github.com/systemd/systemd/pull/18973 and
3908 * https://github.com/systemd/systemd/issues/19052. */
3909
3910 if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
3911 r = epoll_pwait2(fd,
3912 events,
3913 maxevents,
3914 TIMESPEC_STORE(timeout),
3915 NULL);
3916 if (r >= 0)
3917 return r;
3918 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
3919 return -errno; /* Only fallback to old epoll_wait() if the syscall is masked or not
3920 * supported. */
3921
3922 epoll_pwait2_absent = true;
3923 }
3924 #endif
3925
3926 if (timeout == USEC_INFINITY)
3927 msec = -1;
3928 else {
3929 usec_t k;
3930
3931 k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
3932 if (k >= INT_MAX)
3933 msec = INT_MAX; /* Saturate */
3934 else
3935 msec = (int) k;
3936 }
3937
3938 return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
3939 }
3940
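/* Conversion sketch for the epoll_wait() fallback above: epoll_wait() only understands
 * millisecond timeouts, so the microsecond value is rounded *up* (DIV_ROUND_UP) to avoid
 * waking up before the deadline, and saturated at INT_MAX. Illustrative values:
 *
 *     timeout == 1 usec          ->  msec == 1        (rounds up)
 *     timeout == 2500 usec       ->  msec == 3        (2.5 ms rounds up to 3 ms)
 *     timeout == USEC_INFINITY   ->  msec == -1       (block forever)
 *     timeout  > INT_MAX msec    ->  msec == INT_MAX  (saturated)
 */
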
3941 static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
3942 size_t n_event_queue, m, n_event_max;
3943 int64_t min_priority = threshold;
3944 bool something_new = false;
3945 int r;
3946
3947 assert(e);
3948 assert(ret_min_priority);
3949
3950 n_event_queue = MAX(e->n_sources, 1u);
3951 if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
3952 return -ENOMEM;
3953
3954 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
3955
3956 /* If we still have inotify data buffered, then query the other fds, but don't block waiting for more */
3957 if (e->inotify_data_buffered)
3958 timeout = 0;
3959
3960 for (;;) {
3961 r = epoll_wait_usec(
3962 e->epoll_fd,
3963 e->event_queue,
3964 n_event_max,
3965 timeout);
3966 if (r < 0)
3967 return r;
3968
3969 m = (size_t) r;
3970
3971 if (m < n_event_max)
3972 break;
3973
3974 if (n_event_max >= n_event_queue * 10)
3975 break;
3976
3977 if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
3978 return -ENOMEM;
3979
3980 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
3981 timeout = 0;
3982 }
3983
3984 /* Set the timestamp only when this is called for the first time. */
3985 if (threshold == INT64_MAX)
3986 triple_timestamp_get(&e->timestamp);
3987
3988 for (size_t i = 0; i < m; i++) {
3989
3990 if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
3991 r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
3992 else {
3993 WakeupType *t = e->event_queue[i].data.ptr;
3994
3995 switch (*t) {
3996
3997 case WAKEUP_EVENT_SOURCE: {
3998 sd_event_source *s = e->event_queue[i].data.ptr;
3999
4000 assert(s);
4001
4002 if (s->priority > threshold)
4003 continue;
4004
4005 min_priority = MIN(min_priority, s->priority);
4006
4007 switch (s->type) {
4008
4009 case SOURCE_IO:
4010 r = process_io(e, s, e->event_queue[i].events);
4011 break;
4012
4013 case SOURCE_CHILD:
4014 r = process_pidfd(e, s, e->event_queue[i].events);
4015 break;
4016
4017 default:
4018 assert_not_reached();
4019 }
4020
4021 break;
4022 }
4023
4024 case WAKEUP_CLOCK_DATA: {
4025 struct clock_data *d = e->event_queue[i].data.ptr;
4026
4027 assert(d);
4028
4029 r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
4030 break;
4031 }
4032
4033 case WAKEUP_SIGNAL_DATA:
4034 r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
4035 break;
4036
4037 case WAKEUP_INOTIFY_DATA:
4038 r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
4039 break;
4040
4041 default:
4042 assert_not_reached();
4043 }
4044 }
4045 if (r < 0)
4046 return r;
4047 if (r > 0)
4048 something_new = true;
4049 }
4050
4051 *ret_min_priority = min_priority;
4052 return something_new;
4053 }
4054
4055 _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
4056 int r;
4057
4058 assert_return(e, -EINVAL);
4059 assert_return(e = event_resolve(e), -ENOPKG);
4060 assert_return(!event_pid_changed(e), -ECHILD);
4061 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4062 assert_return(e->state == SD_EVENT_ARMED, -EBUSY);
4063
4064 if (e->exit_requested) {
4065 e->state = SD_EVENT_PENDING;
4066 return 1;
4067 }
4068
4069 for (int64_t threshold = INT64_MAX; ; threshold--) {
4070 int64_t epoll_min_priority, child_min_priority;
4071
4072 /* New epoll (especially IO) and child events may be triggered just after the
4073 * process_epoll() call but before process_child(), and the new IO events may have
4074 * higher priority than the child events. To salvage these events, let's call
4075 * epoll_wait() again, but accept only events with higher priority than the
4076 * previous ones. See issue https://github.com/systemd/systemd/issues/18190 and comments
4077 * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
4078 * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */
4079
4080 r = process_epoll(e, timeout, threshold, &epoll_min_priority);
4081 if (r == -EINTR) {
4082 e->state = SD_EVENT_PENDING;
4083 return 1;
4084 }
4085 if (r < 0)
4086 goto finish;
4087 if (r == 0 && threshold < INT64_MAX)
4088 /* No new epoll event. */
4089 break;
4090
4091 r = process_child(e, threshold, &child_min_priority);
4092 if (r < 0)
4093 goto finish;
4094 if (r == 0)
4095 /* No new child event. */
4096 break;
4097
4098 threshold = MIN(epoll_min_priority, child_min_priority);
4099 if (threshold == INT64_MIN)
4100 break;
4101
4102 timeout = 0;
4103 }
4104
4105 r = process_watchdog(e);
4106 if (r < 0)
4107 goto finish;
4108
4109 r = process_inotify(e);
4110 if (r < 0)
4111 goto finish;
4112
4113 r = process_timer(e, e->timestamp.realtime, &e->realtime);
4114 if (r < 0)
4115 goto finish;
4116
4117 r = process_timer(e, e->timestamp.boottime, &e->boottime);
4118 if (r < 0)
4119 goto finish;
4120
4121 r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
4122 if (r < 0)
4123 goto finish;
4124
4125 r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
4126 if (r < 0)
4127 goto finish;
4128
4129 r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
4130 if (r < 0)
4131 goto finish;
4132 else if (r == 1) {
4133 /* A ratelimit expiry callback was called. Let's postpone processing pending sources and
4134 * put the loop back into the initial state, so that the next iteration also evaluates
4135 * sources that were potentially re-enabled by the callback.
4136 *
4137 * Wondering why we treat only this invocation of process_timer() differently? Once an event
4138 * source is ratelimited we essentially transform it into a CLOCK_MONOTONIC timer, hence the
4139 * ratelimit expiry callback is never called for any other timer type. */
4140 r = 0;
4141 goto finish;
4142 }
4143
4144 if (event_next_pending(e)) {
4145 e->state = SD_EVENT_PENDING;
4146 return 1;
4147 }
4148
4149 r = 0;
4150
4151 finish:
4152 e->state = SD_EVENT_INITIAL;
4153
4154 return r;
4155 }
4156
4157 _public_ int sd_event_dispatch(sd_event *e) {
4158 sd_event_source *p;
4159 int r;
4160
4161 assert_return(e, -EINVAL);
4162 assert_return(e = event_resolve(e), -ENOPKG);
4163 assert_return(!event_pid_changed(e), -ECHILD);
4164 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4165 assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
4166
4167 if (e->exit_requested)
4168 return dispatch_exit(e);
4169
4170 p = event_next_pending(e);
4171 if (p) {
4172 _unused_ _cleanup_(sd_event_unrefp) sd_event *ref = sd_event_ref(e);
4173
4174 e->state = SD_EVENT_RUNNING;
4175 r = source_dispatch(p);
4176 e->state = SD_EVENT_INITIAL;
4177 return r;
4178 }
4179
4180 e->state = SD_EVENT_INITIAL;
4181
4182 return 1;
4183 }
4184
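/* sd_event_prepare(), sd_event_wait() and sd_event_dispatch() are the three primitives
 * that sd_event_run() below glues together. A minimal sketch of driving them by hand
 * (illustrative only, error handling trimmed, assumes a fully set-up loop "e"):
 *
 *     for (;;) {
 *             r = sd_event_prepare(e);          // > 0 if something is already pending
 *             if (r == 0)
 *                     r = sd_event_wait(e, UINT64_MAX);  // block until an event arrives
 *             if (r < 0)
 *                     break;                    // error
 *             if (r > 0) {
 *                     r = sd_event_dispatch(e); // run exactly one handler
 *                     if (r < 0)
 *                             break;
 *             }
 *             if (sd_event_get_state(e) == SD_EVENT_FINISHED)
 *                     break;                    // sd_event_exit() was processed
 *     }
 */
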
4185 static void event_log_delays(sd_event *e) {
4186 char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
4187 size_t l, i;
4188
4189 p = b;
4190 l = sizeof(b);
4191 for (i = 0; i < ELEMENTSOF(e->delays); i++) {
4192 l = strpcpyf(&p, l, "%u ", e->delays[i]);
4193 e->delays[i] = 0;
4194 }
4195 log_debug("Event loop iterations: %s", b);
4196 }
4197
4198 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
4199 int r;
4200
4201 assert_return(e, -EINVAL);
4202 assert_return(e = event_resolve(e), -ENOPKG);
4203 assert_return(!event_pid_changed(e), -ECHILD);
4204 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4205 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4206
4207 if (e->profile_delays && e->last_run_usec != 0) {
4208 usec_t this_run;
4209 unsigned l;
4210
4211 this_run = now(CLOCK_MONOTONIC);
4212
4213 l = log2u64(this_run - e->last_run_usec);
4214 assert(l < ELEMENTSOF(e->delays));
4215 e->delays[l]++;
4216
4217 if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
4218 event_log_delays(e);
4219 e->last_log_usec = this_run;
4220 }
4221 }
4222
4223 /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4224 _unused_ _cleanup_(sd_event_unrefp) sd_event *ref = sd_event_ref(e);
4225
4226 r = sd_event_prepare(e);
4227 if (r == 0)
4228 /* There was nothing? Then wait... */
4229 r = sd_event_wait(e, timeout);
4230
4231 if (e->profile_delays)
4232 e->last_run_usec = now(CLOCK_MONOTONIC);
4233
4234 if (r > 0) {
4235 /* There's something now, so let's dispatch it */
4236 r = sd_event_dispatch(e);
4237 if (r < 0)
4238 return r;
4239
4240 return 1;
4241 }
4242
4243 return r;
4244 }
4245
4246 _public_ int sd_event_loop(sd_event *e) {
4247 int r;
4248
4249 assert_return(e, -EINVAL);
4250 assert_return(e = event_resolve(e), -ENOPKG);
4251 assert_return(!event_pid_changed(e), -ECHILD);
4252 assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4253
4254 _unused_ _cleanup_(sd_event_unrefp) sd_event *ref = sd_event_ref(e);
4255
4256 while (e->state != SD_EVENT_FINISHED) {
4257 r = sd_event_run(e, UINT64_MAX);
4258 if (r < 0)
4259 return r;
4260 }
4261
4262 return e->exit_code;
4263 }
4264
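/* Minimal end-to-end usage sketch of the loop API (illustrative, not part of this file;
 * the callback name is made up, and passing a NULL "ret" source pointer is assumed to
 * create a floating source, as in current libsystemd). The defer source runs once on the
 * first iteration and asks the loop to exit, so sd_event_loop() returns 0:
 *
 *     static int on_defer(sd_event_source *s, void *userdata) {
 *             return sd_event_exit(sd_event_source_get_event(s), 0);
 *     }
 *
 *     int run_once(void) {
 *             _cleanup_(sd_event_unrefp) sd_event *e = NULL;
 *             int r;
 *
 *             r = sd_event_default(&e);
 *             if (r < 0)
 *                     return r;
 *             r = sd_event_add_defer(e, NULL, on_defer, NULL);
 *             if (r < 0)
 *                     return r;
 *             return sd_event_loop(e);  // runs until sd_event_exit(), returns its code
 *     }
 */
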
4265 _public_ int sd_event_get_fd(sd_event *e) {
4266 assert_return(e, -EINVAL);
4267 assert_return(e = event_resolve(e), -ENOPKG);
4268 assert_return(!event_pid_changed(e), -ECHILD);
4269
4270 return e->epoll_fd;
4271 }
4272
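/* The epoll fd returned above makes it possible to embed this loop into a foreign event
 * loop: wait for the fd to become readable there, then iterate with a zero timeout so we
 * never block inside sd-event. Rough sketch (illustrative, error handling trimmed):
 *
 *     struct pollfd p = { .fd = sd_event_get_fd(e), .events = POLLIN };
 *
 *     for (;;) {
 *             if (poll(&p, 1, -1) < 0)              // block in the outer loop instead
 *                     break;
 *             if (sd_event_run(e, 0) < 0)           // dispatch whatever is ready
 *                     break;
 *             if (sd_event_get_state(e) == SD_EVENT_FINISHED)
 *                     break;
 *     }
 */
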
4273 _public_ int sd_event_get_state(sd_event *e) {
4274 assert_return(e, -EINVAL);
4275 assert_return(e = event_resolve(e), -ENOPKG);
4276 assert_return(!event_pid_changed(e), -ECHILD);
4277
4278 return e->state;
4279 }
4280
4281 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
4282 assert_return(e, -EINVAL);
4283 assert_return(e = event_resolve(e), -ENOPKG);
4284 assert_return(code, -EINVAL);
4285 assert_return(!event_pid_changed(e), -ECHILD);
4286
4287 if (!e->exit_requested)
4288 return -ENODATA;
4289
4290 *code = e->exit_code;
4291 return 0;
4292 }
4293
4294 _public_ int sd_event_exit(sd_event *e, int code) {
4295 assert_return(e, -EINVAL);
4296 assert_return(e = event_resolve(e), -ENOPKG);
4297 assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4298 assert_return(!event_pid_changed(e), -ECHILD);
4299
4300 e->exit_requested = true;
4301 e->exit_code = code;
4302
4303 return 0;
4304 }
4305
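/* sd_event_exit() only records the request; the loop actually winds down via
 * dispatch_exit() above once the current dispatch returns, running any SOURCE_EXIT
 * sources on the way out. A typical use is requesting shutdown from a signal handler
 * (sketch; assumes SIGTERM was blocked with sigprocmask() before the source was added):
 *
 *     static int on_sigterm(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
 *             return sd_event_exit(sd_event_source_get_event(s), 0);
 *     }
 *
 *     r = sd_event_add_signal(e, NULL, SIGTERM, on_sigterm, NULL);
 *     if (r < 0)
 *             return r;
 */
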
4306 _public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
4307 assert_return(e, -EINVAL);
4308 assert_return(e = event_resolve(e), -ENOPKG);
4309 assert_return(usec, -EINVAL);
4310 assert_return(!event_pid_changed(e), -ECHILD);
4311
4312 if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
4313 return -EOPNOTSUPP;
4314
4315 if (!triple_timestamp_is_set(&e->timestamp)) {
4316 /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
4317 *usec = now(clock);
4318 return 1;
4319 }
4320
4321 *usec = triple_timestamp_by_clock(&e->timestamp, clock);
4322 return 0;
4323 }
4324
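/* The cached timestamp returned above is what makes "relative" timers cheap and
 * consistent within one iteration. Sketch of arming a timer 5 s from now (illustrative;
 * "on_timer" is a made-up sd_event_time_handler_t):
 *
 *     uint64_t usec;
 *
 *     r = sd_event_now(e, CLOCK_MONOTONIC, &usec);
 *     if (r < 0)
 *             return r;
 *     r = sd_event_add_time(e, NULL, CLOCK_MONOTONIC,
 *                           usec + 5 * USEC_PER_SEC,   // absolute deadline
 *                           250 * USEC_PER_MSEC,       // accuracy, allows coalescing
 *                           on_timer, NULL);
 */
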
4325 _public_ int sd_event_default(sd_event **ret) {
4326 sd_event *e = NULL;
4327 int r;
4328
4329 if (!ret)
4330 return !!default_event;
4331
4332 if (default_event) {
4333 *ret = sd_event_ref(default_event);
4334 return 0;
4335 }
4336
4337 r = sd_event_new(&e);
4338 if (r < 0)
4339 return r;
4340
4341 e->default_event_ptr = &default_event;
4342 e->tid = gettid();
4343 default_event = e;
4344
4345 *ret = e;
4346 return 1;
4347 }
4348
4349 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
4350 assert_return(e, -EINVAL);
4351 assert_return(e = event_resolve(e), -ENOPKG);
4352 assert_return(tid, -EINVAL);
4353 assert_return(!event_pid_changed(e), -ECHILD);
4354
4355 if (e->tid != 0) {
4356 *tid = e->tid;
4357 return 0;
4358 }
4359
4360 return -ENXIO;
4361 }
4362
4363 _public_ int sd_event_set_watchdog(sd_event *e, int b) {
4364 int r;
4365
4366 assert_return(e, -EINVAL);
4367 assert_return(e = event_resolve(e), -ENOPKG);
4368 assert_return(!event_pid_changed(e), -ECHILD);
4369
4370 if (e->watchdog == !!b)
4371 return e->watchdog;
4372
4373 if (b) {
4374 r = sd_watchdog_enabled(false, &e->watchdog_period);
4375 if (r <= 0)
4376 return r;
4377
4378 /* Issue first ping immediately */
4379 sd_notify(false, "WATCHDOG=1");
4380 e->watchdog_last = now(CLOCK_MONOTONIC);
4381
4382 e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
4383 if (e->watchdog_fd < 0)
4384 return -errno;
4385
4386 r = arm_watchdog(e);
4387 if (r < 0)
4388 goto fail;
4389
4390 struct epoll_event ev = {
4391 .events = EPOLLIN,
4392 .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
4393 };
4394
4395 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
4396 r = -errno;
4397 goto fail;
4398 }
4399
4400 } else {
4401 if (e->watchdog_fd >= 0) {
4402 (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
4403 e->watchdog_fd = safe_close(e->watchdog_fd);
4404 }
4405 }
4406
4407 e->watchdog = !!b;
4408 return e->watchdog;
4409
4410 fail:
4411 e->watchdog_fd = safe_close(e->watchdog_fd);
4412 return r;
4413 }
4414
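/* Typical use of the watchdog support above in a Type=notify service with WatchdogSec=
 * configured (sketch): enable it once after creating the loop, and the periodic
 * "WATCHDOG=1" pings are then sent automatically from process_watchdog() while the loop
 * runs:
 *
 *     r = sd_event_set_watchdog(e, true);   // returns 0 and does nothing if the
 *     if (r < 0)                            // manager didn't request a watchdog
 *             return r;
 */
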
4415 _public_ int sd_event_get_watchdog(sd_event *e) {
4416 assert_return(e, -EINVAL);
4417 assert_return(e = event_resolve(e), -ENOPKG);
4418 assert_return(!event_pid_changed(e), -ECHILD);
4419
4420 return e->watchdog;
4421 }
4422
4423 _public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
4424 assert_return(e, -EINVAL);
4425 assert_return(e = event_resolve(e), -ENOPKG);
4426 assert_return(!event_pid_changed(e), -ECHILD);
4427
4428 *ret = e->iteration;
4429 return 0;
4430 }
4431
4432 _public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
4433 assert_return(s, -EINVAL);
4434
4435 s->destroy_callback = callback;
4436 return 0;
4437 }
4438
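/* The destroy callback registered above is invoked exactly once when the source is
 * finally freed, which makes it a convenient place to release the userdata. Sketch
 * (names are made up):
 *
 *     static void context_destroy(void *userdata) {
 *             context_free(userdata);        // hypothetical owner of the userdata
 *     }
 *
 *     r = sd_event_add_io(e, &s, fd, EPOLLIN, on_io, c);
 *     if (r < 0)
 *             return r;
 *     r = sd_event_source_set_destroy_callback(s, context_destroy);
 *     if (r < 0)
 *             return r;
 */
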
4439 _public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
4440 assert_return(s, -EINVAL);
4441
4442 if (ret)
4443 *ret = s->destroy_callback;
4444
4445 return !!s->destroy_callback;
4446 }
4447
4448 _public_ int sd_event_source_get_floating(sd_event_source *s) {
4449 assert_return(s, -EINVAL);
4450
4451 return s->floating;
4452 }
4453
4454 _public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
4455 assert_return(s, -EINVAL);
4456
4457 if (s->floating == !!b)
4458 return 0;
4459
4460 if (!s->event) /* Already disconnected */
4461 return -ESTALE;
4462
4463 s->floating = b;
4464
4465 if (b) {
4466 sd_event_source_ref(s);
4467 sd_event_unref(s->event);
4468 } else {
4469 sd_event_ref(s->event);
4470 sd_event_source_unref(s);
4471 }
4472
4473 return 1;
4474 }
4475
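/* Floating sources invert the reference direction: the event loop keeps the source alive
 * instead of the caller, which is handy for fire-and-forget sources. Sketch (illustrative):
 *
 *     r = sd_event_add_defer(e, &s, on_defer, NULL);
 *     if (r < 0)
 *             return r;
 *     r = sd_event_source_set_floating(s, true);  // the loop now holds the reference
 *     if (r < 0)
 *             return r;
 *     s = sd_event_source_unref(s);               // drop ours; the source stays around
 */
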
4476 _public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
4477 assert_return(s, -EINVAL);
4478 assert_return(s->type != SOURCE_EXIT, -EDOM);
4479
4480 return s->exit_on_failure;
4481 }
4482
4483 _public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
4484 assert_return(s, -EINVAL);
4485 assert_return(s->type != SOURCE_EXIT, -EDOM);
4486
4487 if (s->exit_on_failure == !!b)
4488 return 0;
4489
4490 s->exit_on_failure = b;
4491 return 1;
4492 }
4493
4494 _public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
4495 int r;
4496
4497 assert_return(s, -EINVAL);
4498
4499 /* Turning on ratelimiting on event source types that don't support it is a loggable offense. Doing
4500 * so is a programming error. */
4501 assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);
4502
4503 /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
4504 * non-ratelimited. */
4505 r = event_source_leave_ratelimit(s, /* run_callback */ false);
4506 if (r < 0)
4507 return r;
4508
4509 s->rate_limit = (RateLimit) { interval, burst };
4510 return 0;
4511 }
4512
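/* Sketch of protecting a busy IO source with the rate limit configured above: at most 10
 * dispatches per 1 s interval, after which the source is taken offline temporarily and
 * brought back automatically once the interval passes (names are made up):
 *
 *     r = sd_event_add_io(e, &s, fd, EPOLLIN, on_io, NULL);
 *     if (r < 0)
 *             return r;
 *     r = sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 10);
 *     if (r < 0)
 *             return r;
 */
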
4513 _public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
4514 assert_return(s, -EINVAL);
4515
4516 s->ratelimit_expire_callback = callback;
4517 return 0;
4518 }
4519
4520 _public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
4521 assert_return(s, -EINVAL);
4522
4523 /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
4524 * don't use assert_return(). Unlike turning on ratelimiting, it's not really a programming error. */
4525 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
4526 return -EDOM;
4527
4528 if (!ratelimit_configured(&s->rate_limit))
4529 return -ENOEXEC;
4530
4531 if (ret_interval)
4532 *ret_interval = s->rate_limit.interval;
4533 if (ret_burst)
4534 *ret_burst = s->rate_limit.burst;
4535
4536 return 0;
4537 }
4538
4539 _public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
4540 assert_return(s, -EINVAL);
4541
4542 if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
4543 return false;
4544
4545 if (!ratelimit_configured(&s->rate_limit))
4546 return false;
4547
4548 return s->ratelimited;
4549 }
4550