/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <sys/wait.h>

#include "sd-daemon.h"
#include "sd-event.h"
#include "sd-id128.h"

#include "alloc-util.h"
#include "env-util.h"
#include "event-source.h"
#include "fd-util.h"
#include "fs-util.h"
#include "hashmap.h"
#include "list.h"
#include "macro.h"
#include "memory-util.h"
#include "missing_syscall.h"
#include "prioq.h"
#include "process-util.h"
#include "set.h"
#include "signal-util.h"
#include "string-table.h"
#include "string-util.h"
#include "strxcpyx.h"
#include "time-util.h"

#define DEFAULT_ACCURACY_USEC (250 * USEC_PER_MSEC)

static bool EVENT_SOURCE_WATCH_PIDFD(sd_event_source *s) {
        /* Returns true if this is a PID event source and can be implemented by watching EPOLLIN */
        return s &&
                s->type == SOURCE_CHILD &&
                s->child.pidfd >= 0 &&
                s->child.options == WEXITED;
}

static bool event_source_is_online(sd_event_source *s) {
        assert(s);
        return s->enabled != SD_EVENT_OFF && !s->ratelimited;
}

static bool event_source_is_offline(sd_event_source *s) {
        assert(s);
        return s->enabled == SD_EVENT_OFF || s->ratelimited;
}

static const char* const event_source_type_table[_SOURCE_EVENT_SOURCE_TYPE_MAX] = {
        [SOURCE_IO] = "io",
        [SOURCE_TIME_REALTIME] = "realtime",
        [SOURCE_TIME_BOOTTIME] = "boottime",
        [SOURCE_TIME_MONOTONIC] = "monotonic",
        [SOURCE_TIME_REALTIME_ALARM] = "realtime-alarm",
        [SOURCE_TIME_BOOTTIME_ALARM] = "boottime-alarm",
        [SOURCE_SIGNAL] = "signal",
        [SOURCE_CHILD] = "child",
        [SOURCE_DEFER] = "defer",
        [SOURCE_POST] = "post",
        [SOURCE_EXIT] = "exit",
        [SOURCE_WATCHDOG] = "watchdog",
        [SOURCE_INOTIFY] = "inotify",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(event_source_type, int);

#define EVENT_SOURCE_IS_TIME(t)                 \
        IN_SET((t),                             \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM)

#define EVENT_SOURCE_CAN_RATE_LIMIT(t)          \
        IN_SET((t),                             \
               SOURCE_IO,                       \
               SOURCE_TIME_REALTIME,            \
               SOURCE_TIME_BOOTTIME,            \
               SOURCE_TIME_MONOTONIC,           \
               SOURCE_TIME_REALTIME_ALARM,      \
               SOURCE_TIME_BOOTTIME_ALARM,      \
               SOURCE_SIGNAL,                   \
               SOURCE_DEFER,                    \
               SOURCE_INOTIFY)

/* This is used to assert that we didn't pass an unexpected source type to event_source_time_prioq_put().
 * Time sources and ratelimited sources can be passed, so effectively this is the same as the
 * EVENT_SOURCE_CAN_RATE_LIMIT() macro. */
#define EVENT_SOURCE_USES_TIME_PRIOQ(t) EVENT_SOURCE_CAN_RATE_LIMIT(t)
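
/* Illustrative note (not part of the original source): callers can rate-limit any of the source types
 * listed in EVENT_SOURCE_CAN_RATE_LIMIT() through the public sd_event_source_set_ratelimit() call, e.g.
 *
 *         sd_event_source_set_ratelimit(s, 10 * USEC_PER_SEC, 100);
 *
 * which allows at most 100 dispatches per 10s interval; while the limit is hit the source is considered
 * offline, and it is brought back once the window (tracked on CLOCK_MONOTONIC, see below) elapses. */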

struct sd_event {
        unsigned n_ref;

        int epoll_fd;
        int watchdog_fd;

        Prioq *pending;
        Prioq *prepare;

        /* timerfd_create() only supports these five clocks so far. We
         * can add support for more clocks when the kernel learns to
         * deal with them, too. */
        struct clock_data realtime;
        struct clock_data boottime;
        struct clock_data monotonic;
        struct clock_data realtime_alarm;
        struct clock_data boottime_alarm;

        usec_t perturb;

        sd_event_source **signal_sources; /* indexed by signal number */
        Hashmap *signal_data; /* indexed by priority */

        Hashmap *child_sources;
        unsigned n_online_child_sources;

        Set *post_sources;

        Prioq *exit;

        Hashmap *inotify_data; /* indexed by priority */

        /* A list of inode structures that still have an fd open, that we need to close before the next loop iteration */
        LIST_HEAD(struct inode_data, inode_data_to_close);

        /* A list of inotify objects that already have events buffered which aren't processed yet */
        LIST_HEAD(struct inotify_data, inotify_data_buffered);

        pid_t original_pid;

        uint64_t iteration;
        triple_timestamp timestamp;
        int state;

        bool exit_requested:1;
        bool need_process_child:1;
        bool watchdog:1;
        bool profile_delays:1;

        int exit_code;

        pid_t tid;
        sd_event **default_event_ptr;

        usec_t watchdog_last, watchdog_period;

        unsigned n_sources;

        struct epoll_event *event_queue;

        LIST_HEAD(sd_event_source, sources);

        usec_t last_run_usec, last_log_usec;
        unsigned delays[sizeof(usec_t) * 8];
};

static thread_local sd_event *default_event = NULL;

static void source_disconnect(sd_event_source *s);
static void event_gc_inode_data(sd_event *e, struct inode_data *d);

static sd_event *event_resolve(sd_event *e) {
        return e == SD_EVENT_DEFAULT ? default_event : e;
}

static int pending_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->pending);
        assert(y->pending);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Lower priority values first */
        r = CMP(x->priority, y->priority);
        if (r != 0)
                return r;

        /* Older entries first */
        return CMP(x->pending_iteration, y->pending_iteration);
}

static int prepare_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->prepare);
        assert(y->prepare);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Non rate-limited ones first. */
        r = CMP(!!x->ratelimited, !!y->ratelimited);
        if (r != 0)
                return r;

        /* Move most recently prepared ones last, so that we can stop
         * preparing as soon as we hit one that has already been
         * prepared in the current iteration */
        r = CMP(x->prepare_iteration, y->prepare_iteration);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}

static usec_t time_event_source_next(const sd_event_source *s) {
        assert(s);

        /* We have two kinds of event sources that have elapsation times associated with them: the actual
         * time based ones and the ones for which a ratelimit can be in effect (where we want to be notified
         * once the ratelimit time window ends). Let's return the next elapsing time depending on what we are
         * looking at here. */

        if (s->ratelimited) { /* If rate-limited the next elapsation is when the ratelimit time window ends */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Otherwise this must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return s->time.next;

        return USEC_INFINITY;
}

static usec_t time_event_source_latest(const sd_event_source *s) {
        assert(s);

        if (s->ratelimited) { /* For ratelimited stuff the earliest and the latest time shall actually be the
                               * same, as we should avoid adding additional inaccuracy on top of the
                               * ratelimit time window */
                assert(s->rate_limit.begin != 0);
                assert(s->rate_limit.interval != 0);
                return usec_add(s->rate_limit.begin, s->rate_limit.interval);
        }

        /* Must be a time event source, if not ratelimited */
        if (EVENT_SOURCE_IS_TIME(s->type))
                return usec_add(s->time.next, s->time.accuracy);

        return USEC_INFINITY;
}

static bool event_source_timer_candidate(const sd_event_source *s) {
        assert(s);

        /* Returns true for event sources that either are not pending yet (i.e. where it's worth marking them pending)
         * or which are currently ratelimited (i.e. where it's worth leaving the ratelimited state) */
        return !s->pending || s->ratelimited;
}

static int time_prioq_compare(const void *a, const void *b, usec_t (*time_func)(const sd_event_source *s)) {
        const sd_event_source *x = a, *y = b;
        int r;

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Order "non-pending OR ratelimited" before "pending AND not-ratelimited" */
        r = CMP(!event_source_timer_candidate(x), !event_source_timer_candidate(y));
        if (r != 0)
                return r;

        /* Order by time */
        return CMP(time_func(x), time_func(y));
}

static int earliest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_next);
}

static int latest_time_prioq_compare(const void *a, const void *b) {
        return time_prioq_compare(a, b, time_event_source_latest);
}

static int exit_prioq_compare(const void *a, const void *b) {
        const sd_event_source *x = a, *y = b;
        int r;

        assert(x->type == SOURCE_EXIT);
        assert(y->type == SOURCE_EXIT);

        /* Enabled ones first */
        r = CMP(x->enabled == SD_EVENT_OFF, y->enabled == SD_EVENT_OFF);
        if (r != 0)
                return r;

        /* Lower priority values first */
        return CMP(x->priority, y->priority);
}

static void free_clock_data(struct clock_data *d) {
        assert(d);
        assert(d->wakeup == WAKEUP_CLOCK_DATA);

        safe_close(d->fd);
        prioq_free(d->earliest);
        prioq_free(d->latest);
}

static sd_event *event_free(sd_event *e) {
        sd_event_source *s;

        assert(e);

        while ((s = e->sources)) {
                assert(s->floating);
                source_disconnect(s);
                sd_event_source_unref(s);
        }

        assert(e->n_sources == 0);

        if (e->default_event_ptr)
                *(e->default_event_ptr) = NULL;

        safe_close(e->epoll_fd);
        safe_close(e->watchdog_fd);

        free_clock_data(&e->realtime);
        free_clock_data(&e->boottime);
        free_clock_data(&e->monotonic);
        free_clock_data(&e->realtime_alarm);
        free_clock_data(&e->boottime_alarm);

        prioq_free(e->pending);
        prioq_free(e->prepare);
        prioq_free(e->exit);

        free(e->signal_sources);
        hashmap_free(e->signal_data);

        hashmap_free(e->inotify_data);

        hashmap_free(e->child_sources);
        set_free(e->post_sources);

        free(e->event_queue);

        return mfree(e);
}

_public_ int sd_event_new(sd_event** ret) {
        sd_event *e;
        int r;

        assert_return(ret, -EINVAL);

        e = new(sd_event, 1);
        if (!e)
                return -ENOMEM;

        *e = (sd_event) {
                .n_ref = 1,
                .epoll_fd = -1,
                .watchdog_fd = -1,
                .realtime.wakeup = WAKEUP_CLOCK_DATA,
                .realtime.fd = -1,
                .realtime.next = USEC_INFINITY,
                .boottime.wakeup = WAKEUP_CLOCK_DATA,
                .boottime.fd = -1,
                .boottime.next = USEC_INFINITY,
                .monotonic.wakeup = WAKEUP_CLOCK_DATA,
                .monotonic.fd = -1,
                .monotonic.next = USEC_INFINITY,
                .realtime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .realtime_alarm.fd = -1,
                .realtime_alarm.next = USEC_INFINITY,
                .boottime_alarm.wakeup = WAKEUP_CLOCK_DATA,
                .boottime_alarm.fd = -1,
                .boottime_alarm.next = USEC_INFINITY,
                .perturb = USEC_INFINITY,
                .original_pid = getpid_cached(),
        };

        r = prioq_ensure_allocated(&e->pending, pending_prioq_compare);
        if (r < 0)
                goto fail;

        e->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
        if (e->epoll_fd < 0) {
                r = -errno;
                goto fail;
        }

        e->epoll_fd = fd_move_above_stdio(e->epoll_fd);

        if (secure_getenv("SD_EVENT_PROFILE_DELAYS")) {
                log_debug("Event loop profiling enabled. Logarithmic histogram of event loop iterations in the range 2^0 … 2^63 us will be logged every 5s.");
                e->profile_delays = true;
        }

        *ret = e;
        return 0;

fail:
        event_free(e);
        return r;
}

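/* Illustrative usage sketch (not part of the original file): a minimal consumer of this API typically does
 *
 *         sd_event *e = NULL;
 *         assert_se(sd_event_new(&e) >= 0);            (or sd_event_default(&e))
 *         ... add IO/time/signal/child sources ...
 *         assert_se(sd_event_loop(e) >= 0);
 *         e = sd_event_unref(e);
 *
 * sd_event_default() and sd_event_loop() are part of the public sd-event API, defined later in this
 * file. */
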
DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event, sd_event, event_free);

_public_ sd_event_source* sd_event_source_disable_unref(sd_event_source *s) {
        if (s)
                (void) sd_event_source_set_enabled(s, SD_EVENT_OFF);
        return sd_event_source_unref(s);
}

static bool event_pid_changed(sd_event *e) {
        assert(e);

        /* We don't support people creating an event loop and keeping
         * it around over a fork(). Let's complain. */

        return e->original_pid != getpid_cached();
}

static void source_io_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_IO);

        if (event_pid_changed(s->event))
                return;

        if (!s->io.registered)
                return;

        if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->io.fd, NULL) < 0)
                log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                strna(s->description), event_source_type_to_string(s->type));

        s->io.registered = false;
}

static int source_io_register(
                sd_event_source *s,
                int enabled,
                uint32_t events) {

        assert(s);
        assert(s->type == SOURCE_IO);
        assert(enabled != SD_EVENT_OFF);

        struct epoll_event ev = {
                .events = events | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                .data.ptr = s,
        };

        if (epoll_ctl(s->event->epoll_fd,
                      s->io.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                      s->io.fd, &ev) < 0)
                return -errno;

        s->io.registered = true;

        return 0;
}

static void source_child_pidfd_unregister(sd_event_source *s) {
        assert(s);
        assert(s->type == SOURCE_CHILD);

        if (event_pid_changed(s->event))
                return;

        if (!s->child.registered)
                return;

        if (EVENT_SOURCE_WATCH_PIDFD(s))
                if (epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, s->child.pidfd, NULL) < 0)
                        log_debug_errno(errno, "Failed to remove source %s (type %s) from epoll, ignoring: %m",
                                        strna(s->description), event_source_type_to_string(s->type));

        s->child.registered = false;
}

static int source_child_pidfd_register(sd_event_source *s, int enabled) {
        assert(s);
        assert(s->type == SOURCE_CHILD);
        assert(enabled != SD_EVENT_OFF);

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                struct epoll_event ev = {
                        .events = EPOLLIN | (enabled == SD_EVENT_ONESHOT ? EPOLLONESHOT : 0),
                        .data.ptr = s,
                };

                if (epoll_ctl(s->event->epoll_fd,
                              s->child.registered ? EPOLL_CTL_MOD : EPOLL_CTL_ADD,
                              s->child.pidfd, &ev) < 0)
                        return -errno;
        }

        s->child.registered = true;
        return 0;
}

static clockid_t event_source_type_to_clock(EventSourceType t) {

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return CLOCK_REALTIME;

        case SOURCE_TIME_BOOTTIME:
                return CLOCK_BOOTTIME;

        case SOURCE_TIME_MONOTONIC:
                return CLOCK_MONOTONIC;

        case SOURCE_TIME_REALTIME_ALARM:
                return CLOCK_REALTIME_ALARM;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return CLOCK_BOOTTIME_ALARM;

        default:
                return (clockid_t) -1;
        }
}

static EventSourceType clock_to_event_source_type(clockid_t clock) {

        switch (clock) {

        case CLOCK_REALTIME:
                return SOURCE_TIME_REALTIME;

        case CLOCK_BOOTTIME:
                return SOURCE_TIME_BOOTTIME;

        case CLOCK_MONOTONIC:
                return SOURCE_TIME_MONOTONIC;

        case CLOCK_REALTIME_ALARM:
                return SOURCE_TIME_REALTIME_ALARM;

        case CLOCK_BOOTTIME_ALARM:
                return SOURCE_TIME_BOOTTIME_ALARM;

        default:
                return _SOURCE_EVENT_SOURCE_TYPE_INVALID;
        }
}

static struct clock_data* event_get_clock_data(sd_event *e, EventSourceType t) {
        assert(e);

        switch (t) {

        case SOURCE_TIME_REALTIME:
                return &e->realtime;

        case SOURCE_TIME_BOOTTIME:
                return &e->boottime;

        case SOURCE_TIME_MONOTONIC:
                return &e->monotonic;

        case SOURCE_TIME_REALTIME_ALARM:
                return &e->realtime_alarm;

        case SOURCE_TIME_BOOTTIME_ALARM:
                return &e->boottime_alarm;

        default:
                return NULL;
        }
}

static void event_free_signal_data(sd_event *e, struct signal_data *d) {
        assert(e);

        if (!d)
                return;

        hashmap_remove(e->signal_data, &d->priority);
        safe_close(d->fd);
        free(d);
}

static int event_make_signal_data(
                sd_event *e,
                int sig,
                struct signal_data **ret) {

        struct signal_data *d;
        bool added = false;
        sigset_t ss_copy;
        int64_t priority;
        int r;

        assert(e);

        if (event_pid_changed(e))
                return -ECHILD;

        if (e->signal_sources && e->signal_sources[sig])
                priority = e->signal_sources[sig]->priority;
        else
                priority = SD_EVENT_PRIORITY_NORMAL;

        d = hashmap_get(e->signal_data, &priority);
        if (d) {
                if (sigismember(&d->sigset, sig) > 0) {
                        if (ret)
                                *ret = d;
                        return 0;
                }
        } else {
                d = new(struct signal_data, 1);
                if (!d)
                        return -ENOMEM;

                *d = (struct signal_data) {
                        .wakeup = WAKEUP_SIGNAL_DATA,
                        .fd = -1,
                        .priority = priority,
                };

                r = hashmap_ensure_put(&e->signal_data, &uint64_hash_ops, &d->priority, d);
                if (r < 0) {
                        free(d);
                        return r;
                }

                added = true;
        }

        ss_copy = d->sigset;
        assert_se(sigaddset(&ss_copy, sig) >= 0);

        r = signalfd(d->fd, &ss_copy, SFD_NONBLOCK|SFD_CLOEXEC);
        if (r < 0) {
                r = -errno;
                goto fail;
        }

        d->sigset = ss_copy;

        if (d->fd >= 0) {
                if (ret)
                        *ret = d;
                return 0;
        }

        d->fd = fd_move_above_stdio(r);

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
                r = -errno;
                goto fail;
        }

        if (ret)
                *ret = d;

        return 0;

fail:
        if (added)
                event_free_signal_data(e, d);

        return r;
}

static void event_unmask_signal_data(sd_event *e, struct signal_data *d, int sig) {
        assert(e);
        assert(d);

        /* Turns off the specified signal in the signal data
         * object. If the signal mask of the object becomes empty that
         * way, the object is removed as well. */

        if (sigismember(&d->sigset, sig) == 0)
                return;

        assert_se(sigdelset(&d->sigset, sig) >= 0);

        if (sigisemptyset(&d->sigset)) {
                /* If the mask is now empty we can get rid of the structure */
                event_free_signal_data(e, d);
                return;
        }

        if (event_pid_changed(e))
                return;

        assert(d->fd >= 0);

        if (signalfd(d->fd, &d->sigset, SFD_NONBLOCK|SFD_CLOEXEC) < 0)
                log_debug_errno(errno, "Failed to unset signal bit, ignoring: %m");
}

static void event_gc_signal_data(sd_event *e, const int64_t *priority, int sig) {
        struct signal_data *d;
        static const int64_t zero_priority = 0;

        assert(e);

        /* Rechecks if the specified signal is still something we are interested in. If not, we'll unmask it,
         * and possibly drop the signalfd for it. */

        if (sig == SIGCHLD &&
            e->n_online_child_sources > 0)
                return;

        if (e->signal_sources &&
            e->signal_sources[sig] &&
            event_source_is_online(e->signal_sources[sig]))
                return;

        /*
         * The specified signal might be enabled in three different queues:
         *
         * 1) the one that belongs to the priority passed (if it is non-NULL)
         * 2) the one that belongs to the priority of the event source of the signal (if there is one)
         * 3) the 0 priority (to cover the SIGCHLD case)
         *
         * Hence, let's remove it from all three here.
         */

        if (priority) {
                d = hashmap_get(e->signal_data, priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        if (e->signal_sources && e->signal_sources[sig]) {
                d = hashmap_get(e->signal_data, &e->signal_sources[sig]->priority);
                if (d)
                        event_unmask_signal_data(e, d, sig);
        }

        d = hashmap_get(e->signal_data, &zero_priority);
        if (d)
                event_unmask_signal_data(e, d, sig);
}

static void event_source_pp_prioq_reshuffle(sd_event_source *s) {
        assert(s);

        /* Reshuffles the pending + prepare prioqs. Called whenever the dispatch order changes, i.e. when
         * they are enabled/disabled or marked pending and such. */

        if (s->pending)
                prioq_reshuffle(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_reshuffle(s->event->prepare, s, &s->prepare_index);
}

static void event_source_time_prioq_reshuffle(sd_event_source *s) {
        struct clock_data *d;

        assert(s);

        /* Called whenever the event source's timer ordering properties changed, i.e. time, accuracy,
         * pending, enable state, and ratelimiting state. Makes sure the two prioq's are ordered
         * properly again. */

        if (s->ratelimited)
                d = &s->event->monotonic;
        else if (EVENT_SOURCE_IS_TIME(s->type))
                assert_se(d = event_get_clock_data(s->event, s->type));
        else
                return; /* no-op for an event source which is neither a timer nor ratelimited. */

        prioq_reshuffle(d->earliest, s, &s->earliest_index);
        prioq_reshuffle(d->latest, s, &s->latest_index);
        d->needs_rearm = true;
}

static void event_source_time_prioq_remove(
                sd_event_source *s,
                struct clock_data *d) {

        assert(s);
        assert(d);

        prioq_remove(d->earliest, s, &s->earliest_index);
        prioq_remove(d->latest, s, &s->latest_index);
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        d->needs_rearm = true;
}

static void source_disconnect(sd_event_source *s) {
        sd_event *event;

        assert(s);

        if (!s->event)
                return;

        assert(s->event->n_sources > 0);

        switch (s->type) {

        case SOURCE_IO:
                if (s->io.fd >= 0)
                        source_io_unregister(s);

                break;

        case SOURCE_TIME_REALTIME:
        case SOURCE_TIME_BOOTTIME:
        case SOURCE_TIME_MONOTONIC:
        case SOURCE_TIME_REALTIME_ALARM:
        case SOURCE_TIME_BOOTTIME_ALARM:
                /* Only remove this event source from the time event source here if it is not ratelimited. If
                 * it is ratelimited, we'll remove it below, separately. Why? Because the clock used might
                 * differ: ratelimiting always uses CLOCK_MONOTONIC, but timer events might use any clock */

                if (!s->ratelimited) {
                        struct clock_data *d;
                        assert_se(d = event_get_clock_data(s->event, s->type));
                        event_source_time_prioq_remove(s, d);
                }

                break;

        case SOURCE_SIGNAL:
                if (s->signal.sig > 0) {

                        if (s->event->signal_sources)
                                s->event->signal_sources[s->signal.sig] = NULL;

                        event_gc_signal_data(s->event, &s->priority, s->signal.sig);
                }

                break;

        case SOURCE_CHILD:
                if (event_pid_changed(s->event))
                        s->child.process_owned = false;

                if (s->child.pid > 0) {
                        if (event_source_is_online(s)) {
                                assert(s->event->n_online_child_sources > 0);
                                s->event->n_online_child_sources--;
                        }

                        (void) hashmap_remove(s->event->child_sources, PID_TO_PTR(s->child.pid));
                }

                if (EVENT_SOURCE_WATCH_PIDFD(s))
                        source_child_pidfd_unregister(s);
                else
                        event_gc_signal_data(s->event, &s->priority, SIGCHLD);

                break;

        case SOURCE_DEFER:
                /* nothing */
                break;

        case SOURCE_POST:
                set_remove(s->event->post_sources, s);
                break;

        case SOURCE_EXIT:
                prioq_remove(s->event->exit, s, &s->exit.prioq_index);
                break;

        case SOURCE_INOTIFY: {
                struct inode_data *inode_data;

                inode_data = s->inotify.inode_data;
                if (inode_data) {
                        struct inotify_data *inotify_data;
                        assert_se(inotify_data = inode_data->inotify_data);

                        /* Detach this event source from the inode object */
                        LIST_REMOVE(inotify.by_inode_data, inode_data->event_sources, s);
                        s->inotify.inode_data = NULL;

                        if (s->pending) {
                                assert(inotify_data->n_pending > 0);
                                inotify_data->n_pending--;
                        }

                        /* Note that we don't reduce the inotify mask for the watch descriptor here if the inode
                         * continues to be watched. That's because inotify doesn't really have an API for that: we
                         * can only change watch masks with access to the original inode either by fd or by path. But
                         * paths aren't stable, and keeping an O_PATH fd open all the time would mean wasting an fd
                         * continuously and keeping the mount busy which we can't really do. We could reconstruct the
                         * original inode from /proc/self/fdinfo/$INOTIFY_FD (as all watch descriptors are listed
                         * there), but given the need for open_by_handle_at() which is privileged and not universally
                         * available this would be quite an incomplete solution. Hence we go the other way, leave the
                         * mask set, even if it is not minimized now, and ignore all events we aren't interested in
                         * anymore after reception. Yes, this sucks, but … Linux … */

                        /* Maybe release the inode data (and its inotify) */
                        event_gc_inode_data(s->event, inode_data);
                }

                break;
        }

        default:
                assert_not_reached();
        }

        if (s->pending)
                prioq_remove(s->event->pending, s, &s->pending_index);

        if (s->prepare)
                prioq_remove(s->event->prepare, s, &s->prepare_index);

        if (s->ratelimited)
                event_source_time_prioq_remove(s, &s->event->monotonic);

        event = TAKE_PTR(s->event);
        LIST_REMOVE(sources, event->sources, s);
        event->n_sources--;

        /* Note that we don't invalidate the type here, since we still need it in order to close the fd or
         * pidfd associated with this event source, which we'll do only on source_free(). */

        if (!s->floating)
                sd_event_unref(event);
}

static sd_event_source* source_free(sd_event_source *s) {
        assert(s);

        source_disconnect(s);

        if (s->type == SOURCE_IO && s->io.owned)
                s->io.fd = safe_close(s->io.fd);

        if (s->type == SOURCE_CHILD) {
                /* Eventually the kernel will do this automatically for us, but for now let's emulate this (unreliably) in userspace. */

                if (s->child.process_owned) {

                        if (!s->child.exited) {
                                bool sent = false;

                                if (s->child.pidfd >= 0) {
                                        if (pidfd_send_signal(s->child.pidfd, SIGKILL, NULL, 0) < 0) {
                                                if (errno == ESRCH) /* Already dead */
                                                        sent = true;
                                                else if (!ERRNO_IS_NOT_SUPPORTED(errno))
                                                        log_debug_errno(errno, "Failed to kill process " PID_FMT " via pidfd_send_signal(), re-trying via kill(): %m",
                                                                        s->child.pid);
                                        } else
                                                sent = true;
                                }

                                if (!sent)
                                        if (kill(s->child.pid, SIGKILL) < 0)
                                                if (errno != ESRCH) /* Already dead */
                                                        log_debug_errno(errno, "Failed to kill process " PID_FMT " via kill(), ignoring: %m",
                                                                        s->child.pid);
                        }

                        if (!s->child.waited) {
                                siginfo_t si = {};

                                /* Reap the child if we can */
                                (void) waitid(P_PID, s->child.pid, &si, WEXITED);
                        }
                }

                if (s->child.pidfd_owned)
                        s->child.pidfd = safe_close(s->child.pidfd);
        }

        if (s->destroy_callback)
                s->destroy_callback(s->userdata);

        free(s->description);
        return mfree(s);
}
DEFINE_TRIVIAL_CLEANUP_FUNC(sd_event_source*, source_free);

static int source_set_pending(sd_event_source *s, bool b) {
        int r;

        assert(s);
        assert(s->type != SOURCE_EXIT);

        if (s->pending == b)
                return 0;

        s->pending = b;

        if (b) {
                s->pending_iteration = s->event->iteration;

                r = prioq_put(s->event->pending, s, &s->pending_index);
                if (r < 0) {
                        s->pending = false;
                        return r;
                }
        } else
                assert_se(prioq_remove(s->event->pending, s, &s->pending_index));

        if (EVENT_SOURCE_IS_TIME(s->type))
                event_source_time_prioq_reshuffle(s);

        if (s->type == SOURCE_SIGNAL && !b) {
                struct signal_data *d;

                d = hashmap_get(s->event->signal_data, &s->priority);
                if (d && d->current == s)
                        d->current = NULL;
        }

        if (s->type == SOURCE_INOTIFY) {

                assert(s->inotify.inode_data);
                assert(s->inotify.inode_data->inotify_data);

                if (b)
                        s->inotify.inode_data->inotify_data->n_pending ++;
                else {
                        assert(s->inotify.inode_data->inotify_data->n_pending > 0);
                        s->inotify.inode_data->inotify_data->n_pending --;
                }
        }

        return 1;
}

static sd_event_source *source_new(sd_event *e, bool floating, EventSourceType type) {
        sd_event_source *s;

        assert(e);

        s = new(sd_event_source, 1);
        if (!s)
                return NULL;

        *s = (struct sd_event_source) {
                .n_ref = 1,
                .event = e,
                .floating = floating,
                .type = type,
                .pending_index = PRIOQ_IDX_NULL,
                .prepare_index = PRIOQ_IDX_NULL,
        };

        if (!floating)
                sd_event_ref(e);

        LIST_PREPEND(sources, e->sources, s);
        e->n_sources++;

        return s;
}

static int io_exit_callback(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_io(
                sd_event *e,
                sd_event_source **ret,
                int fd,
                uint32_t events,
                sd_event_io_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(fd >= 0, -EBADF);
        assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = io_exit_callback;

        s = source_new(e, !ret, SOURCE_IO);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->io.fd = fd;
        s->io.events = events;
        s->io.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        r = source_io_register(s, s->enabled, events);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}

static void initialize_perturb(sd_event *e) {
        sd_id128_t bootid = {};

        /* When we sleep for longer, we try to realign the wakeup to
           the same time within each minute/second/250ms, so that
           events all across the system can be coalesced into a single
           CPU wakeup. However, let's take some system-specific
           randomness for this value, so that in a network of systems
           with synced clocks timer events are distributed a
           bit. Here, we calculate a perturbation usec offset from the
           boot ID. */

        if (_likely_(e->perturb != USEC_INFINITY))
                return;

        if (sd_id128_get_boot(&bootid) >= 0)
                e->perturb = (bootid.qwords[0] ^ bootid.qwords[1]) % USEC_PER_MINUTE;
}

static int event_setup_timer_fd(
                sd_event *e,
                struct clock_data *d,
                clockid_t clock) {

        assert(e);
        assert(d);

        if (_likely_(d->fd >= 0))
                return 0;

        _cleanup_close_ int fd = -1;

        fd = timerfd_create(clock, TFD_NONBLOCK|TFD_CLOEXEC);
        if (fd < 0)
                return -errno;

        fd = fd_move_above_stdio(fd);

        struct epoll_event ev = {
                .events = EPOLLIN,
                .data.ptr = d,
        };

        if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, fd, &ev) < 0)
                return -errno;

        d->fd = TAKE_FD(fd);
        return 0;
}

static int time_exit_callback(sd_event_source *s, uint64_t usec, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static int setup_clock_data(sd_event *e, struct clock_data *d, clockid_t clock) {
        int r;

        assert(d);

        if (d->fd < 0) {
                r = event_setup_timer_fd(e, d, clock);
                if (r < 0)
                        return r;
        }

        r = prioq_ensure_allocated(&d->earliest, earliest_time_prioq_compare);
        if (r < 0)
                return r;

        r = prioq_ensure_allocated(&d->latest, latest_time_prioq_compare);
        if (r < 0)
                return r;

        return 0;
}

static int event_source_time_prioq_put(
                sd_event_source *s,
                struct clock_data *d) {

        int r;

        assert(s);
        assert(d);
        assert(EVENT_SOURCE_USES_TIME_PRIOQ(s->type));

        r = prioq_put(d->earliest, s, &s->earliest_index);
        if (r < 0)
                return r;

        r = prioq_put(d->latest, s, &s->latest_index);
        if (r < 0) {
                assert_se(prioq_remove(d->earliest, s, &s->earliest_index) > 0);
                s->earliest_index = PRIOQ_IDX_NULL;
                return r;
        }

        d->needs_rearm = true;
        return 0;
}

_public_ int sd_event_add_time(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        EventSourceType type;
        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct clock_data *d;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(accuracy != UINT64_MAX, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!clock_supported(clock)) /* Checks whether the kernel supports the clock */
                return -EOPNOTSUPP;

        type = clock_to_event_source_type(clock); /* checks whether sd-event supports this clock */
        if (type < 0)
                return -EOPNOTSUPP;

        if (!callback)
                callback = time_exit_callback;

        assert_se(d = event_get_clock_data(e, type));

        r = setup_clock_data(e, d, clock);
        if (r < 0)
                return r;

        s = source_new(e, !ret, type);
        if (!s)
                return -ENOMEM;

        s->time.next = usec;
        s->time.accuracy = accuracy == 0 ? DEFAULT_ACCURACY_USEC : accuracy;
        s->time.callback = callback;
        s->earliest_index = s->latest_index = PRIOQ_IDX_NULL;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = event_source_time_prioq_put(s, d);
        if (r < 0)
                return r;

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
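
/* Illustrative example (not from the original source; the handler name and the 5s timeout are assumptions):
 *
 *         static int on_timer(sd_event_source *s, uint64_t usec, void *userdata) {
 *                 log_info("timer elapsed");
 *                 return 0;
 *         }
 *
 *         r = sd_event_add_time(e, &timer_source, CLOCK_MONOTONIC,
 *                               usec_add(now(CLOCK_MONOTONIC), 5 * USEC_PER_SEC),
 *                               0, on_timer, NULL);
 *
 * Passing accuracy == 0 selects DEFAULT_ACCURACY_USEC; sd_event_add_time_relative() below wraps the same
 * call with overflow-checked relative-time handling. */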

_public_ int sd_event_add_time_relative(
                sd_event *e,
                sd_event_source **ret,
                clockid_t clock,
                uint64_t usec,
                uint64_t accuracy,
                sd_event_time_handler_t callback,
                void *userdata) {

        usec_t t;
        int r;

        /* Same as sd_event_add_time() but operates relative to the event loop's current point in time, and
         * checks for overflow. */

        r = sd_event_now(e, clock, &t);
        if (r < 0)
                return r;

        if (usec >= USEC_INFINITY - t)
                return -EOVERFLOW;

        return sd_event_add_time(e, ret, clock, t + usec, accuracy, callback, userdata);
}

static int signal_exit_callback(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

_public_ int sd_event_add_signal(
                sd_event *e,
                sd_event_source **ret,
                int sig,
                sd_event_signal_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        struct signal_data *d;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(SIGNAL_VALID(sig), -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = signal_exit_callback;

        r = signal_is_blocked(sig);
        if (r < 0)
                return r;
        if (r == 0)
                return -EBUSY;

        if (!e->signal_sources) {
                e->signal_sources = new0(sd_event_source*, _NSIG);
                if (!e->signal_sources)
                        return -ENOMEM;
        } else if (e->signal_sources[sig])
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_SIGNAL);
        if (!s)
                return -ENOMEM;

        s->signal.sig = sig;
        s->signal.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ON;

        e->signal_sources[sig] = s;

        r = event_make_signal_data(e, sig, &d);
        if (r < 0)
                return r;

        /* Use the signal name as description for the event source by default */
        (void) sd_event_source_set_description(s, signal_to_string(sig));

        if (ret)
                *ret = s;
        TAKE_PTR(s);

        return 0;
}
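
/* Illustrative example (not from the original source): the signal must already be blocked in the calling
 * thread, otherwise the signal_is_blocked() check above makes this function return -EBUSY. A typical
 * caller therefore does something like
 *
 *         assert_se(sigprocmask_many(SIG_BLOCK, NULL, SIGTERM, SIGINT, -1) >= 0);
 *         r = sd_event_add_signal(e, NULL, SIGTERM, NULL, NULL);
 *
 * sigprocmask_many() is the helper from signal-util.h included above; plain sigprocmask() or
 * pthread_sigmask() works just as well. With a NULL callback the default signal_exit_callback() is used,
 * i.e. receiving the signal exits the event loop. */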

static int child_exit_callback(sd_event_source *s, const siginfo_t *si, void *userdata) {
        assert(s);

        return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
}

static bool shall_use_pidfd(void) {
        /* Mostly relevant for debugging, i.e. this is used in test-event.c to test the event loop once with and once without pidfd */
        return getenv_bool_secure("SYSTEMD_PIDFD") != 0;
}

_public_ int sd_event_add_child(
                sd_event *e,
                sd_event_source **ret,
                pid_t pid,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pid > 1, -EINVAL);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                /* Caller must block SIGCHLD before using us to watch children, even if pidfd is available,
                 * for compatibility with pre-pidfd and because we don't want to reap the child processes
                 * ourselves, i.e. call waitid(), and don't want Linux' default internal logic for that to
                 * take effect.
                 *
                 * (As an optimization we only do this check on the first child event source created.) */
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.options = options;
        s->child.callback = callback;
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        /* We always take a pidfd here if we can, even if we wait for something other than WEXITED, so that we
         * pin the PID, and make regular waitid() handling race-free. */

        if (shall_use_pidfd()) {
                s->child.pidfd = pidfd_open(pid, 0);
                if (s->child.pidfd < 0) {
                        /* Propagate errors unless the syscall is not supported or blocked */
                        if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
                                return -errno;
                } else
                        s->child.pidfd_owned = true; /* If we allocate the pidfd we own it by default */
        } else
                s->child.pidfd = -1;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We have a pidfd and we only want to watch for exit */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;

        } else {
                /* We have no pidfd or we shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        /* These must be done after everything succeeds. */
        s->child.pid = pid;
        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);
        return 0;
}

_public_ int sd_event_add_child_pidfd(
                sd_event *e,
                sd_event_source **ret,
                int pidfd,
                int options,
                sd_event_child_handler_t callback,
                void *userdata) {

        _cleanup_(source_freep) sd_event_source *s = NULL;
        pid_t pid;
        int r;

        assert_return(e, -EINVAL);
        assert_return(e = event_resolve(e), -ENOPKG);
        assert_return(pidfd >= 0, -EBADF);
        assert_return(!(options & ~(WEXITED|WSTOPPED|WCONTINUED)), -EINVAL);
        assert_return(options != 0, -EINVAL);
        assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
        assert_return(!event_pid_changed(e), -ECHILD);

        if (!callback)
                callback = child_exit_callback;

        if (e->n_online_child_sources == 0) {
                r = signal_is_blocked(SIGCHLD);
                if (r < 0)
                        return r;
                if (r == 0)
                        return -EBUSY;
        }

        r = hashmap_ensure_allocated(&e->child_sources, NULL);
        if (r < 0)
                return r;

        r = pidfd_get_pid(pidfd, &pid);
        if (r < 0)
                return r;

        if (hashmap_contains(e->child_sources, PID_TO_PTR(pid)))
                return -EBUSY;

        s = source_new(e, !ret, SOURCE_CHILD);
        if (!s)
                return -ENOMEM;

        s->wakeup = WAKEUP_EVENT_SOURCE;
        s->child.pidfd = pidfd;
        s->child.pid = pid;
        s->child.options = options;
        s->child.callback = callback;
        s->child.pidfd_owned = false; /* If we got the pidfd passed in we don't own it by default (similar to the IO fd case) */
        s->userdata = userdata;
        s->enabled = SD_EVENT_ONESHOT;

        r = hashmap_put(e->child_sources, PID_TO_PTR(pid), s);
        if (r < 0)
                return r;

        if (EVENT_SOURCE_WATCH_PIDFD(s)) {
                /* We only want to watch for WEXITED */
                r = source_child_pidfd_register(s, s->enabled);
                if (r < 0)
                        return r;
        } else {
                /* We shall wait for some other event than WEXITED */
                r = event_make_signal_data(e, SIGCHLD, NULL);
                if (r < 0)
                        return r;

                e->need_process_child = true;
        }

        e->n_online_child_sources++;

        if (ret)
                *ret = s;
        TAKE_PTR(s);
        return 0;
}
1564 
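/* Illustrative usage sketch (not part of this file): with a pidfd obtained elsewhere, e.g. via the
 * pidfd_open() system call (invoked through syscall() here, since older libcs have no wrapper; needs
 * <unistd.h> and <sys/syscall.h>), a child source can be registered by fd instead of by PID. Note that,
 * as checked above, SIGCHLD still has to be blocked, since the child is ultimately reaped via waitid():
 *
 *     int pidfd = (int) syscall(SYS_pidfd_open, pid, 0);
 *     if (pidfd >= 0)
 *             (void) sd_event_add_child_pidfd(e, NULL, pidfd, WEXITED, on_child_exit, NULL);
 */
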
1565 static int generic_exit_callback(sd_event_source *s, void *userdata) {
1566         assert(s);
1567 
1568         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1569 }
1570 
1571 _public_ int sd_event_add_defer(
1572                 sd_event *e,
1573                 sd_event_source **ret,
1574                 sd_event_handler_t callback,
1575                 void *userdata) {
1576 
1577         _cleanup_(source_freep) sd_event_source *s = NULL;
1578         int r;
1579 
1580         assert_return(e, -EINVAL);
1581         assert_return(e = event_resolve(e), -ENOPKG);
1582         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1583         assert_return(!event_pid_changed(e), -ECHILD);
1584 
1585         if (!callback)
1586                 callback = generic_exit_callback;
1587 
1588         s = source_new(e, !ret, SOURCE_DEFER);
1589         if (!s)
1590                 return -ENOMEM;
1591 
1592         s->defer.callback = callback;
1593         s->userdata = userdata;
1594         s->enabled = SD_EVENT_ONESHOT;
1595 
1596         r = source_set_pending(s, true);
1597         if (r < 0)
1598                 return r;
1599 
1600         if (ret)
1601                 *ret = s;
1602         TAKE_PTR(s);
1603 
1604         return 0;
1605 }
1606 
1607 _public_ int sd_event_add_post(
1608                 sd_event *e,
1609                 sd_event_source **ret,
1610                 sd_event_handler_t callback,
1611                 void *userdata) {
1612 
1613         _cleanup_(source_freep) sd_event_source *s = NULL;
1614         int r;
1615 
1616         assert_return(e, -EINVAL);
1617         assert_return(e = event_resolve(e), -ENOPKG);
1618         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1619         assert_return(!event_pid_changed(e), -ECHILD);
1620 
1621         if (!callback)
1622                 callback = generic_exit_callback;
1623 
1624         s = source_new(e, !ret, SOURCE_POST);
1625         if (!s)
1626                 return -ENOMEM;
1627 
1628         s->post.callback = callback;
1629         s->userdata = userdata;
1630         s->enabled = SD_EVENT_ON;
1631 
1632         r = set_ensure_put(&e->post_sources, NULL, s);
1633         if (r < 0)
1634                 return r;
1635         assert(r > 0);
1636 
1637         if (ret)
1638                 *ret = s;
1639         TAKE_PTR(s);
1640 
1641         return 0;
1642 }
1643 
1644 _public_ int sd_event_add_exit(
1645                 sd_event *e,
1646                 sd_event_source **ret,
1647                 sd_event_handler_t callback,
1648                 void *userdata) {
1649 
1650         _cleanup_(source_freep) sd_event_source *s = NULL;
1651         int r;
1652 
1653         assert_return(e, -EINVAL);
1654         assert_return(e = event_resolve(e), -ENOPKG);
1655         assert_return(callback, -EINVAL);
1656         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
1657         assert_return(!event_pid_changed(e), -ECHILD);
1658 
1659         r = prioq_ensure_allocated(&e->exit, exit_prioq_compare);
1660         if (r < 0)
1661                 return r;
1662 
1663         s = source_new(e, !ret, SOURCE_EXIT);
1664         if (!s)
1665                 return -ENOMEM;
1666 
1667         s->exit.callback = callback;
1668         s->userdata = userdata;
1669         s->exit.prioq_index = PRIOQ_IDX_NULL;
1670         s->enabled = SD_EVENT_ONESHOT;
1671 
1672         r = prioq_put(s->event->exit, s, &s->exit.prioq_index);
1673         if (r < 0)
1674                 return r;
1675 
1676         if (ret)
1677                 *ret = s;
1678         TAKE_PTR(s);
1679 
1680         return 0;
1681 }
1682 
1683 static void event_free_inotify_data(sd_event *e, struct inotify_data *d) {
1684         assert(e);
1685 
1686         if (!d)
1687                 return;
1688 
1689         assert(hashmap_isempty(d->inodes));
1690         assert(hashmap_isempty(d->wd));
1691 
1692         if (d->buffer_filled > 0)
1693                 LIST_REMOVE(buffered, e->inotify_data_buffered, d);
1694 
1695         hashmap_free(d->inodes);
1696         hashmap_free(d->wd);
1697 
1698         assert_se(hashmap_remove(e->inotify_data, &d->priority) == d);
1699 
1700         if (d->fd >= 0) {
1701                 if (!event_pid_changed(e) &&
1702                     epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, d->fd, NULL) < 0)
1703                         log_debug_errno(errno, "Failed to remove inotify fd from epoll, ignoring: %m");
1704 
1705                 safe_close(d->fd);
1706         }
1707         free(d);
1708 }
1709 
1710 static int event_make_inotify_data(
1711                 sd_event *e,
1712                 int64_t priority,
1713                 struct inotify_data **ret) {
1714 
1715         _cleanup_close_ int fd = -1;
1716         struct inotify_data *d;
1717         int r;
1718 
1719         assert(e);
1720 
1721         d = hashmap_get(e->inotify_data, &priority);
1722         if (d) {
1723                 if (ret)
1724                         *ret = d;
1725                 return 0;
1726         }
1727 
1728         fd = inotify_init1(IN_NONBLOCK|IN_CLOEXEC);
1729         if (fd < 0)
1730                 return -errno;
1731 
1732         fd = fd_move_above_stdio(fd);
1733 
1734         d = new(struct inotify_data, 1);
1735         if (!d)
1736                 return -ENOMEM;
1737 
1738         *d = (struct inotify_data) {
1739                 .wakeup = WAKEUP_INOTIFY_DATA,
1740                 .fd = TAKE_FD(fd),
1741                 .priority = priority,
1742         };
1743 
1744         r = hashmap_ensure_put(&e->inotify_data, &uint64_hash_ops, &d->priority, d);
1745         if (r < 0) {
1746                 d->fd = safe_close(d->fd);
1747                 free(d);
1748                 return r;
1749         }
1750 
1751         struct epoll_event ev = {
1752                 .events = EPOLLIN,
1753                 .data.ptr = d,
1754         };
1755 
1756         if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, d->fd, &ev) < 0) {
1757                 r = -errno;
1758                 d->fd = safe_close(d->fd); /* let's close this ourselves, as event_free_inotify_data() would otherwise
1759                                             * remove the fd from the epoll first, which we don't want as we couldn't
1760                                             * add it in the first place. */
1761                 event_free_inotify_data(e, d);
1762                 return r;
1763         }
1764 
1765         if (ret)
1766                 *ret = d;
1767 
1768         return 1;
1769 }
1770 
1771 static int inode_data_compare(const struct inode_data *x, const struct inode_data *y) {
1772         int r;
1773 
1774         assert(x);
1775         assert(y);
1776 
1777         r = CMP(x->dev, y->dev);
1778         if (r != 0)
1779                 return r;
1780 
1781         return CMP(x->ino, y->ino);
1782 }
1783 
1784 static void inode_data_hash_func(const struct inode_data *d, struct siphash *state) {
1785         assert(d);
1786 
1787         siphash24_compress(&d->dev, sizeof(d->dev), state);
1788         siphash24_compress(&d->ino, sizeof(d->ino), state);
1789 }
1790 
1791 DEFINE_PRIVATE_HASH_OPS(inode_data_hash_ops, struct inode_data, inode_data_hash_func, inode_data_compare);
1792 
1793 static void event_free_inode_data(
1794                 sd_event *e,
1795                 struct inode_data *d) {
1796 
1797         assert(e);
1798 
1799         if (!d)
1800                 return;
1801 
1802         assert(!d->event_sources);
1803 
1804         if (d->fd >= 0) {
1805                 LIST_REMOVE(to_close, e->inode_data_to_close, d);
1806                 safe_close(d->fd);
1807         }
1808 
1809         if (d->inotify_data) {
1810 
1811                 if (d->wd >= 0) {
1812                         if (d->inotify_data->fd >= 0 && !event_pid_changed(e)) {
1813                                 /* So here's a problem. At the time this runs the watch descriptor might already be
1814                                  * invalidated, because an IN_IGNORED event might be queued at the very moment we
1815                                  * enter the syscall. Hence, whenever we get EINVAL, ignore it entirely, since this
1816                                  * is quite likely to happen. */
1817 
1818                                 if (inotify_rm_watch(d->inotify_data->fd, d->wd) < 0 && errno != EINVAL)
1819                                         log_debug_errno(errno, "Failed to remove watch descriptor %i from inotify, ignoring: %m", d->wd);
1820                         }
1821 
1822                         assert_se(hashmap_remove(d->inotify_data->wd, INT_TO_PTR(d->wd)) == d);
1823                 }
1824 
1825                 assert_se(hashmap_remove(d->inotify_data->inodes, d) == d);
1826         }
1827 
1828         free(d);
1829 }
1830 
1831 static void event_gc_inotify_data(
1832                 sd_event *e,
1833                 struct inotify_data *d) {
1834 
1835         assert(e);
1836 
1837         /* GCs the inotify data object if we don't need it anymore. That's the case if we don't want to watch
1838          * any inode with it anymore, which in turn happens if no event source of this priority is interested
1839          * in any inode any longer. That said, we maintain an extra busy counter: if non-zero we'll delay GC
1840          * (under the expectation that the GC is called again once the counter is decremented). */
1841 
1842         if (!d)
1843                 return;
1844 
1845         if (!hashmap_isempty(d->inodes))
1846                 return;
1847 
1848         if (d->n_busy > 0)
1849                 return;
1850 
1851         event_free_inotify_data(e, d);
1852 }
1853 
1854 static void event_gc_inode_data(
1855                 sd_event *e,
1856                 struct inode_data *d) {
1857 
1858         struct inotify_data *inotify_data;
1859 
1860         assert(e);
1861 
1862         if (!d)
1863                 return;
1864 
1865         if (d->event_sources)
1866                 return;
1867 
1868         inotify_data = d->inotify_data;
1869         event_free_inode_data(e, d);
1870 
1871         event_gc_inotify_data(e, inotify_data);
1872 }
1873 
1874 static int event_make_inode_data(
1875                 sd_event *e,
1876                 struct inotify_data *inotify_data,
1877                 dev_t dev,
1878                 ino_t ino,
1879                 struct inode_data **ret) {
1880 
1881         struct inode_data *d, key;
1882         int r;
1883 
1884         assert(e);
1885         assert(inotify_data);
1886 
1887         key = (struct inode_data) {
1888                 .ino = ino,
1889                 .dev = dev,
1890         };
1891 
1892         d = hashmap_get(inotify_data->inodes, &key);
1893         if (d) {
1894                 if (ret)
1895                         *ret = d;
1896 
1897                 return 0;
1898         }
1899 
1900         r = hashmap_ensure_allocated(&inotify_data->inodes, &inode_data_hash_ops);
1901         if (r < 0)
1902                 return r;
1903 
1904         d = new(struct inode_data, 1);
1905         if (!d)
1906                 return -ENOMEM;
1907 
1908         *d = (struct inode_data) {
1909                 .dev = dev,
1910                 .ino = ino,
1911                 .wd = -1,
1912                 .fd = -1,
1913                 .inotify_data = inotify_data,
1914         };
1915 
1916         r = hashmap_put(inotify_data->inodes, d, d);
1917         if (r < 0) {
1918                 free(d);
1919                 return r;
1920         }
1921 
1922         if (ret)
1923                 *ret = d;
1924 
1925         return 1;
1926 }
1927 
1928 static uint32_t inode_data_determine_mask(struct inode_data *d) {
1929         bool excl_unlink = true;
1930         uint32_t combined = 0;
1931 
1932         assert(d);
1933 
1934         /* Combines the watch masks of all event sources watching this inode. We generally just OR them together, but
1935          * the IN_EXCL_UNLINK flag is ANDed instead.
1936          *
1937          * Note that we add all sources to the mask here, regardless whether enabled, disabled or oneshot. That's
1938          * because we cannot change the mask anymore after the event source was created once, since the kernel has no
1939          * API for that. Hence we need to subscribe to the maximum mask we ever might be interested in, and suppress
1940          * events we don't care for client-side. */
1941 
1942         LIST_FOREACH(inotify.by_inode_data, s, d->event_sources) {
1943 
1944                 if ((s->inotify.mask & IN_EXCL_UNLINK) == 0)
1945                         excl_unlink = false;
1946 
1947                 combined |= s->inotify.mask;
1948         }
1949 
1950         return (combined & ~(IN_ONESHOT|IN_DONT_FOLLOW|IN_ONLYDIR|IN_EXCL_UNLINK)) | (excl_unlink ? IN_EXCL_UNLINK : 0);
1951 }
1952 
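/* A small worked example of the combination rule above (illustrative only): if one source watches this
 * inode with IN_CLOSE_WRITE|IN_EXCL_UNLINK and a second one with IN_MOVED_TO (without IN_EXCL_UNLINK),
 * the combined mask handed to the kernel is IN_CLOSE_WRITE|IN_MOVED_TO, and IN_EXCL_UNLINK is dropped,
 * since not every source asked for it. Events a particular source did not subscribe to are then
 * suppressed again client-side when the sources are dispatched. */
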
1953 static int inode_data_realize_watch(sd_event *e, struct inode_data *d) {
1954         uint32_t combined_mask;
1955         int wd, r;
1956 
1957         assert(d);
1958         assert(d->fd >= 0);
1959 
1960         combined_mask = inode_data_determine_mask(d);
1961 
1962         if (d->wd >= 0 && combined_mask == d->combined_mask)
1963                 return 0;
1964 
1965         r = hashmap_ensure_allocated(&d->inotify_data->wd, NULL);
1966         if (r < 0)
1967                 return r;
1968 
1969         wd = inotify_add_watch_fd(d->inotify_data->fd, d->fd, combined_mask);
1970         if (wd < 0)
1971                 return -errno;
1972 
1973         if (d->wd < 0) {
1974                 r = hashmap_put(d->inotify_data->wd, INT_TO_PTR(wd), d);
1975                 if (r < 0) {
1976                         (void) inotify_rm_watch(d->inotify_data->fd, wd);
1977                         return r;
1978                 }
1979 
1980                 d->wd = wd;
1981 
1982         } else if (d->wd != wd) {
1983 
1984                 log_debug("Weird, the watch descriptor we already knew for this inode changed?");
1985                 (void) inotify_rm_watch(d->inotify_data->fd, wd);
1986                 return -EINVAL;
1987         }
1988 
1989         d->combined_mask = combined_mask;
1990         return 1;
1991 }
1992 
1993 static int inotify_exit_callback(sd_event_source *s, const struct inotify_event *event, void *userdata) {
1994         assert(s);
1995 
1996         return sd_event_exit(sd_event_source_get_event(s), PTR_TO_INT(userdata));
1997 }
1998 
1999 static int event_add_inotify_fd_internal(
2000                 sd_event *e,
2001                 sd_event_source **ret,
2002                 int fd,
2003                 bool donate,
2004                 uint32_t mask,
2005                 sd_event_inotify_handler_t callback,
2006                 void *userdata) {
2007 
2008         _cleanup_close_ int donated_fd = donate ? fd : -1;
2009         _cleanup_(source_freep) sd_event_source *s = NULL;
2010         struct inotify_data *inotify_data = NULL;
2011         struct inode_data *inode_data = NULL;
2012         struct stat st;
2013         int r;
2014 
2015         assert_return(e, -EINVAL);
2016         assert_return(e = event_resolve(e), -ENOPKG);
2017         assert_return(fd >= 0, -EBADF);
2018         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
2019         assert_return(!event_pid_changed(e), -ECHILD);
2020 
2021         if (!callback)
2022                 callback = inotify_exit_callback;
2023 
2024         /* Refuse IN_MASK_ADD since we coalesce watches on the same inode, and hence really don't want to merge
2025          * masks. Or in other words, this whole code exists only to manage IN_MASK_ADD-type operations internally,
2026          * hence the caller cannot use the flag themselves. */
2027         if (mask & IN_MASK_ADD)
2028                 return -EINVAL;
2029 
2030         if (fstat(fd, &st) < 0)
2031                 return -errno;
2032 
2033         s = source_new(e, !ret, SOURCE_INOTIFY);
2034         if (!s)
2035                 return -ENOMEM;
2036 
2037         s->enabled = mask & IN_ONESHOT ? SD_EVENT_ONESHOT : SD_EVENT_ON;
2038         s->inotify.mask = mask;
2039         s->inotify.callback = callback;
2040         s->userdata = userdata;
2041 
2042         /* Allocate an inotify object for this priority, and an inode object within it */
2043         r = event_make_inotify_data(e, SD_EVENT_PRIORITY_NORMAL, &inotify_data);
2044         if (r < 0)
2045                 return r;
2046 
2047         r = event_make_inode_data(e, inotify_data, st.st_dev, st.st_ino, &inode_data);
2048         if (r < 0) {
2049                 event_gc_inotify_data(e, inotify_data);
2050                 return r;
2051         }
2052 
2053         /* Keep the O_PATH fd around until the first iteration of the loop, so that the priority of the event
2054          * source can still be changed until then; for that we need the original inode. */
2055         if (inode_data->fd < 0) {
2056                 if (donated_fd >= 0)
2057                         inode_data->fd = TAKE_FD(donated_fd);
2058                 else {
2059                         inode_data->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
2060                         if (inode_data->fd < 0) {
2061                                 r = -errno;
2062                                 event_gc_inode_data(e, inode_data);
2063                                 return r;
2064                         }
2065                 }
2066 
2067                 LIST_PREPEND(to_close, e->inode_data_to_close, inode_data);
2068         }
2069 
2070         /* Link our event source to the inode data object */
2071         LIST_PREPEND(inotify.by_inode_data, inode_data->event_sources, s);
2072         s->inotify.inode_data = inode_data;
2073 
2074         /* Actually realize the watch now */
2075         r = inode_data_realize_watch(e, inode_data);
2076         if (r < 0)
2077                 return r;
2078 
2079         if (ret)
2080                 *ret = s;
2081         TAKE_PTR(s);
2082 
2083         return 0;
2084 }
2085 
2086 _public_ int sd_event_add_inotify_fd(
2087                 sd_event *e,
2088                 sd_event_source **ret,
2089                 int fd,
2090                 uint32_t mask,
2091                 sd_event_inotify_handler_t callback,
2092                 void *userdata) {
2093 
2094         return event_add_inotify_fd_internal(e, ret, fd, /* donate= */ false, mask, callback, userdata);
2095 }
2096 
2097 _public_ int sd_event_add_inotify(
2098                 sd_event *e,
2099                 sd_event_source **ret,
2100                 const char *path,
2101                 uint32_t mask,
2102                 sd_event_inotify_handler_t callback,
2103                 void *userdata) {
2104 
2105         sd_event_source *s = NULL; /* avoid false maybe-uninitialized warning */
2106         int fd, r;
2107 
2108         assert_return(path, -EINVAL);
2109 
2110         fd = open(path, O_PATH|O_CLOEXEC|
2111                   (mask & IN_ONLYDIR ? O_DIRECTORY : 0)|
2112                   (mask & IN_DONT_FOLLOW ? O_NOFOLLOW : 0));
2113         if (fd < 0)
2114                 return -errno;
2115 
2116         r = event_add_inotify_fd_internal(e, &s, fd, /* donate= */ true, mask, callback, userdata);
2117         if (r < 0)
2118                 return r;
2119 
2120         (void) sd_event_source_set_description(s, path);
2121 
2122         if (ret)
2123                 *ret = s;
2124 
2125         return r;
2126 }
2127 
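/* Illustrative usage sketch (not part of this file): watching a directory for finished writes with the
 * path-based call above (on_inotify() is a hypothetical handler name):
 *
 *     static int on_inotify(sd_event_source *s, const struct inotify_event *ev, void *userdata) {
 *             printf("event 0x%x on %s\n", ev->mask, ev->len > 0 ? ev->name : "(watched inode)");
 *             return 0;
 *     }
 *
 *     sd_event_source *src;
 *     sd_event_add_inotify(e, &src, "/run/mydir", IN_CLOSE_WRITE|IN_ONLYDIR, on_inotify, NULL);
 */
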
2128 static sd_event_source* event_source_free(sd_event_source *s) {
2129         if (!s)
2130                 return NULL;
2131 
2132         /* Here's a special hack: when we are called from a
2133          * dispatch handler we won't free the event source
2134          * immediately, but we will detach the fd from the
2135          * epoll. This way it is safe for the caller to unref
2136          * the event source and immediately close the fd, but
2137          * we still retain a valid event source object after
2138          * the callback. */
2139 
2140         if (s->dispatching) {
2141                 if (s->type == SOURCE_IO)
2142                         source_io_unregister(s);
2143 
2144                 source_disconnect(s);
2145         } else
2146                 source_free(s);
2147 
2148         return NULL;
2149 }
2150 
2151 DEFINE_PUBLIC_TRIVIAL_REF_UNREF_FUNC(sd_event_source, sd_event_source, event_source_free);
2152 
2153 _public_ int sd_event_source_set_description(sd_event_source *s, const char *description) {
2154         assert_return(s, -EINVAL);
2155         assert_return(!event_pid_changed(s->event), -ECHILD);
2156 
2157         return free_and_strdup(&s->description, description);
2158 }
2159 
2160 _public_ int sd_event_source_get_description(sd_event_source *s, const char **description) {
2161         assert_return(s, -EINVAL);
2162         assert_return(description, -EINVAL);
2163         assert_return(!event_pid_changed(s->event), -ECHILD);
2164 
2165         if (!s->description)
2166                 return -ENXIO;
2167 
2168         *description = s->description;
2169         return 0;
2170 }
2171 
2172 _public_ sd_event *sd_event_source_get_event(sd_event_source *s) {
2173         assert_return(s, NULL);
2174 
2175         return s->event;
2176 }
2177 
2178 _public_ int sd_event_source_get_pending(sd_event_source *s) {
2179         assert_return(s, -EINVAL);
2180         assert_return(s->type != SOURCE_EXIT, -EDOM);
2181         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2182         assert_return(!event_pid_changed(s->event), -ECHILD);
2183 
2184         return s->pending;
2185 }
2186 
2187 _public_ int sd_event_source_get_io_fd(sd_event_source *s) {
2188         assert_return(s, -EINVAL);
2189         assert_return(s->type == SOURCE_IO, -EDOM);
2190         assert_return(!event_pid_changed(s->event), -ECHILD);
2191 
2192         return s->io.fd;
2193 }
2194 
2195 _public_ int sd_event_source_set_io_fd(sd_event_source *s, int fd) {
2196         int r;
2197 
2198         assert_return(s, -EINVAL);
2199         assert_return(fd >= 0, -EBADF);
2200         assert_return(s->type == SOURCE_IO, -EDOM);
2201         assert_return(!event_pid_changed(s->event), -ECHILD);
2202 
2203         if (s->io.fd == fd)
2204                 return 0;
2205 
2206         if (event_source_is_offline(s)) {
2207                 s->io.fd = fd;
2208                 s->io.registered = false;
2209         } else {
2210                 int saved_fd;
2211 
2212                 saved_fd = s->io.fd;
2213                 assert(s->io.registered);
2214 
2215                 s->io.fd = fd;
2216                 s->io.registered = false;
2217 
2218                 r = source_io_register(s, s->enabled, s->io.events);
2219                 if (r < 0) {
2220                         s->io.fd = saved_fd;
2221                         s->io.registered = true;
2222                         return r;
2223                 }
2224 
2225                 (void) epoll_ctl(s->event->epoll_fd, EPOLL_CTL_DEL, saved_fd, NULL);
2226         }
2227 
2228         return 0;
2229 }
2230 
2231 _public_ int sd_event_source_get_io_fd_own(sd_event_source *s) {
2232         assert_return(s, -EINVAL);
2233         assert_return(s->type == SOURCE_IO, -EDOM);
2234 
2235         return s->io.owned;
2236 }
2237 
2238 _public_ int sd_event_source_set_io_fd_own(sd_event_source *s, int own) {
2239         assert_return(s, -EINVAL);
2240         assert_return(s->type == SOURCE_IO, -EDOM);
2241 
2242         s->io.owned = own;
2243         return 0;
2244 }
2245 
2246 _public_ int sd_event_source_get_io_events(sd_event_source *s, uint32_t* events) {
2247         assert_return(s, -EINVAL);
2248         assert_return(events, -EINVAL);
2249         assert_return(s->type == SOURCE_IO, -EDOM);
2250         assert_return(!event_pid_changed(s->event), -ECHILD);
2251 
2252         *events = s->io.events;
2253         return 0;
2254 }
2255 
2256 _public_ int sd_event_source_set_io_events(sd_event_source *s, uint32_t events) {
2257         int r;
2258 
2259         assert_return(s, -EINVAL);
2260         assert_return(s->type == SOURCE_IO, -EDOM);
2261         assert_return(!(events & ~(EPOLLIN|EPOLLOUT|EPOLLRDHUP|EPOLLPRI|EPOLLERR|EPOLLHUP|EPOLLET)), -EINVAL);
2262         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2263         assert_return(!event_pid_changed(s->event), -ECHILD);
2264 
2265         /* edge-triggered updates are never skipped, so we can reset edges */
2266         if (s->io.events == events && !(events & EPOLLET))
2267                 return 0;
2268 
2269         r = source_set_pending(s, false);
2270         if (r < 0)
2271                 return r;
2272 
2273         if (event_source_is_online(s)) {
2274                 r = source_io_register(s, s->enabled, events);
2275                 if (r < 0)
2276                         return r;
2277         }
2278 
2279         s->io.events = events;
2280 
2281         return 0;
2282 }
2283 
2284 _public_ int sd_event_source_get_io_revents(sd_event_source *s, uint32_t* revents) {
2285         assert_return(s, -EINVAL);
2286         assert_return(revents, -EINVAL);
2287         assert_return(s->type == SOURCE_IO, -EDOM);
2288         assert_return(s->pending, -ENODATA);
2289         assert_return(!event_pid_changed(s->event), -ECHILD);
2290 
2291         *revents = s->io.revents;
2292         return 0;
2293 }
2294 
2295 _public_ int sd_event_source_get_signal(sd_event_source *s) {
2296         assert_return(s, -EINVAL);
2297         assert_return(s->type == SOURCE_SIGNAL, -EDOM);
2298         assert_return(!event_pid_changed(s->event), -ECHILD);
2299 
2300         return s->signal.sig;
2301 }
2302 
2303 _public_ int sd_event_source_get_priority(sd_event_source *s, int64_t *priority) {
2304         assert_return(s, -EINVAL);
2305         assert_return(!event_pid_changed(s->event), -ECHILD);
2306 
2307         *priority = s->priority;
2308         return 0;
2309 }
2310 
2311 _public_ int sd_event_source_set_priority(sd_event_source *s, int64_t priority) {
2312         bool rm_inotify = false, rm_inode = false;
2313         struct inotify_data *new_inotify_data = NULL;
2314         struct inode_data *new_inode_data = NULL;
2315         int r;
2316 
2317         assert_return(s, -EINVAL);
2318         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2319         assert_return(!event_pid_changed(s->event), -ECHILD);
2320 
2321         if (s->priority == priority)
2322                 return 0;
2323 
2324         if (s->type == SOURCE_INOTIFY) {
2325                 struct inode_data *old_inode_data;
2326 
2327                 assert(s->inotify.inode_data);
2328                 old_inode_data = s->inotify.inode_data;
2329 
2330                 /* We need the original fd to change the priority. If we don't have it we can't change the priority
2331                  * anymore. Note that we close any fds when entering the next event loop iteration, i.e. for inotify
2332                  * events we allow priority changes only until the first following iteration. */
2333                 if (old_inode_data->fd < 0)
2334                         return -EOPNOTSUPP;
2335 
2336                 r = event_make_inotify_data(s->event, priority, &new_inotify_data);
2337                 if (r < 0)
2338                         return r;
2339                 rm_inotify = r > 0;
2340 
2341                 r = event_make_inode_data(s->event, new_inotify_data, old_inode_data->dev, old_inode_data->ino, &new_inode_data);
2342                 if (r < 0)
2343                         goto fail;
2344                 rm_inode = r > 0;
2345 
2346                 if (new_inode_data->fd < 0) {
2347                         /* Duplicate the fd for the new inode object if we don't have any yet */
2348                         new_inode_data->fd = fcntl(old_inode_data->fd, F_DUPFD_CLOEXEC, 3);
2349                         if (new_inode_data->fd < 0) {
2350                                 r = -errno;
2351                                 goto fail;
2352                         }
2353 
2354                         LIST_PREPEND(to_close, s->event->inode_data_to_close, new_inode_data);
2355                 }
2356 
2357                 /* Move the event source to the new inode data structure */
2358                 LIST_REMOVE(inotify.by_inode_data, old_inode_data->event_sources, s);
2359                 LIST_PREPEND(inotify.by_inode_data, new_inode_data->event_sources, s);
2360                 s->inotify.inode_data = new_inode_data;
2361 
2362                 /* Now create the new watch */
2363                 r = inode_data_realize_watch(s->event, new_inode_data);
2364                 if (r < 0) {
2365                         /* Move it back */
2366                         LIST_REMOVE(inotify.by_inode_data, new_inode_data->event_sources, s);
2367                         LIST_PREPEND(inotify.by_inode_data, old_inode_data->event_sources, s);
2368                         s->inotify.inode_data = old_inode_data;
2369                         goto fail;
2370                 }
2371 
2372                 s->priority = priority;
2373 
2374                 event_gc_inode_data(s->event, old_inode_data);
2375 
2376         } else if (s->type == SOURCE_SIGNAL && event_source_is_online(s)) {
2377                 struct signal_data *old, *d;
2378 
2379                 /* Move us from the signalfd belonging to the old
2380                  * priority to the signalfd of the new priority */
2381 
2382                 assert_se(old = hashmap_get(s->event->signal_data, &s->priority));
2383 
2384                 s->priority = priority;
2385 
2386                 r = event_make_signal_data(s->event, s->signal.sig, &d);
2387                 if (r < 0) {
2388                         s->priority = old->priority;
2389                         return r;
2390                 }
2391 
2392                 event_unmask_signal_data(s->event, old, s->signal.sig);
2393         } else
2394                 s->priority = priority;
2395 
2396         event_source_pp_prioq_reshuffle(s);
2397 
2398         if (s->type == SOURCE_EXIT)
2399                 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2400 
2401         return 0;
2402 
2403 fail:
2404         if (rm_inode)
2405                 event_free_inode_data(s->event, new_inode_data);
2406 
2407         if (rm_inotify)
2408                 event_free_inotify_data(s->event, new_inotify_data);
2409 
2410         return r;
2411 }
2412 
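/* Usage note (illustrative, not part of this file): because the original O_PATH fd is closed when the
 * loop first iterates, the priority of an inotify source can only be changed right after it has been
 * created, e.g.:
 *
 *     sd_event_source *src;
 *     sd_event_add_inotify(e, &src, "/run/mydir", IN_CLOSE_WRITE, on_inotify, NULL);
 *     sd_event_source_set_priority(src, SD_EVENT_PRIORITY_IDLE);   // before the first sd_event_run()/_loop()
 */
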
2413 _public_ int sd_event_source_get_enabled(sd_event_source *s, int *ret) {
2414         assert_return(s, -EINVAL);
2415         assert_return(!event_pid_changed(s->event), -ECHILD);
2416 
2417         if (ret)
2418                 *ret = s->enabled;
2419 
2420         return s->enabled != SD_EVENT_OFF;
2421 }
2422 
2423 static int event_source_offline(
2424                 sd_event_source *s,
2425                 int enabled,
2426                 bool ratelimited) {
2427 
2428         bool was_offline;
2429         int r;
2430 
2431         assert(s);
2432         assert(enabled == SD_EVENT_OFF || ratelimited);
2433 
2434         /* Unset the pending flag when this event source is disabled */
2435         if (s->enabled != SD_EVENT_OFF &&
2436             enabled == SD_EVENT_OFF &&
2437             !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2438                 r = source_set_pending(s, false);
2439                 if (r < 0)
2440                         return r;
2441         }
2442 
2443         was_offline = event_source_is_offline(s);
2444         s->enabled = enabled;
2445         s->ratelimited = ratelimited;
2446 
2447         switch (s->type) {
2448 
2449         case SOURCE_IO:
2450                 source_io_unregister(s);
2451                 break;
2452 
2453         case SOURCE_SIGNAL:
2454                 event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2455                 break;
2456 
2457         case SOURCE_CHILD:
2458                 if (!was_offline) {
2459                         assert(s->event->n_online_child_sources > 0);
2460                         s->event->n_online_child_sources--;
2461                 }
2462 
2463                 if (EVENT_SOURCE_WATCH_PIDFD(s))
2464                         source_child_pidfd_unregister(s);
2465                 else
2466                         event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2467                 break;
2468 
2469         case SOURCE_EXIT:
2470                 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2471                 break;
2472 
2473         case SOURCE_TIME_REALTIME:
2474         case SOURCE_TIME_BOOTTIME:
2475         case SOURCE_TIME_MONOTONIC:
2476         case SOURCE_TIME_REALTIME_ALARM:
2477         case SOURCE_TIME_BOOTTIME_ALARM:
2478         case SOURCE_DEFER:
2479         case SOURCE_POST:
2480         case SOURCE_INOTIFY:
2481                 break;
2482 
2483         default:
2484                 assert_not_reached();
2485         }
2486 
2487         /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
2488         event_source_time_prioq_reshuffle(s);
2489 
2490         return 1;
2491 }
2492 
2493 static int event_source_online(
2494                 sd_event_source *s,
2495                 int enabled,
2496                 bool ratelimited) {
2497 
2498         bool was_online;
2499         int r;
2500 
2501         assert(s);
2502         assert(enabled != SD_EVENT_OFF || !ratelimited);
2503 
2504         /* Unset the pending flag when this event source is enabled */
2505         if (s->enabled == SD_EVENT_OFF &&
2506             enabled != SD_EVENT_OFF &&
2507             !IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
2508                 r = source_set_pending(s, false);
2509                 if (r < 0)
2510                         return r;
2511         }
2512 
2513         /* Are we really ready for onlining? */
2514         if (enabled == SD_EVENT_OFF || ratelimited) {
2515                 /* Nope, we are not ready for onlining, so just update the state fields and return */
2516                 s->enabled = enabled;
2517                 s->ratelimited = ratelimited;
2518                 return 0;
2519         }
2520 
2521         was_online = event_source_is_online(s);
2522 
2523         switch (s->type) {
2524         case SOURCE_IO:
2525                 r = source_io_register(s, enabled, s->io.events);
2526                 if (r < 0)
2527                         return r;
2528                 break;
2529 
2530         case SOURCE_SIGNAL:
2531                 r = event_make_signal_data(s->event, s->signal.sig, NULL);
2532                 if (r < 0) {
2533                         event_gc_signal_data(s->event, &s->priority, s->signal.sig);
2534                         return r;
2535                 }
2536 
2537                 break;
2538 
2539         case SOURCE_CHILD:
2540                 if (EVENT_SOURCE_WATCH_PIDFD(s)) {
2541                         /* yes, we have pidfd */
2542 
2543                         r = source_child_pidfd_register(s, enabled);
2544                         if (r < 0)
2545                                 return r;
2546                 } else {
2547                         /* no pidfd, or we are watching for something other than WEXITED */
2548 
2549                         r = event_make_signal_data(s->event, SIGCHLD, NULL);
2550                         if (r < 0) {
2551                                 event_gc_signal_data(s->event, &s->priority, SIGCHLD);
2552                                 return r;
2553                         }
2554                 }
2555 
2556                 if (!was_online)
2557                         s->event->n_online_child_sources++;
2558                 break;
2559 
2560         case SOURCE_TIME_REALTIME:
2561         case SOURCE_TIME_BOOTTIME:
2562         case SOURCE_TIME_MONOTONIC:
2563         case SOURCE_TIME_REALTIME_ALARM:
2564         case SOURCE_TIME_BOOTTIME_ALARM:
2565         case SOURCE_EXIT:
2566         case SOURCE_DEFER:
2567         case SOURCE_POST:
2568         case SOURCE_INOTIFY:
2569                 break;
2570 
2571         default:
2572                 assert_not_reached();
2573         }
2574 
2575         s->enabled = enabled;
2576         s->ratelimited = ratelimited;
2577 
2578         /* Non-failing operations below */
2579         if (s->type == SOURCE_EXIT)
2580                 prioq_reshuffle(s->event->exit, s, &s->exit.prioq_index);
2581 
2582         /* Always reshuffle time prioq, as the ratelimited flag may be changed. */
2583         event_source_time_prioq_reshuffle(s);
2584 
2585         return 1;
2586 }
2587 
2588 _public_ int sd_event_source_set_enabled(sd_event_source *s, int m) {
2589         int r;
2590 
2591         assert_return(s, -EINVAL);
2592         assert_return(IN_SET(m, SD_EVENT_OFF, SD_EVENT_ON, SD_EVENT_ONESHOT), -EINVAL);
2593         assert_return(!event_pid_changed(s->event), -ECHILD);
2594 
2595         /* If we are dead anyway, we are fine with turning off sources, but everything else needs to fail. */
2596         if (s->event->state == SD_EVENT_FINISHED)
2597                 return m == SD_EVENT_OFF ? 0 : -ESTALE;
2598 
2599         if (s->enabled == m) /* No change? */
2600                 return 0;
2601 
2602         if (m == SD_EVENT_OFF)
2603                 r = event_source_offline(s, m, s->ratelimited);
2604         else {
2605                 if (s->enabled != SD_EVENT_OFF) {
2606                         /* Switching from "on" to "oneshot" or back? If that's the case, we can take a shortcut, the
2607                          * event source is already enabled after all. */
2608                         s->enabled = m;
2609                         return 0;
2610                 }
2611 
2612                 r = event_source_online(s, m, s->ratelimited);
2613         }
2614         if (r < 0)
2615                 return r;
2616 
2617         event_source_pp_prioq_reshuffle(s);
2618         return 0;
2619 }
2620 
2621 _public_ int sd_event_source_get_time(sd_event_source *s, uint64_t *usec) {
2622         assert_return(s, -EINVAL);
2623         assert_return(usec, -EINVAL);
2624         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2625         assert_return(!event_pid_changed(s->event), -ECHILD);
2626 
2627         *usec = s->time.next;
2628         return 0;
2629 }
2630 
2631 _public_ int sd_event_source_set_time(sd_event_source *s, uint64_t usec) {
2632         int r;
2633 
2634         assert_return(s, -EINVAL);
2635         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2636         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2637         assert_return(!event_pid_changed(s->event), -ECHILD);
2638 
2639         r = source_set_pending(s, false);
2640         if (r < 0)
2641                 return r;
2642 
2643         s->time.next = usec;
2644 
2645         event_source_time_prioq_reshuffle(s);
2646         return 0;
2647 }
2648 
2649 _public_ int sd_event_source_set_time_relative(sd_event_source *s, uint64_t usec) {
2650         usec_t t;
2651         int r;
2652 
2653         assert_return(s, -EINVAL);
2654         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2655 
2656         r = sd_event_now(s->event, event_source_type_to_clock(s->type), &t);
2657         if (r < 0)
2658                 return r;
2659 
2660         usec = usec_add(t, usec);
2661         if (usec == USEC_INFINITY)
2662                 return -EOVERFLOW;
2663 
2664         return sd_event_source_set_time(s, usec);
2665 }
2666 
2667 _public_ int sd_event_source_get_time_accuracy(sd_event_source *s, uint64_t *usec) {
2668         assert_return(s, -EINVAL);
2669         assert_return(usec, -EINVAL);
2670         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2671         assert_return(!event_pid_changed(s->event), -ECHILD);
2672 
2673         *usec = s->time.accuracy;
2674         return 0;
2675 }
2676 
2677 _public_ int sd_event_source_set_time_accuracy(sd_event_source *s, uint64_t usec) {
2678         int r;
2679 
2680         assert_return(s, -EINVAL);
2681         assert_return(usec != UINT64_MAX, -EINVAL);
2682         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2683         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2684         assert_return(!event_pid_changed(s->event), -ECHILD);
2685 
2686         r = source_set_pending(s, false);
2687         if (r < 0)
2688                 return r;
2689 
2690         if (usec == 0)
2691                 usec = DEFAULT_ACCURACY_USEC;
2692 
2693         s->time.accuracy = usec;
2694 
2695         event_source_time_prioq_reshuffle(s);
2696         return 0;
2697 }
2698 
2699 _public_ int sd_event_source_get_time_clock(sd_event_source *s, clockid_t *clock) {
2700         assert_return(s, -EINVAL);
2701         assert_return(clock, -EINVAL);
2702         assert_return(EVENT_SOURCE_IS_TIME(s->type), -EDOM);
2703         assert_return(!event_pid_changed(s->event), -ECHILD);
2704 
2705         *clock = event_source_type_to_clock(s->type);
2706         return 0;
2707 }
2708 
2709 _public_ int sd_event_source_get_child_pid(sd_event_source *s, pid_t *pid) {
2710         assert_return(s, -EINVAL);
2711         assert_return(pid, -EINVAL);
2712         assert_return(s->type == SOURCE_CHILD, -EDOM);
2713         assert_return(!event_pid_changed(s->event), -ECHILD);
2714 
2715         *pid = s->child.pid;
2716         return 0;
2717 }
2718 
2719 _public_ int sd_event_source_get_child_pidfd(sd_event_source *s) {
2720         assert_return(s, -EINVAL);
2721         assert_return(s->type == SOURCE_CHILD, -EDOM);
2722         assert_return(!event_pid_changed(s->event), -ECHILD);
2723 
2724         if (s->child.pidfd < 0)
2725                 return -EOPNOTSUPP;
2726 
2727         return s->child.pidfd;
2728 }
2729 
2730 _public_ int sd_event_source_send_child_signal(sd_event_source *s, int sig, const siginfo_t *si, unsigned flags) {
2731         assert_return(s, -EINVAL);
2732         assert_return(s->type == SOURCE_CHILD, -EDOM);
2733         assert_return(!event_pid_changed(s->event), -ECHILD);
2734         assert_return(SIGNAL_VALID(sig), -EINVAL);
2735 
2736         /* If we already have seen indication the process exited refuse sending a signal early. This way we
2737          * can be sure we don't accidentally kill the wrong process on PID reuse when pidfds are not
2738          * available. */
2739         if (s->child.exited)
2740                 return -ESRCH;
2741 
2742         if (s->child.pidfd >= 0) {
2743                 siginfo_t copy;
2744 
2745                 /* pidfd_send_signal() changes the siginfo_t argument. This is weird, let's hence copy the
2746                  * structure here */
2747                 if (si)
2748                         copy = *si;
2749 
2750                 if (pidfd_send_signal(s->child.pidfd, sig, si ? &copy : NULL, 0) < 0) {
2751                         /* Let's propagate the error only if the system call is not implemented or prohibited */
2752                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
2753                                 return -errno;
2754                 } else
2755                         return 0;
2756         }
2757 
2758         /* Flags are only supported for pidfd_send_signal(), not for rt_sigqueueinfo(), hence let's refuse
2759          * this here. */
2760         if (flags != 0)
2761                 return -EOPNOTSUPP;
2762 
2763         if (si) {
2764                 /* We use rt_sigqueueinfo() only if siginfo_t is specified. */
2765                 siginfo_t copy = *si;
2766 
2767                 if (rt_sigqueueinfo(s->child.pid, sig, &copy) < 0)
2768                         return -errno;
2769         } else if (kill(s->child.pid, sig) < 0)
2770                 return -errno;
2771 
2772         return 0;
2773 }
2774 
2775 _public_ int sd_event_source_get_child_pidfd_own(sd_event_source *s) {
2776         assert_return(s, -EINVAL);
2777         assert_return(s->type == SOURCE_CHILD, -EDOM);
2778 
2779         if (s->child.pidfd < 0)
2780                 return -EOPNOTSUPP;
2781 
2782         return s->child.pidfd_owned;
2783 }
2784 
2785 _public_ int sd_event_source_set_child_pidfd_own(sd_event_source *s, int own) {
2786         assert_return(s, -EINVAL);
2787         assert_return(s->type == SOURCE_CHILD, -EDOM);
2788 
2789         if (s->child.pidfd < 0)
2790                 return -EOPNOTSUPP;
2791 
2792         s->child.pidfd_owned = own;
2793         return 0;
2794 }
2795 
2796 _public_ int sd_event_source_get_child_process_own(sd_event_source *s) {
2797         assert_return(s, -EINVAL);
2798         assert_return(s->type == SOURCE_CHILD, -EDOM);
2799 
2800         return s->child.process_owned;
2801 }
2802 
2803 _public_ int sd_event_source_set_child_process_own(sd_event_source *s, int own) {
2804         assert_return(s, -EINVAL);
2805         assert_return(s->type == SOURCE_CHILD, -EDOM);
2806 
2807         s->child.process_owned = own;
2808         return 0;
2809 }
2810 
2811 _public_ int sd_event_source_get_inotify_mask(sd_event_source *s, uint32_t *mask) {
2812         assert_return(s, -EINVAL);
2813         assert_return(mask, -EINVAL);
2814         assert_return(s->type == SOURCE_INOTIFY, -EDOM);
2815         assert_return(!event_pid_changed(s->event), -ECHILD);
2816 
2817         *mask = s->inotify.mask;
2818         return 0;
2819 }
2820 
2821 _public_ int sd_event_source_set_prepare(sd_event_source *s, sd_event_handler_t callback) {
2822         int r;
2823 
2824         assert_return(s, -EINVAL);
2825         assert_return(s->type != SOURCE_EXIT, -EDOM);
2826         assert_return(s->event->state != SD_EVENT_FINISHED, -ESTALE);
2827         assert_return(!event_pid_changed(s->event), -ECHILD);
2828 
2829         if (s->prepare == callback)
2830                 return 0;
2831 
2832         if (callback && s->prepare) {
2833                 s->prepare = callback;
2834                 return 0;
2835         }
2836 
2837         r = prioq_ensure_allocated(&s->event->prepare, prepare_prioq_compare);
2838         if (r < 0)
2839                 return r;
2840 
2841         s->prepare = callback;
2842 
2843         if (callback) {
2844                 r = prioq_put(s->event->prepare, s, &s->prepare_index);
2845                 if (r < 0)
2846                         return r;
2847         } else
2848                 prioq_remove(s->event->prepare, s, &s->prepare_index);
2849 
2850         return 0;
2851 }
2852 
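/* Illustrative usage sketch (not part of this file): a prepare callback runs just before the event loop
 * goes to sleep and may reconfigure its source, e.g. to keep a timer pointed at a deadline derived from
 * external state (struct my_state and prepare_timer() are hypothetical names):
 *
 *     static int prepare_timer(sd_event_source *s, void *userdata) {
 *             struct my_state *st = userdata;
 *             return sd_event_source_set_time(s, st->next_deadline);
 *     }
 *
 *     sd_event_source_set_prepare(timer_source, prepare_timer);
 */
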
2853 _public_ void* sd_event_source_get_userdata(sd_event_source *s) {
2854         assert_return(s, NULL);
2855 
2856         return s->userdata;
2857 }
2858 
2859 _public_ void *sd_event_source_set_userdata(sd_event_source *s, void *userdata) {
2860         void *ret;
2861 
2862         assert_return(s, NULL);
2863 
2864         ret = s->userdata;
2865         s->userdata = userdata;
2866 
2867         return ret;
2868 }
2869 
2870 static int event_source_enter_ratelimited(sd_event_source *s) {
2871         int r;
2872 
2873         assert(s);
2874 
2875         /* When an event source becomes ratelimited, we place it in the CLOCK_MONOTONIC priority queue, with
2876          * the end of the rate limit time window, much as if it was a timer event source. */
2877 
2878         if (s->ratelimited)
2879                 return 0; /* Already ratelimited, this is a NOP hence */
2880 
2881         /* Make sure we can install a CLOCK_MONOTONIC event further down. */
2882         r = setup_clock_data(s->event, &s->event->monotonic, CLOCK_MONOTONIC);
2883         if (r < 0)
2884                 return r;
2885 
2886         /* Timer event sources are already using the earliest/latest queues for the timer scheduling. Let's
2887          * first remove them from the prioq appropriate for their own clock, so that we can use the prioq
2888          * fields of the event source then for adding it to the CLOCK_MONOTONIC prioq instead. */
2889         if (EVENT_SOURCE_IS_TIME(s->type))
2890                 event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
2891 
2892         /* Now, let's add the event source to the monotonic clock instead */
2893         r = event_source_time_prioq_put(s, &s->event->monotonic);
2894         if (r < 0)
2895                 goto fail;
2896 
2897         /* And let's take the event source officially offline */
2898         r = event_source_offline(s, s->enabled, /* ratelimited= */ true);
2899         if (r < 0) {
2900                 event_source_time_prioq_remove(s, &s->event->monotonic);
2901                 goto fail;
2902         }
2903 
2904         event_source_pp_prioq_reshuffle(s);
2905 
2906         log_debug("Event source %p (%s) entered rate limit state.", s, strna(s->description));
2907         return 0;
2908 
2909 fail:
2910         /* Reinstall time event sources in the priority queue as before. This shouldn't fail, since the queue
2911          * space for it should already be allocated. */
2912         if (EVENT_SOURCE_IS_TIME(s->type))
2913                 assert_se(event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type)) >= 0);
2914 
2915         return r;
2916 }
2917 
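/* Illustrative usage sketch (not part of this file): the rate limit that drives the state machine above
 * is configured by the caller, e.g. to let an I/O source fire at most 10 times per second (on_io() is a
 * hypothetical handler name):
 *
 *     sd_event_source *src;
 *     sd_event_add_io(e, &src, fd, EPOLLIN, on_io, NULL);
 *     sd_event_source_set_ratelimit(src, USEC_PER_SEC, 10);
 *
 * Once the burst is exhausted within the interval, the source is taken offline here and queued on the
 * CLOCK_MONOTONIC prioq; event_source_leave_ratelimit() below brings it back when the window elapses. */
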
2918 static int event_source_leave_ratelimit(sd_event_source *s, bool run_callback) {
2919         int r;
2920 
2921         assert(s);
2922 
2923         if (!s->ratelimited)
2924                 return 0;
2925 
2926         /* Let's take the event source out of the monotonic prioq first. */
2927         event_source_time_prioq_remove(s, &s->event->monotonic);
2928 
2929         /* Let's then add the event source to its native clock prioq again — if this is a timer event source */
2930         if (EVENT_SOURCE_IS_TIME(s->type)) {
2931                 r = event_source_time_prioq_put(s, event_get_clock_data(s->event, s->type));
2932                 if (r < 0)
2933                         goto fail;
2934         }
2935 
2936         /* Let's try to take it online again.  */
2937         r = event_source_online(s, s->enabled, /* ratelimited= */ false);
2938         if (r < 0) {
2939                 /* Do something roughly sensible when this failed: undo the two prioq ops above */
2940                 if (EVENT_SOURCE_IS_TIME(s->type))
2941                         event_source_time_prioq_remove(s, event_get_clock_data(s->event, s->type));
2942 
2943                 goto fail;
2944         }
2945 
2946         event_source_pp_prioq_reshuffle(s);
2947         ratelimit_reset(&s->rate_limit);
2948 
2949         log_debug("Event source %p (%s) left rate limit state.", s, strna(s->description));
2950 
2951         if (run_callback && s->ratelimit_expire_callback) {
2952                 s->dispatching = true;
2953                 r = s->ratelimit_expire_callback(s, s->userdata);
2954                 s->dispatching = false;
2955 
2956                 if (r < 0) {
2957                         log_debug_errno(r, "Ratelimit expiry callback of event source %s (type %s) returned error, %s: %m",
2958                                         strna(s->description),
2959                                         event_source_type_to_string(s->type),
2960                                         s->exit_on_failure ? "exiting" : "disabling");
2961 
2962                         if (s->exit_on_failure)
2963                                 (void) sd_event_exit(s->event, r);
2964                 }
2965 
2966                 if (s->n_ref == 0)
2967                         source_free(s);
2968                 else if (r < 0)
2969                         assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
2970 
2971                 return 1;
2972         }
2973 
2974         return 0;
2975 
2976 fail:
2977         /* Do something somewhat reasonable when we cannot move an event source out of ratelimited mode:
2978          * simply put it back into that mode, maybe we can then process it more successfully next iteration. */
2979         assert_se(event_source_time_prioq_put(s, &s->event->monotonic) >= 0);
2980 
2981         return r;
2982 }
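
/* Illustration (not part of the original source): a minimal sketch of how a consumer would engage the
 * enter/leave ratelimit machinery above, assuming an already created IO event source "s"; the interval
 * and burst values are made-up example numbers. */
#if 0
        int r;

        /* Allow at most 10 dispatches of "s" per second. Once the limit is hit, source_dispatch() calls
         * event_source_enter_ratelimited(), which parks the source in the CLOCK_MONOTONIC prioq until the
         * window ends and event_source_leave_ratelimit() takes it back online. */
        r = sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 10);
        if (r < 0)
                return r;
#endif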
2983 
2984 static usec_t sleep_between(sd_event *e, usec_t a, usec_t b) {
2985         usec_t c;
2986         assert(e);
2987         assert(a <= b);
2988 
2989         if (a <= 0)
2990                 return 0;
2991         if (a >= USEC_INFINITY)
2992                 return USEC_INFINITY;
2993 
2994         if (b <= a + 1)
2995                 return a;
2996 
2997         initialize_perturb(e);
2998 
2999         /*
3000           Find a good time to wake up again between times a and b. We
3001           have two goals here:
3002 
3003           a) We want to wake up as seldom as possible, hence prefer
3004              later times over earlier times.
3005 
3006           b) But if we have to wake up, then let's make sure to
3007              dispatch as much as possible on the entire system.
3008 
3009           We implement this by waking up everywhere at the same time
3010           within any given minute if we can, synchronised via the
3011           perturbation value determined from the boot ID. If we can't,
3012           then we try to find the same spot in every 10s, then every
3013           1s and then every 250ms step. Otherwise, we pick the last possible time
3014           to wake up.
3015         */
3016 
3017         c = (b / USEC_PER_MINUTE) * USEC_PER_MINUTE + e->perturb;
3018         if (c >= b) {
3019                 if (_unlikely_(c < USEC_PER_MINUTE))
3020                         return b;
3021 
3022                 c -= USEC_PER_MINUTE;
3023         }
3024 
3025         if (c >= a)
3026                 return c;
3027 
3028         c = (b / (USEC_PER_SEC*10)) * (USEC_PER_SEC*10) + (e->perturb % (USEC_PER_SEC*10));
3029         if (c >= b) {
3030                 if (_unlikely_(c < USEC_PER_SEC*10))
3031                         return b;
3032 
3033                 c -= USEC_PER_SEC*10;
3034         }
3035 
3036         if (c >= a)
3037                 return c;
3038 
3039         c = (b / USEC_PER_SEC) * USEC_PER_SEC + (e->perturb % USEC_PER_SEC);
3040         if (c >= b) {
3041                 if (_unlikely_(c < USEC_PER_SEC))
3042                         return b;
3043 
3044                 c -= USEC_PER_SEC;
3045         }
3046 
3047         if (c >= a)
3048                 return c;
3049 
3050         c = (b / (USEC_PER_MSEC*250)) * (USEC_PER_MSEC*250) + (e->perturb % (USEC_PER_MSEC*250));
3051         if (c >= b) {
3052                 if (_unlikely_(c < USEC_PER_MSEC*250))
3053                         return b;
3054 
3055                 c -= USEC_PER_MSEC*250;
3056         }
3057 
3058         if (c >= a)
3059                 return c;
3060 
3061         return b;
3062 }
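
/* Illustration (not part of the original source): a worked example of the coalescing above. Suppose
 * e->perturb is 7.5s and we are asked to sleep between a = 62s and b = 90s past some minute boundary
 * (both as absolute CLOCK_MONOTONIC values). The minute-granularity candidate is 60s + 7.5s = 67.5s,
 * which falls inside [a, b), so we wake at 67.5s, together with every other event loop on this boot
 * that computed the same perturbation value from the boot ID. Only when none of the minute/10s/1s/250ms
 * candidates fits the window do we fall back to waking at b. */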
3063 
3064 static int event_arm_timer(
3065                 sd_event *e,
3066                 struct clock_data *d) {
3067 
3068         struct itimerspec its = {};
3069         sd_event_source *a, *b;
3070         usec_t t;
3071 
3072         assert(e);
3073         assert(d);
3074 
3075         if (!d->needs_rearm)
3076                 return 0;
3077 
3078         d->needs_rearm = false;
3079 
3080         a = prioq_peek(d->earliest);
3081         assert(!a || EVENT_SOURCE_USES_TIME_PRIOQ(a->type));
3082         if (!a || a->enabled == SD_EVENT_OFF || time_event_source_next(a) == USEC_INFINITY) {
3083 
3084                 if (d->fd < 0)
3085                         return 0;
3086 
3087                 if (d->next == USEC_INFINITY)
3088                         return 0;
3089 
3090                 /* disarm */
3091                 if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3092                         return -errno;
3093 
3094                 d->next = USEC_INFINITY;
3095                 return 0;
3096         }
3097 
3098         b = prioq_peek(d->latest);
3099         assert(!b || EVENT_SOURCE_USES_TIME_PRIOQ(b->type));
3100         assert(b && b->enabled != SD_EVENT_OFF);
3101 
3102         t = sleep_between(e, time_event_source_next(a), time_event_source_latest(b));
3103         if (d->next == t)
3104                 return 0;
3105 
3106         assert_se(d->fd >= 0);
3107 
3108         if (t == 0) {
3109                 /* We don't want to disarm here, just mean some time looooong ago. */
3110                 its.it_value.tv_sec = 0;
3111                 its.it_value.tv_nsec = 1;
3112         } else
3113                 timespec_store(&its.it_value, t);
3114 
3115         if (timerfd_settime(d->fd, TFD_TIMER_ABSTIME, &its, NULL) < 0)
3116                 return -errno;
3117 
3118         d->next = t;
3119         return 0;
3120 }
3121 
3122 static int process_io(sd_event *e, sd_event_source *s, uint32_t revents) {
3123         assert(e);
3124         assert(s);
3125         assert(s->type == SOURCE_IO);
3126 
3127         /* If the event source was already pending, we just OR in the
3128          * new revents, otherwise we reset the value. The ORing is
3129          * necessary to handle EPOLLONESHOT events properly where
3130          * readability might happen independently of writability, and
3131          * we need to keep track of both */
3132 
3133         if (s->pending)
3134                 s->io.revents |= revents;
3135         else
3136                 s->io.revents = revents;
3137 
3138         return source_set_pending(s, true);
3139 }
3140 
3141 static int flush_timer(sd_event *e, int fd, uint32_t events, usec_t *next) {
3142         uint64_t x;
3143         ssize_t ss;
3144 
3145         assert(e);
3146         assert(fd >= 0);
3147 
3148         assert_return(events == EPOLLIN, -EIO);
3149 
3150         ss = read(fd, &x, sizeof(x));
3151         if (ss < 0) {
3152                 if (ERRNO_IS_TRANSIENT(errno))
3153                         return 0;
3154 
3155                 return -errno;
3156         }
3157 
3158         if (_unlikely_(ss != sizeof(x)))
3159                 return -EIO;
3160 
3161         if (next)
3162                 *next = USEC_INFINITY;
3163 
3164         return 0;
3165 }
3166 
3167 static int process_timer(
3168                 sd_event *e,
3169                 usec_t n,
3170                 struct clock_data *d) {
3171 
3172         sd_event_source *s;
3173         bool callback_invoked = false;
3174         int r;
3175 
3176         assert(e);
3177         assert(d);
3178 
3179         for (;;) {
3180                 s = prioq_peek(d->earliest);
3181                 assert(!s || EVENT_SOURCE_USES_TIME_PRIOQ(s->type));
3182 
3183                 if (!s || time_event_source_next(s) > n)
3184                         break;
3185 
3186                 if (s->ratelimited) {
3187                         /* This is an event source whose ratelimit window has ended. Let's turn it on
3188                          * again. */
3189                         assert(s->ratelimited);
3190 
3191                         r = event_source_leave_ratelimit(s, /* run_callback */ true);
3192                         if (r < 0)
3193                                 return r;
3194                         else if (r == 1)
3195                                 callback_invoked = true;
3196 
3197                         continue;
3198                 }
3199 
3200                 if (s->enabled == SD_EVENT_OFF || s->pending)
3201                         break;
3202 
3203                 r = source_set_pending(s, true);
3204                 if (r < 0)
3205                         return r;
3206 
3207                 event_source_time_prioq_reshuffle(s);
3208         }
3209 
3210         return callback_invoked;
3211 }
3212 
3213 static int process_child(sd_event *e, int64_t threshold, int64_t *ret_min_priority) {
3214         int64_t min_priority = threshold;
3215         bool something_new = false;
3216         sd_event_source *s;
3217         int r;
3218 
3219         assert(e);
3220         assert(ret_min_priority);
3221 
3222         if (!e->need_process_child) {
3223                 *ret_min_priority = min_priority;
3224                 return 0;
3225         }
3226 
3227         e->need_process_child = false;
3228 
3229         /* So, this is ugly. We iteratively invoke waitid() with P_PID + WNOHANG for each PID we wait
3230          * for, instead of using P_ALL. This is because we only want to get child information of very
3231          * specific child processes, and not all of them. We might not have processed the SIGCHLD event
3232          * of a previous invocation and we don't want to maintain an unbounded *per-child* event queue,
3233          * hence we really don't want anything flushed out of the kernel's queue that we don't care
3234          * about. Since this is O(n) this means that if you have a lot of processes you probably want
3235          * to handle SIGCHLD yourself.
3236          *
3237          * We do not reap the children here (by using WNOWAIT), this is only done after the event
3238          * source is dispatched so that the callback still sees the process as a zombie. */
3239 
3240         HASHMAP_FOREACH(s, e->child_sources) {
3241                 assert(s->type == SOURCE_CHILD);
3242 
3243                 if (s->priority > threshold)
3244                         continue;
3245 
3246                 if (s->pending)
3247                         continue;
3248 
3249                 if (event_source_is_offline(s))
3250                         continue;
3251 
3252                 if (s->child.exited)
3253                         continue;
3254 
3255                 if (EVENT_SOURCE_WATCH_PIDFD(s))
3256                         /* There's a usable pidfd known for this event source? Then don't waitid() for
3257                          * it here */
3258                         continue;
3259 
3260                 zero(s->child.siginfo);
3261                 if (waitid(P_PID, s->child.pid, &s->child.siginfo,
3262                            WNOHANG | (s->child.options & WEXITED ? WNOWAIT : 0) | s->child.options) < 0)
3263                         return negative_errno();
3264 
3265                 if (s->child.siginfo.si_pid != 0) {
3266                         bool zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
3267 
3268                         if (zombie)
3269                                 s->child.exited = true;
3270 
3271                         if (!zombie && (s->child.options & WEXITED)) {
3272                                 /* If the child isn't dead then let's immediately remove the state
3273                                  * change from the queue, since there's no benefit in leaving it
3274                                  * queued. */
3275 
3276                                 assert(s->child.options & (WSTOPPED|WCONTINUED));
3277                                 (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|(s->child.options & (WSTOPPED|WCONTINUED)));
3278                         }
3279 
3280                         r = source_set_pending(s, true);
3281                         if (r < 0)
3282                                 return r;
3283                         if (r > 0) {
3284                                 something_new = true;
3285                                 min_priority = MIN(min_priority, s->priority);
3286                         }
3287                 }
3288         }
3289 
3290         *ret_min_priority = min_priority;
3291         return something_new;
3292 }
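
/* Illustration (not part of the original source): the waitid() pattern used above, in isolation. "pid"
 * is a hypothetical child PID. WNOWAIT peeks at the state change without reaping, so the process stays
 * a zombie until source_dispatch() later reaps it with a second waitid() without WNOWAIT. */
#if 0
        siginfo_t si = {};
        bool exited = false;

        if (waitid(P_PID, pid, &si, WEXITED|WNOHANG|WNOWAIT) < 0)
                return -errno;

        if (si.si_pid != 0)
                /* A state change is queued; if the child exited it remains a zombie until a later
                 * waitid() without WNOWAIT reaps it. */
                exited = IN_SET(si.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
#endif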
3293 
3294 static int process_pidfd(sd_event *e, sd_event_source *s, uint32_t revents) {
3295         assert(e);
3296         assert(s);
3297         assert(s->type == SOURCE_CHILD);
3298 
3299         if (s->pending)
3300                 return 0;
3301 
3302         if (event_source_is_offline(s))
3303                 return 0;
3304 
3305         if (!EVENT_SOURCE_WATCH_PIDFD(s))
3306                 return 0;
3307 
3308         zero(s->child.siginfo);
3309         if (waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG | WNOWAIT | s->child.options) < 0)
3310                 return -errno;
3311 
3312         if (s->child.siginfo.si_pid == 0)
3313                 return 0;
3314 
3315         if (IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED))
3316                 s->child.exited = true;
3317 
3318         return source_set_pending(s, true);
3319 }
3320 
3321 static int process_signal(sd_event *e, struct signal_data *d, uint32_t events, int64_t *min_priority) {
3322         int r;
3323 
3324         assert(e);
3325         assert(d);
3326         assert_return(events == EPOLLIN, -EIO);
3327         assert(min_priority);
3328 
3329         /* If there's a signal queued on this priority and SIGCHLD is on this priority too, then make
3330          * sure to recheck the children we watch. This is because we only ever dequeue the first signal
3331          * per priority, and if we dequeue one while SIGCHLD is enqueued behind it we wouldn't notice,
3332          * but we might have higher priority children we care about, hence we need to check for them
3333          * explicitly. */
3334 
3335         if (sigismember(&d->sigset, SIGCHLD))
3336                 e->need_process_child = true;
3337 
3338         /* If there's already an event source pending for this priority we don't read another */
3339         if (d->current)
3340                 return 0;
3341 
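        /* Note: we stop reading from the signalfd as soon as one signal has been turned into a pending
         * event source; anything further stays queued in the kernel until d->current has been dispatched
         * and cleared again. */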
3342         for (;;) {
3343                 struct signalfd_siginfo si;
3344                 ssize_t n;
3345                 sd_event_source *s = NULL;
3346 
3347                 n = read(d->fd, &si, sizeof(si));
3348                 if (n < 0) {
3349                         if (ERRNO_IS_TRANSIENT(errno))
3350                                 return 0;
3351 
3352                         return -errno;
3353                 }
3354 
3355                 if (_unlikely_(n != sizeof(si)))
3356                         return -EIO;
3357 
3358                 assert(SIGNAL_VALID(si.ssi_signo));
3359 
3360                 if (e->signal_sources)
3361                         s = e->signal_sources[si.ssi_signo];
3362                 if (!s)
3363                         continue;
3364                 if (s->pending)
3365                         continue;
3366 
3367                 s->signal.siginfo = si;
3368                 d->current = s;
3369 
3370                 r = source_set_pending(s, true);
3371                 if (r < 0)
3372                         return r;
3373                 if (r > 0 && *min_priority >= s->priority) {
3374                         *min_priority = s->priority;
3375                         return 1; /* an event source with smaller priority is queued. */
3376                 }
3377 
3378                 return 0;
3379         }
3380 }
3381 
3382 static int event_inotify_data_read(sd_event *e, struct inotify_data *d, uint32_t revents, int64_t threshold) {
3383         ssize_t n;
3384 
3385         assert(e);
3386         assert(d);
3387 
3388         assert_return(revents == EPOLLIN, -EIO);
3389 
3390         /* If there's already an event source pending for this priority, don't read another */
3391         if (d->n_pending > 0)
3392                 return 0;
3393 
3394         /* Is the read buffer non-empty? If so, let's not read more */
3395         if (d->buffer_filled > 0)
3396                 return 0;
3397 
3398         if (d->priority > threshold)
3399                 return 0;
3400 
3401         n = read(d->fd, &d->buffer, sizeof(d->buffer));
3402         if (n < 0) {
3403                 if (ERRNO_IS_TRANSIENT(errno))
3404                         return 0;
3405 
3406                 return -errno;
3407         }
3408 
3409         assert(n > 0);
3410         d->buffer_filled = (size_t) n;
3411         LIST_PREPEND(buffered, e->inotify_data_buffered, d);
3412 
3413         return 1;
3414 }
3415 
3416 static void event_inotify_data_drop(sd_event *e, struct inotify_data *d, size_t sz) {
3417         assert(e);
3418         assert(d);
3419         assert(sz <= d->buffer_filled);
3420 
3421         if (sz == 0)
3422                 return;
3423 
3424         /* Move the rest of the buffer to the front, in order to get things properly aligned again */
3425         memmove(d->buffer.raw, d->buffer.raw + sz, d->buffer_filled - sz);
3426         d->buffer_filled -= sz;
3427 
3428         if (d->buffer_filled == 0)
3429                 LIST_REMOVE(buffered, e->inotify_data_buffered, d);
3430 }
3431 
3432 static int event_inotify_data_process(sd_event *e, struct inotify_data *d) {
3433         int r;
3434 
3435         assert(e);
3436         assert(d);
3437 
3438         /* If there's already an event source pending for this priority, don't read another */
3439         if (d->n_pending > 0)
3440                 return 0;
3441 
3442         while (d->buffer_filled > 0) {
3443                 size_t sz;
3444 
3445                 /* Let's validate that the event structures are complete */
3446                 if (d->buffer_filled < offsetof(struct inotify_event, name))
3447                         return -EIO;
3448 
3449                 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3450                 if (d->buffer_filled < sz)
3451                         return -EIO;
3452 
3453                 if (d->buffer.ev.mask & IN_Q_OVERFLOW) {
3454                         struct inode_data *inode_data;
3455 
3456                         /* The queue overran, let's pass this event to all event sources connected to this inotify
3457                          * object */
3458 
3459                         HASHMAP_FOREACH(inode_data, d->inodes)
3460                                 LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3461 
3462                                         if (event_source_is_offline(s))
3463                                                 continue;
3464 
3465                                         r = source_set_pending(s, true);
3466                                         if (r < 0)
3467                                                 return r;
3468                                 }
3469                 } else {
3470                         struct inode_data *inode_data;
3471 
3472                         /* Find the inode object for this watch descriptor. If IN_IGNORED is set we also remove it from
3473                          * our watch descriptor table. */
3474                         if (d->buffer.ev.mask & IN_IGNORED) {
3475 
3476                                 inode_data = hashmap_remove(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3477                                 if (!inode_data) {
3478                                         event_inotify_data_drop(e, d, sz);
3479                                         continue;
3480                                 }
3481 
3482                                 /* The watch descriptor was removed by the kernel, let's drop it here too */
3483                                 inode_data->wd = -1;
3484                         } else {
3485                                 inode_data = hashmap_get(d->wd, INT_TO_PTR(d->buffer.ev.wd));
3486                                 if (!inode_data) {
3487                                         event_inotify_data_drop(e, d, sz);
3488                                         continue;
3489                                 }
3490                         }
3491 
3492                         /* Trigger all event sources that are interested in these events. Also trigger all event
3493                          * sources if IN_IGNORED or IN_UNMOUNT is set. */
3494                         LIST_FOREACH(inotify.by_inode_data, s, inode_data->event_sources) {
3495 
3496                                 if (event_source_is_offline(s))
3497                                         continue;
3498 
3499                                 if ((d->buffer.ev.mask & (IN_IGNORED|IN_UNMOUNT)) == 0 &&
3500                                     (s->inotify.mask & d->buffer.ev.mask & IN_ALL_EVENTS) == 0)
3501                                         continue;
3502 
3503                                 r = source_set_pending(s, true);
3504                                 if (r < 0)
3505                                         return r;
3506                         }
3507                 }
3508 
3509                 /* Something pending now? If so, let's finish, otherwise let's read more. */
3510                 if (d->n_pending > 0)
3511                         return 1;
3512         }
3513 
3514         return 0;
3515 }
3516 
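/* Turn buffered inotify data into pending event sources, for every inotify object that still has data
 * queued. Returns > 0 if at least one event source was marked pending. */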
3517 static int process_inotify(sd_event *e) {
3518         int r, done = 0;
3519 
3520         assert(e);
3521 
3522         LIST_FOREACH(buffered, d, e->inotify_data_buffered) {
3523                 r = event_inotify_data_process(e, d);
3524                 if (r < 0)
3525                         return r;
3526                 if (r > 0)
3527                         done++;
3528         }
3529 
3530         return done;
3531 }
3532 
3533 static int source_dispatch(sd_event_source *s) {
3534         _cleanup_(sd_event_unrefp) sd_event *saved_event = NULL;
3535         EventSourceType saved_type;
3536         int r = 0;
3537 
3538         assert(s);
3539         assert(s->pending || s->type == SOURCE_EXIT);
3540 
3541         /* Save the event source type, here, so that we still know it after the event callback which might
3542          * invalidate the event. */
3543         saved_type = s->type;
3544 
3545         /* Similarly, store a reference to the event loop object, so that we can still access it after the
3546          * callback might have invalidated/disconnected the event source. */
3547         saved_event = sd_event_ref(s->event);
3548 
3549         /* Check if we hit the ratelimit for this event source, and if so, let's disable it. */
3550         assert(!s->ratelimited);
3551         if (!ratelimit_below(&s->rate_limit)) {
3552                 r = event_source_enter_ratelimited(s);
3553                 if (r < 0)
3554                         return r;
3555 
3556                 return 1;
3557         }
3558 
3559         if (!IN_SET(s->type, SOURCE_DEFER, SOURCE_EXIT)) {
3560                 r = source_set_pending(s, false);
3561                 if (r < 0)
3562                         return r;
3563         }
3564 
3565         if (s->type != SOURCE_POST) {
3566                 sd_event_source *z;
3567 
3568                 /* If we execute a non-post source, let's mark all post sources as pending. */
3569 
3570                 SET_FOREACH(z, s->event->post_sources) {
3571                         if (event_source_is_offline(z))
3572                                 continue;
3573 
3574                         r = source_set_pending(z, true);
3575                         if (r < 0)
3576                                 return r;
3577                 }
3578         }
3579 
3580         if (s->enabled == SD_EVENT_ONESHOT) {
3581                 r = sd_event_source_set_enabled(s, SD_EVENT_OFF);
3582                 if (r < 0)
3583                         return r;
3584         }
3585 
3586         s->dispatching = true;
3587 
3588         switch (s->type) {
3589 
3590         case SOURCE_IO:
3591                 r = s->io.callback(s, s->io.fd, s->io.revents, s->userdata);
3592                 break;
3593 
3594         case SOURCE_TIME_REALTIME:
3595         case SOURCE_TIME_BOOTTIME:
3596         case SOURCE_TIME_MONOTONIC:
3597         case SOURCE_TIME_REALTIME_ALARM:
3598         case SOURCE_TIME_BOOTTIME_ALARM:
3599                 r = s->time.callback(s, s->time.next, s->userdata);
3600                 break;
3601 
3602         case SOURCE_SIGNAL:
3603                 r = s->signal.callback(s, &s->signal.siginfo, s->userdata);
3604                 break;
3605 
3606         case SOURCE_CHILD: {
3607                 bool zombie;
3608 
3609                 zombie = IN_SET(s->child.siginfo.si_code, CLD_EXITED, CLD_KILLED, CLD_DUMPED);
3610 
3611                 r = s->child.callback(s, &s->child.siginfo, s->userdata);
3612 
3613                 /* Now, reap the PID for good. */
3614                 if (zombie) {
3615                         (void) waitid(P_PID, s->child.pid, &s->child.siginfo, WNOHANG|WEXITED);
3616                         s->child.waited = true;
3617                 }
3618 
3619                 break;
3620         }
3621 
3622         case SOURCE_DEFER:
3623                 r = s->defer.callback(s, s->userdata);
3624                 break;
3625 
3626         case SOURCE_POST:
3627                 r = s->post.callback(s, s->userdata);
3628                 break;
3629 
3630         case SOURCE_EXIT:
3631                 r = s->exit.callback(s, s->userdata);
3632                 break;
3633 
3634         case SOURCE_INOTIFY: {
3635                 struct sd_event *e = s->event;
3636                 struct inotify_data *d;
3637                 size_t sz;
3638 
3639                 assert(s->inotify.inode_data);
3640                 assert_se(d = s->inotify.inode_data->inotify_data);
3641 
3642                 assert(d->buffer_filled >= offsetof(struct inotify_event, name));
3643                 sz = offsetof(struct inotify_event, name) + d->buffer.ev.len;
3644                 assert(d->buffer_filled >= sz);
3645 
3646                 /* If the inotify callback destroys the event source then this likely means we don't need to
3647                  * watch the inode anymore, and thus also won't need the inotify object anymore. But if we'd
3648                  * free it immediately, then we couldn't drop the event from the inotify event queue without
3649                  * memory corruption anymore, as below. Hence, let's not free it immediately, but mark it
3650                  * "busy" with a counter (which will ensure it's not GC'ed away prematurely). Let's then
3651                  * explicitly GC it after we are done dropping the inotify event from the buffer. */
3652                 d->n_busy++;
3653                 r = s->inotify.callback(s, &d->buffer.ev, s->userdata);
3654                 d->n_busy--;
3655 
3656                 /* When no event is pending anymore on this inotify object, then let's drop the event from
3657                  * the inotify event queue buffer. */
3658                 if (d->n_pending == 0)
3659                         event_inotify_data_drop(e, d, sz);
3660 
3661                 /* Now we don't want to access 'd' anymore, it's OK to GC now. */
3662                 event_gc_inotify_data(e, d);
3663                 break;
3664         }
3665 
3666         case SOURCE_WATCHDOG:
3667         case _SOURCE_EVENT_SOURCE_TYPE_MAX:
3668         case _SOURCE_EVENT_SOURCE_TYPE_INVALID:
3669                 assert_not_reached();
3670         }
3671 
3672         s->dispatching = false;
3673 
3674         if (r < 0) {
3675                 log_debug_errno(r, "Event source %s (type %s) returned error, %s: %m",
3676                                 strna(s->description),
3677                                 event_source_type_to_string(saved_type),
3678                                 s->exit_on_failure ? "exiting" : "disabling");
3679 
3680                 if (s->exit_on_failure)
3681                         (void) sd_event_exit(saved_event, r);
3682         }
3683 
3684         if (s->n_ref == 0)
3685                 source_free(s);
3686         else if (r < 0)
3687                 assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
3688 
3689         return 1;
3690 }
3691 
3692 static int event_prepare(sd_event *e) {
3693         int r;
3694 
3695         assert(e);
3696 
3697         for (;;) {
3698                 sd_event_source *s;
3699 
3700                 s = prioq_peek(e->prepare);
3701                 if (!s || s->prepare_iteration == e->iteration || event_source_is_offline(s))
3702                         break;
3703 
3704                 s->prepare_iteration = e->iteration;
3705                 r = prioq_reshuffle(e->prepare, s, &s->prepare_index);
3706                 if (r < 0)
3707                         return r;
3708 
3709                 assert(s->prepare);
3710 
3711                 s->dispatching = true;
3712                 r = s->prepare(s, s->userdata);
3713                 s->dispatching = false;
3714 
3715                 if (r < 0) {
3716                         log_debug_errno(r, "Prepare callback of event source %s (type %s) returned error, %s: %m",
3717                                         strna(s->description),
3718                                         event_source_type_to_string(s->type),
3719                                         s->exit_on_failure ? "exiting" : "disabling");
3720 
3721                         if (s->exit_on_failure)
3722                                 (void) sd_event_exit(e, r);
3723                 }
3724 
3725                 if (s->n_ref == 0)
3726                         source_free(s);
3727                 else if (r < 0)
3728                         assert_se(sd_event_source_set_enabled(s, SD_EVENT_OFF) >= 0);
3729         }
3730 
3731         return 0;
3732 }
3733 
3734 static int dispatch_exit(sd_event *e) {
3735         sd_event_source *p;
3736         int r;
3737 
3738         assert(e);
3739 
3740         p = prioq_peek(e->exit);
3741         assert(!p || p->type == SOURCE_EXIT);
3742 
3743         if (!p || event_source_is_offline(p)) {
3744                 e->state = SD_EVENT_FINISHED;
3745                 return 0;
3746         }
3747 
3748         _unused_ _cleanup_(sd_event_unrefp) sd_event *ref = sd_event_ref(e);
3749         e->iteration++;
3750         e->state = SD_EVENT_EXITING;
3751         r = source_dispatch(p);
3752         e->state = SD_EVENT_INITIAL;
3753         return r;
3754 }
3755 
3756 static sd_event_source* event_next_pending(sd_event *e) {
3757         sd_event_source *p;
3758 
3759         assert(e);
3760 
3761         p = prioq_peek(e->pending);
3762         if (!p)
3763                 return NULL;
3764 
3765         if (event_source_is_offline(p))
3766                 return NULL;
3767 
3768         return p;
3769 }
3770 
3771 static int arm_watchdog(sd_event *e) {
3772         struct itimerspec its = {};
3773         usec_t t;
3774 
3775         assert(e);
3776         assert(e->watchdog_fd >= 0);
3777 
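        /* Schedule the next ping for somewhere between 1/2 and 3/4 of the watchdog period after the last
         * ping, reusing the sleep_between() coalescing logic so that the wakeup can be batched with other
         * timer wakeups where possible. */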
3778         t = sleep_between(e,
3779                           usec_add(e->watchdog_last, (e->watchdog_period / 2)),
3780                           usec_add(e->watchdog_last, (e->watchdog_period * 3 / 4)));
3781 
3782         timespec_store(&its.it_value, t);
3783 
3784         /* Make sure we never set the watchdog to 0, which tells the
3785          * kernel to disable it. */
3786         if (its.it_value.tv_sec == 0 && its.it_value.tv_nsec == 0)
3787                 its.it_value.tv_nsec = 1;
3788 
3789         return RET_NERRNO(timerfd_settime(e->watchdog_fd, TFD_TIMER_ABSTIME, &its, NULL));
3790 }
3791 
3792 static int process_watchdog(sd_event *e) {
3793         assert(e);
3794 
3795         if (!e->watchdog)
3796                 return 0;
3797 
3798         /* Don't notify watchdog too often */
3799         if (e->watchdog_last + e->watchdog_period / 4 > e->timestamp.monotonic)
3800                 return 0;
3801 
3802         sd_notify(false, "WATCHDOG=1");
3803         e->watchdog_last = e->timestamp.monotonic;
3804 
3805         return arm_watchdog(e);
3806 }
3807 
3808 static void event_close_inode_data_fds(sd_event *e) {
3809         struct inode_data *d;
3810 
3811         assert(e);
3812 
3813         /* Close the fds pointing to the inodes to watch now. We need to close them as they might otherwise pin
3814          * filesystems. But we can't close them right away as we need them as long as the user still wants to make
3815          * adjustments to the event source, such as changing the priority (which requires us to remove and re-add a watch
3816          * for the inode). Hence, let's close them when entering the first iteration after they were added, as a
3817          * compromise. */
3818 
3819         while ((d = e->inode_data_to_close)) {
3820                 assert(d->fd >= 0);
3821                 d->fd = safe_close(d->fd);
3822 
3823                 LIST_REMOVE(to_close, e->inode_data_to_close, d);
3824         }
3825 }
3826 
3827 _public_ int sd_event_prepare(sd_event *e) {
3828         int r;
3829 
3830         assert_return(e, -EINVAL);
3831         assert_return(e = event_resolve(e), -ENOPKG);
3832         assert_return(!event_pid_changed(e), -ECHILD);
3833         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
3834         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
3835 
3836         /* Let's check that if we are a default event loop we are executed in the correct thread. We only do
3837          * this check here once, since gettid() is typically not cached, and we thus want to minimize
3838          * syscalls */
3839         assert_return(!e->default_event_ptr || e->tid == gettid(), -EREMOTEIO);
3840 
3841         /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
3842         _unused_ _cleanup_(sd_event_unrefp) sd_event *ref = sd_event_ref(e);
3843 
3844         if (e->exit_requested)
3845                 goto pending;
3846 
3847         e->iteration++;
3848 
3849         e->state = SD_EVENT_PREPARING;
3850         r = event_prepare(e);
3851         e->state = SD_EVENT_INITIAL;
3852         if (r < 0)
3853                 return r;
3854 
3855         r = event_arm_timer(e, &e->realtime);
3856         if (r < 0)
3857                 return r;
3858 
3859         r = event_arm_timer(e, &e->boottime);
3860         if (r < 0)
3861                 return r;
3862 
3863         r = event_arm_timer(e, &e->monotonic);
3864         if (r < 0)
3865                 return r;
3866 
3867         r = event_arm_timer(e, &e->realtime_alarm);
3868         if (r < 0)
3869                 return r;
3870 
3871         r = event_arm_timer(e, &e->boottime_alarm);
3872         if (r < 0)
3873                 return r;
3874 
3875         event_close_inode_data_fds(e);
3876 
3877         if (event_next_pending(e) || e->need_process_child)
3878                 goto pending;
3879 
3880         e->state = SD_EVENT_ARMED;
3881 
3882         return 0;
3883 
3884 pending:
3885         e->state = SD_EVENT_ARMED;
3886         r = sd_event_wait(e, 0);
3887         if (r == 0)
3888                 e->state = SD_EVENT_ARMED;
3889 
3890         return r;
3891 }
3892 
3893 static int epoll_wait_usec(
3894                 int fd,
3895                 struct epoll_event *events,
3896                 int maxevents,
3897                 usec_t timeout) {
3898 
3899         int msec;
3900 #if 0
3901         static bool epoll_pwait2_absent = false;
3902         int r;
3903 
3904         /* A wrapper that uses epoll_pwait2() if available, and falls back to epoll_wait() if not.
3905          *
3906          * FIXME: this is temporarily disabled until epoll_pwait2() becomes more widely available.
3907          * See https://github.com/systemd/systemd/pull/18973 and
3908          * https://github.com/systemd/systemd/issues/19052. */
3909 
3910         if (!epoll_pwait2_absent && timeout != USEC_INFINITY) {
3911                 r = epoll_pwait2(fd,
3912                                  events,
3913                                  maxevents,
3914                                  TIMESPEC_STORE(timeout),
3915                                  NULL);
3916                 if (r >= 0)
3917                         return r;
3918                 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno))
3919                         return -errno; /* Only fall back to old epoll_wait() if the syscall is masked or not
3920                                         * supported. */
3921 
3922                 epoll_pwait2_absent = true;
3923         }
3924 #endif
3925 
3926         if (timeout == USEC_INFINITY)
3927                 msec = -1;
3928         else {
3929                 usec_t k;
3930 
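                /* epoll_wait() takes its timeout in milliseconds, hence round up so that we never wake up
                 * too early, and saturate at INT_MAX. */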
3931                 k = DIV_ROUND_UP(timeout, USEC_PER_MSEC);
3932                 if (k >= INT_MAX)
3933                         msec = INT_MAX; /* Saturate */
3934                 else
3935                         msec = (int) k;
3936         }
3937 
3938         return RET_NERRNO(epoll_wait(fd, events, maxevents, msec));
3939 }
3940 
3941 static int process_epoll(sd_event *e, usec_t timeout, int64_t threshold, int64_t *ret_min_priority) {
3942         size_t n_event_queue, m, n_event_max;
3943         int64_t min_priority = threshold;
3944         bool something_new = false;
3945         int r;
3946 
3947         assert(e);
3948         assert(ret_min_priority);
3949 
3950         n_event_queue = MAX(e->n_sources, 1u);
3951         if (!GREEDY_REALLOC(e->event_queue, n_event_queue))
3952                 return -ENOMEM;
3953 
3954         n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
3955 
3956         /* If we still have inotify data buffered, then query the other fds, but don't wait on them */
3957         if (e->inotify_data_buffered)
3958                 timeout = 0;
3959 
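        /* Query the kernel in a loop: if the returned batch completely filled our queue there may be more
         * events pending, hence grow the queue and poll again with a zero timeout, up to ten times the
         * initial size. */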
3960         for (;;) {
3961                 r = epoll_wait_usec(
3962                                 e->epoll_fd,
3963                                 e->event_queue,
3964                                 n_event_max,
3965                                 timeout);
3966                 if (r < 0)
3967                         return r;
3968 
3969                 m = (size_t) r;
3970 
3971                 if (m < n_event_max)
3972                         break;
3973 
3974                 if (n_event_max >= n_event_queue * 10)
3975                         break;
3976 
3977                 if (!GREEDY_REALLOC(e->event_queue, n_event_max + n_event_queue))
3978                         return -ENOMEM;
3979 
3980                 n_event_max = MALLOC_ELEMENTSOF(e->event_queue);
3981                 timeout = 0;
3982         }
3983 
3984         /* Set the timestamp only when this is called for the first time. */
3985         if (threshold == INT64_MAX)
3986                 triple_timestamp_get(&e->timestamp);
3987 
3988         for (size_t i = 0; i < m; i++) {
3989 
3990                 if (e->event_queue[i].data.ptr == INT_TO_PTR(SOURCE_WATCHDOG))
3991                         r = flush_timer(e, e->watchdog_fd, e->event_queue[i].events, NULL);
3992                 else {
3993                         WakeupType *t = e->event_queue[i].data.ptr;
3994 
3995                         switch (*t) {
3996 
3997                         case WAKEUP_EVENT_SOURCE: {
3998                                 sd_event_source *s = e->event_queue[i].data.ptr;
3999 
4000                                 assert(s);
4001 
4002                                 if (s->priority > threshold)
4003                                         continue;
4004 
4005                                 min_priority = MIN(min_priority, s->priority);
4006 
4007                                 switch (s->type) {
4008 
4009                                 case SOURCE_IO:
4010                                         r = process_io(e, s, e->event_queue[i].events);
4011                                         break;
4012 
4013                                 case SOURCE_CHILD:
4014                                         r = process_pidfd(e, s, e->event_queue[i].events);
4015                                         break;
4016 
4017                                 default:
4018                                         assert_not_reached();
4019                                 }
4020 
4021                                 break;
4022                         }
4023 
4024                         case WAKEUP_CLOCK_DATA: {
4025                                 struct clock_data *d = e->event_queue[i].data.ptr;
4026 
4027                                 assert(d);
4028 
4029                                 r = flush_timer(e, d->fd, e->event_queue[i].events, &d->next);
4030                                 break;
4031                         }
4032 
4033                         case WAKEUP_SIGNAL_DATA:
4034                                 r = process_signal(e, e->event_queue[i].data.ptr, e->event_queue[i].events, &min_priority);
4035                                 break;
4036 
4037                         case WAKEUP_INOTIFY_DATA:
4038                                 r = event_inotify_data_read(e, e->event_queue[i].data.ptr, e->event_queue[i].events, threshold);
4039                                 break;
4040 
4041                         default:
4042                                 assert_not_reached();
4043                         }
4044                 }
4045                 if (r < 0)
4046                         return r;
4047                 if (r > 0)
4048                         something_new = true;
4049         }
4050 
4051         *ret_min_priority = min_priority;
4052         return something_new;
4053 }
4054 
4055 _public_ int sd_event_wait(sd_event *e, uint64_t timeout) {
4056         int r;
4057 
4058         assert_return(e, -EINVAL);
4059         assert_return(e = event_resolve(e), -ENOPKG);
4060         assert_return(!event_pid_changed(e), -ECHILD);
4061         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4062         assert_return(e->state == SD_EVENT_ARMED, -EBUSY);
4063 
4064         if (e->exit_requested) {
4065                 e->state = SD_EVENT_PENDING;
4066                 return 1;
4067         }
4068 
4069         for (int64_t threshold = INT64_MAX; ; threshold--) {
4070                 int64_t epoll_min_priority, child_min_priority;
4071 
4072                 /* There is a possibility that new epoll (especially IO) and child events are
4073                  * triggered just after the process_epoll() call but before process_child(), and the new IO
4074                  * events may have higher priority than the child events. To salvage these events,
4075                  * let's call epoll_wait() again, but accept only events with higher priority than the
4076                  * previous ones. See issue https://github.com/systemd/systemd/issues/18190 and comments
4077                  * https://github.com/systemd/systemd/pull/18750#issuecomment-785801085
4078                  * https://github.com/systemd/systemd/pull/18922#issuecomment-792825226 */
4079 
4080                 r = process_epoll(e, timeout, threshold, &epoll_min_priority);
4081                 if (r == -EINTR) {
4082                         e->state = SD_EVENT_PENDING;
4083                         return 1;
4084                 }
4085                 if (r < 0)
4086                         goto finish;
4087                 if (r == 0 && threshold < INT64_MAX)
4088                         /* No new epoll event. */
4089                         break;
4090 
4091                 r = process_child(e, threshold, &child_min_priority);
4092                 if (r < 0)
4093                         goto finish;
4094                 if (r == 0)
4095                         /* No new child event. */
4096                         break;
4097 
4098                 threshold = MIN(epoll_min_priority, child_min_priority);
4099                 if (threshold == INT64_MIN)
4100                         break;
4101 
4102                 timeout = 0;
4103         }
4104 
4105         r = process_watchdog(e);
4106         if (r < 0)
4107                 goto finish;
4108 
4109         r = process_inotify(e);
4110         if (r < 0)
4111                 goto finish;
4112 
4113         r = process_timer(e, e->timestamp.realtime, &e->realtime);
4114         if (r < 0)
4115                 goto finish;
4116 
4117         r = process_timer(e, e->timestamp.boottime, &e->boottime);
4118         if (r < 0)
4119                 goto finish;
4120 
4121         r = process_timer(e, e->timestamp.realtime, &e->realtime_alarm);
4122         if (r < 0)
4123                 goto finish;
4124 
4125         r = process_timer(e, e->timestamp.boottime, &e->boottime_alarm);
4126         if (r < 0)
4127                 goto finish;
4128 
4129         r = process_timer(e, e->timestamp.monotonic, &e->monotonic);
4130         if (r < 0)
4131                 goto finish;
4132         else if (r == 1) {
4133                 /* A ratelimit expiry callback was called. Let's postpone processing pending sources and
4134                  * put the loop in the initial state in order to evaluate (in the next iteration) also sources
4135                  * that were potentially re-enabled by the callback.
4136                  *
4137                  * Wondering why we treat only this invocation of process_timer() differently? Once an event
4138                  * source is ratelimited we essentially transform it into a CLOCK_MONOTONIC timer, hence the
4139                  * ratelimit expiry callback is never called for any other timer type. */
4140                 r = 0;
4141                 goto finish;
4142         }
4143 
4144         if (event_next_pending(e)) {
4145                 e->state = SD_EVENT_PENDING;
4146                 return 1;
4147         }
4148 
4149         r = 0;
4150 
4151 finish:
4152         e->state = SD_EVENT_INITIAL;
4153 
4154         return r;
4155 }
4156 
4157 _public_ int sd_event_dispatch(sd_event *e) {
4158         sd_event_source *p;
4159         int r;
4160 
4161         assert_return(e, -EINVAL);
4162         assert_return(e = event_resolve(e), -ENOPKG);
4163         assert_return(!event_pid_changed(e), -ECHILD);
4164         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4165         assert_return(e->state == SD_EVENT_PENDING, -EBUSY);
4166 
4167         if (e->exit_requested)
4168                 return dispatch_exit(e);
4169 
4170         p = event_next_pending(e);
4171         if (p) {
4172                 _unused_ _cleanup_(sd_event_unrefp) sd_event *ref = sd_event_ref(e);
4173 
4174                 e->state = SD_EVENT_RUNNING;
4175                 r = source_dispatch(p);
4176                 e->state = SD_EVENT_INITIAL;
4177                 return r;
4178         }
4179 
4180         e->state = SD_EVENT_INITIAL;
4181 
4182         return 1;
4183 }
4184 
4185 static void event_log_delays(sd_event *e) {
4186         char b[ELEMENTSOF(e->delays) * DECIMAL_STR_MAX(unsigned) + 1], *p;
4187         size_t l, i;
4188 
4189         p = b;
4190         l = sizeof(b);
4191         for (i = 0; i < ELEMENTSOF(e->delays); i++) {
4192                 l = strpcpyf(&p, l, "%u ", e->delays[i]);
4193                 e->delays[i] = 0;
4194         }
4195         log_debug("Event loop iterations: %s", b);
4196 }
4197 
4198 _public_ int sd_event_run(sd_event *e, uint64_t timeout) {
4199         int r;
4200 
4201         assert_return(e, -EINVAL);
4202         assert_return(e = event_resolve(e), -ENOPKG);
4203         assert_return(!event_pid_changed(e), -ECHILD);
4204         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4205         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4206 
4207         if (e->profile_delays && e->last_run_usec != 0) {
4208                 usec_t this_run;
4209                 unsigned l;
4210 
4211                 this_run = now(CLOCK_MONOTONIC);
4212 
4213                 l = log2u64(this_run - e->last_run_usec);
4214                 assert(l < ELEMENTSOF(e->delays));
4215                 e->delays[l]++;
4216 
4217                 if (this_run - e->last_log_usec >= 5*USEC_PER_SEC) {
4218                         event_log_delays(e);
4219                         e->last_log_usec = this_run;
4220                 }
4221         }
4222 
4223         /* Make sure that none of the preparation callbacks ends up freeing the event source under our feet */
4224         _unused_ _cleanup_(sd_event_unrefp) sd_event *ref = sd_event_ref(e);
4225 
4226         r = sd_event_prepare(e);
4227         if (r == 0)
4228                 /* There was nothing? Then wait... */
4229                 r = sd_event_wait(e, timeout);
4230 
4231         if (e->profile_delays)
4232                 e->last_run_usec = now(CLOCK_MONOTONIC);
4233 
4234         if (r > 0) {
4235                 /* There's something now, then let's dispatch it */
4236                 r = sd_event_dispatch(e);
4237                 if (r < 0)
4238                         return r;
4239 
4240                 return 1;
4241         }
4242 
4243         return r;
4244 }
4245 
4246 _public_ int sd_event_loop(sd_event *e) {
4247         int r;
4248 
4249         assert_return(e, -EINVAL);
4250         assert_return(e = event_resolve(e), -ENOPKG);
4251         assert_return(!event_pid_changed(e), -ECHILD);
4252         assert_return(e->state == SD_EVENT_INITIAL, -EBUSY);
4253 
4254         _unused_ _cleanup_(sd_event_unrefp) sd_event *ref = sd_event_ref(e);
4255 
4256         while (e->state != SD_EVENT_FINISHED) {
4257                 r = sd_event_run(e, UINT64_MAX);
4258                 if (r < 0)
4259                         return r;
4260         }
4261 
4262         return e->exit_code;
4263 }
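
/* Illustration (not part of the original source): a minimal sketch of how the public entry points above
 * fit together in a consumer. on_sigterm() and example_main_loop() are hypothetical names; error handling
 * is mostly elided. */
#if 0
static int on_sigterm(sd_event_source *s, const struct signalfd_siginfo *si, void *userdata) {
        /* Ask the loop to finish; sd_event_loop() then returns the exit code passed here. */
        return sd_event_exit(sd_event_source_get_event(s), 0);
}

static int example_main_loop(void) {
        _cleanup_(sd_event_unrefp) sd_event *e = NULL;
        sigset_t mask;
        int r;

        r = sd_event_default(&e);
        if (r < 0)
                return r;

        /* The signal must be blocked before it can be collected via signalfd. */
        assert_se(sigemptyset(&mask) >= 0);
        assert_se(sigaddset(&mask, SIGTERM) >= 0);
        assert_se(sigprocmask(SIG_BLOCK, &mask, NULL) >= 0);

        r = sd_event_add_signal(e, NULL, SIGTERM, on_sigterm, NULL);
        if (r < 0)
                return r;

        /* Each iteration runs sd_event_prepare(), sd_event_wait() and sd_event_dispatch(), as implemented above. */
        return sd_event_loop(e);
}
#endif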
4264 
4265 _public_ int sd_event_get_fd(sd_event *e) {
4266         assert_return(e, -EINVAL);
4267         assert_return(e = event_resolve(e), -ENOPKG);
4268         assert_return(!event_pid_changed(e), -ECHILD);
4269 
4270         return e->epoll_fd;
4271 }
4272 
4273 _public_ int sd_event_get_state(sd_event *e) {
4274         assert_return(e, -EINVAL);
4275         assert_return(e = event_resolve(e), -ENOPKG);
4276         assert_return(!event_pid_changed(e), -ECHILD);
4277 
4278         return e->state;
4279 }
4280 
4281 _public_ int sd_event_get_exit_code(sd_event *e, int *code) {
4282         assert_return(e, -EINVAL);
4283         assert_return(e = event_resolve(e), -ENOPKG);
4284         assert_return(code, -EINVAL);
4285         assert_return(!event_pid_changed(e), -ECHILD);
4286 
4287         if (!e->exit_requested)
4288                 return -ENODATA;
4289 
4290         *code = e->exit_code;
4291         return 0;
4292 }
4293 
4294 _public_ int sd_event_exit(sd_event *e, int code) {
4295         assert_return(e, -EINVAL);
4296         assert_return(e = event_resolve(e), -ENOPKG);
4297         assert_return(e->state != SD_EVENT_FINISHED, -ESTALE);
4298         assert_return(!event_pid_changed(e), -ECHILD);
4299 
4300         e->exit_requested = true;
4301         e->exit_code = code;
4302 
4303         return 0;
4304 }
4305 
4306 _public_ int sd_event_now(sd_event *e, clockid_t clock, uint64_t *usec) {
4307         assert_return(e, -EINVAL);
4308         assert_return(e = event_resolve(e), -ENOPKG);
4309         assert_return(usec, -EINVAL);
4310         assert_return(!event_pid_changed(e), -ECHILD);
4311 
4312         if (!TRIPLE_TIMESTAMP_HAS_CLOCK(clock))
4313                 return -EOPNOTSUPP;
4314 
4315         if (!triple_timestamp_is_set(&e->timestamp)) {
4316                 /* Implicitly fall back to now() if we never ran before and thus have no cached time. */
4317                 *usec = now(clock);
4318                 return 1;
4319         }
4320 
4321         *usec = triple_timestamp_by_clock(&e->timestamp, clock);
4322         return 0;
4323 }
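
/* Illustration (not part of the original source): the usual pattern for a periodic timer built on top of
 * sd_event_now(), rearming itself from within its own callback. on_tick() and the 10s interval are
 * hypothetical. */
#if 0
static int on_tick(sd_event_source *s, uint64_t usec, void *userdata) {
        uint64_t n;

        /* Base the next expiry on the loop's cached timestamp (or now() if the loop never ran), then
         * re-enable the one-shot time source. */
        assert_se(sd_event_now(sd_event_source_get_event(s), CLOCK_MONOTONIC, &n) >= 0);
        assert_se(sd_event_source_set_time(s, n + 10 * USEC_PER_SEC) >= 0);
        assert_se(sd_event_source_set_enabled(s, SD_EVENT_ONESHOT) >= 0);

        return 0;
}
#endif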
4324 
4325 _public_ int sd_event_default(sd_event **ret) {
4326         sd_event *e = NULL;
4327         int r;
4328 
4329         if (!ret)
4330                 return !!default_event;
4331 
4332         if (default_event) {
4333                 *ret = sd_event_ref(default_event);
4334                 return 0;
4335         }
4336 
4337         r = sd_event_new(&e);
4338         if (r < 0)
4339                 return r;
4340 
4341         e->default_event_ptr = &default_event;
4342         e->tid = gettid();
4343         default_event = e;
4344 
4345         *ret = e;
4346         return 1;
4347 }
4348 
4349 _public_ int sd_event_get_tid(sd_event *e, pid_t *tid) {
4350         assert_return(e, -EINVAL);
4351         assert_return(e = event_resolve(e), -ENOPKG);
4352         assert_return(tid, -EINVAL);
4353         assert_return(!event_pid_changed(e), -ECHILD);
4354 
4355         if (e->tid != 0) {
4356                 *tid = e->tid;
4357                 return 0;
4358         }
4359 
4360         return -ENXIO;
4361 }
4362 
4363 _public_ int sd_event_set_watchdog(sd_event *e, int b) {
4364         int r;
4365 
4366         assert_return(e, -EINVAL);
4367         assert_return(e = event_resolve(e), -ENOPKG);
4368         assert_return(!event_pid_changed(e), -ECHILD);
4369 
4370         if (e->watchdog == !!b)
4371                 return e->watchdog;
4372 
4373         if (b) {
4374                 r = sd_watchdog_enabled(false, &e->watchdog_period);
4375                 if (r <= 0)
4376                         return r;
4377 
4378                 /* Issue first ping immediately */
4379                 sd_notify(false, "WATCHDOG=1");
4380                 e->watchdog_last = now(CLOCK_MONOTONIC);
4381 
4382                 e->watchdog_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK|TFD_CLOEXEC);
4383                 if (e->watchdog_fd < 0)
4384                         return -errno;
4385 
4386                 r = arm_watchdog(e);
4387                 if (r < 0)
4388                         goto fail;
4389 
4390                 struct epoll_event ev = {
4391                         .events = EPOLLIN,
4392                         .data.ptr = INT_TO_PTR(SOURCE_WATCHDOG),
4393                 };
4394 
4395                 if (epoll_ctl(e->epoll_fd, EPOLL_CTL_ADD, e->watchdog_fd, &ev) < 0) {
4396                         r = -errno;
4397                         goto fail;
4398                 }
4399 
4400         } else {
4401                 if (e->watchdog_fd >= 0) {
4402                         (void) epoll_ctl(e->epoll_fd, EPOLL_CTL_DEL, e->watchdog_fd, NULL);
4403                         e->watchdog_fd = safe_close(e->watchdog_fd);
4404                 }
4405         }
4406 
4407         e->watchdog = !!b;
4408         return e->watchdog;
4409 
4410 fail:
4411         e->watchdog_fd = safe_close(e->watchdog_fd);
4412         return r;
4413 }
4414 
sd_event_get_watchdog(sd_event * e)4415 _public_ int sd_event_get_watchdog(sd_event *e) {
4416         assert_return(e, -EINVAL);
4417         assert_return(e = event_resolve(e), -ENOPKG);
4418         assert_return(!event_pid_changed(e), -ECHILD);
4419 
4420         return e->watchdog;
4421 }
4422 
sd_event_get_iteration(sd_event * e,uint64_t * ret)4423 _public_ int sd_event_get_iteration(sd_event *e, uint64_t *ret) {
4424         assert_return(e, -EINVAL);
4425         assert_return(e = event_resolve(e), -ENOPKG);
4426         assert_return(!event_pid_changed(e), -ECHILD);
4427 
4428         *ret = e->iteration;
4429         return 0;
4430 }
4431 
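/* The destroy callback is invoked with the source's userdata when the event source is finally freed, which
 * makes it a convenient hook for releasing userdata the source owns. */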
_public_ int sd_event_source_set_destroy_callback(sd_event_source *s, sd_event_destroy_t callback) {
        assert_return(s, -EINVAL);

        s->destroy_callback = callback;
        return 0;
}

_public_ int sd_event_source_get_destroy_callback(sd_event_source *s, sd_event_destroy_t *ret) {
        assert_return(s, -EINVAL);

        if (ret)
                *ret = s->destroy_callback;

        return !!s->destroy_callback;
}

_public_ int sd_event_source_get_floating(sd_event_source *s) {
        assert_return(s, -EINVAL);

        return s->floating;
}

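/* Switching a source to "floating" inverts the ownership between source and loop: a floating source is
 * pinned by a reference the event loop holds on it, while a non-floating source instead holds a reference
 * on its event loop. The ref/unref pair below performs that handover. */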
_public_ int sd_event_source_set_floating(sd_event_source *s, int b) {
        assert_return(s, -EINVAL);

        if (s->floating == !!b)
                return 0;

        if (!s->event) /* Already disconnected */
                return -ESTALE;

        s->floating = b;

        if (b) {
                sd_event_source_ref(s);
                sd_event_unref(s->event);
        } else {
                sd_event_ref(s->event);
                sd_event_source_unref(s);
        }

        return 1;
}

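/* With exit-on-failure set, an error returned from the source's handler takes the whole event loop down
 * (the loop exits with that error code) rather than merely disabling the failing source, which is the
 * default reaction. Not applicable to exit sources, hence the -EDOM below. */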
_public_ int sd_event_source_get_exit_on_failure(sd_event_source *s) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);

        return s->exit_on_failure;
}

_public_ int sd_event_source_set_exit_on_failure(sd_event_source *s, int b) {
        assert_return(s, -EINVAL);
        assert_return(s->type != SOURCE_EXIT, -EDOM);

        if (s->exit_on_failure == !!b)
                return 0;

        s->exit_on_failure = b;
        return 1;
}

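/* Rate limiting caps how often a source may be dispatched: once it fires more than "burst" times within
 * "interval" microseconds it is taken offline until the interval elapses. Illustrative use (a sketch, not
 * part of this file), allowing at most 10 dispatches per second:
 *
 *     r = sd_event_source_set_ratelimit(s, 1 * USEC_PER_SEC, 10);
 *     if (r < 0)
 *             return r;
 */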
_public_ int sd_event_source_set_ratelimit(sd_event_source *s, uint64_t interval, unsigned burst) {
        int r;

        assert_return(s, -EINVAL);

        /* Turning on ratelimiting on event source types that don't support it is a loggable offense. Doing
         * so is a programming error. */
        assert_return(EVENT_SOURCE_CAN_RATE_LIMIT(s->type), -EDOM);

        /* When ratelimiting is configured we'll always reset the rate limit state first and start fresh,
         * non-ratelimited. */
        r = event_source_leave_ratelimit(s, /* run_callback */ false);
        if (r < 0)
                return r;

        s->rate_limit = (RateLimit) { interval, burst };
        return 0;
}

_public_ int sd_event_source_set_ratelimit_expire_callback(sd_event_source *s, sd_event_handler_t callback) {
        assert_return(s, -EINVAL);

        s->ratelimit_expire_callback = callback;
        return 0;
}

_public_ int sd_event_source_get_ratelimit(sd_event_source *s, uint64_t *ret_interval, unsigned *ret_burst) {
        assert_return(s, -EINVAL);

        /* Querying whether an event source has ratelimiting configured is not a loggable offense, hence
         * don't use assert_return(). Unlike turning on ratelimiting, it's not really a programming error. */
        if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
                return -EDOM;

        if (!ratelimit_configured(&s->rate_limit))
                return -ENOEXEC;

        if (ret_interval)
                *ret_interval = s->rate_limit.interval;
        if (ret_burst)
                *ret_burst = s->rate_limit.burst;

        return 0;
}

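/* Reports whether the source is currently being held back by its rate limit; sources that cannot be or are
 * not rate limited simply report false rather than an error. */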
_public_ int sd_event_source_is_ratelimited(sd_event_source *s) {
        assert_return(s, -EINVAL);

        if (!EVENT_SOURCE_CAN_RATE_LIMIT(s->type))
                return false;

        if (!ratelimit_configured(&s->rate_limit))
                return false;

        return s->ratelimited;
}