1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2 
3 #include <fcntl.h>
4 #include <sys/stat.h>
5 #include <sys/types.h>
6 #include <unistd.h>
7 
8 #include "alloc-util.h"
9 #include "bpf-program.h"
10 #include "errno-util.h"
11 #include "escape.h"
12 #include "fd-util.h"
13 #include "memory-util.h"
14 #include "missing_syscall.h"
15 #include "path-util.h"
16 #include "serialize.h"
17 #include "string-table.h"
18 
/* Short textual names for the cgroup BPF attach types, indexed by the kernel's attach type constant.
 * The array is sized __MAX_BPF_ATTACH_TYPE; attach types without a designated initializer below are
 * left as NULL. These names are what bpf_program_serialize_attachment() writes out and what
 * bpf_program_deserialize_attachment() parses again, via the bpf_cgroup_attach_type_to_string() and
 * bpf_cgroup_attach_type_from_string() helpers (presumably generated by the
 * DEFINE_STRING_TABLE_LOOKUP() invocation below — confirm in string-table.h). */
static const char *const bpf_cgroup_attach_type_table[__MAX_BPF_ATTACH_TYPE] = {
        [BPF_CGROUP_INET_INGRESS] =     "ingress",
        [BPF_CGROUP_INET_EGRESS] =      "egress",
        [BPF_CGROUP_INET_SOCK_CREATE] = "sock_create",
        [BPF_CGROUP_SOCK_OPS] =         "sock_ops",
        [BPF_CGROUP_DEVICE] =           "device",
        [BPF_CGROUP_INET4_BIND] =       "bind4",
        [BPF_CGROUP_INET6_BIND] =       "bind6",
        [BPF_CGROUP_INET4_CONNECT] =    "connect4",
        [BPF_CGROUP_INET6_CONNECT] =    "connect6",
        [BPF_CGROUP_INET4_POST_BIND] =  "post_bind4",
        [BPF_CGROUP_INET6_POST_BIND] =  "post_bind6",
        [BPF_CGROUP_UDP4_SENDMSG] =     "sendmsg4",
        [BPF_CGROUP_UDP6_SENDMSG] =     "sendmsg6",
        [BPF_CGROUP_SYSCTL] =           "sysctl",
        [BPF_CGROUP_UDP4_RECVMSG] =     "recvmsg4",
        [BPF_CGROUP_UDP6_RECVMSG] =     "recvmsg6",
        [BPF_CGROUP_GETSOCKOPT] =       "getsockopt",
        [BPF_CGROUP_SETSOCKOPT] =       "setsockopt",
};

/* String <-> int conversion helpers for the table above. */
DEFINE_STRING_TABLE_LOOKUP(bpf_cgroup_attach_type, int);
41 
/* Hash operations for sets holding BPFProgram pointers (passed to set_ensure_consume() in
 * bpf_program_deserialize_attachment_set()). Hashing and comparison are delegated to the trivial_*
 * helpers (presumably pointer-identity based — confirm in hash-funcs.h), and bpf_program_free() is
 * registered as the key destructor. */
DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(bpf_program_hash_ops, void, trivial_hash_func, trivial_compare_func, bpf_program_free);
43 
/* Destroys a BPFProgram object together with everything it owns: the cgroup attachment (if any), the
 * kernel fd, the program name, the instruction buffer and the recorded attachment path. Passing NULL
 * is allowed and is a no-op. Returns the result of mfree(), so the call can be used in an assignment
 * to reset the caller's pointer. */
BPFProgram *bpf_program_free(BPFProgram *p) {
        if (p) {
                /* The kernel does not drop a program's cgroup attachment just because the last fd
                 * referring to the program goes away. A process that attached a program and then died
                 * abnormally therefore leaves that program pinned to the cgroup until the cgroup itself
                 * is removed. For the root cgroup, or for the cgroup of a service that cannot be
                 * restarted at runtime (dbus for example), that means the program's memory is only
                 * reclaimed by a reboot. To avoid this we remember where the program was attached and
                 * explicitly detach it before giving up the fd. A failure to detach is deliberately
                 * ignored here. */
                (void) bpf_program_cgroup_detach(p);

                free(p->attached_path);
                free(p->instructions);
                free(p->prog_name);
                safe_close(p->kernel_fd);
        }

        return mfree(p);
}
64 
65  /* struct bpf_prog_info info must be initialized since its value is both input and output
66   * for BPF_OBJ_GET_INFO_BY_FD syscall. */
bpf_program_get_info_by_fd(int prog_fd,struct bpf_prog_info * info,uint32_t info_len)67 static int bpf_program_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, uint32_t info_len) {
68         union bpf_attr attr;
69 
70         /* Explicitly memset to zero since some compilers may produce non-zero-initialized padding when
71          * structured initialization is used.
72          * Refer to https://github.com/systemd/systemd/issues/18164
73          */
74         zero(attr);
75         attr.info.bpf_fd = prog_fd;
76         attr.info.info_len = info_len;
77         attr.info.info = PTR_TO_UINT64(info);
78 
79         return RET_NERRNO(bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)));
80 }
81 
/* Allocates a new, empty BPFProgram of the given type that has not been loaded into the kernel yet
 * (kernel_fd is initialized to -1).
 *
 * prog_type: BPF program type to record in the object.
 * prog_name: optional name; if non-NULL it is duplicated and must be shorter than BPF_OBJ_NAME_LEN.
 * ret:       receives the new object on success; must not be NULL.
 *
 * Returns 0 on success, -ENAMETOOLONG if prog_name is too long, -ENOMEM on allocation failure. */
int bpf_program_new(uint32_t prog_type, const char *prog_name, BPFProgram **ret) {
        _cleanup_(bpf_program_freep) BPFProgram *p = NULL;
        _cleanup_free_ char *name = NULL;

        /* Same contract as bpf_program_new_from_bpffs_path(): the output pointer is mandatory, so catch
         * a NULL here instead of crashing on the final store. */
        assert(ret);

        if (prog_name) {
                /* The name has to fit, including the trailing NUL, into a BPF_OBJ_NAME_LEN sized field
                 * when the program is loaded later on. */
                if (strlen(prog_name) >= BPF_OBJ_NAME_LEN)
                        return -ENAMETOOLONG;

                name = strdup(prog_name);
                if (!name)
                        return -ENOMEM;
        }

        p = new(BPFProgram, 1);
        if (!p)
                return -ENOMEM;

        *p = (BPFProgram) {
                .prog_type = prog_type,
                .kernel_fd = -1,
                .prog_name = TAKE_PTR(name),
        };

        *ret = TAKE_PTR(p);

        return 0;
}
109 
/* Creates a BPFProgram object for a program that is already pinned in a BPF file system.
 *
 * path: location of the pinned program; must not be NULL.
 * ret:  receives the new object on success; must not be NULL.
 *
 * The object's kernel_fd is obtained through bpf_program_load_from_bpf_fs() and its prog_type is
 * filled in from what the kernel reports for that fd.
 *
 * Returns 0 on success, -ENOMEM on allocation failure, or the negative error returned by either of
 * the two helper calls. */
int bpf_program_new_from_bpffs_path(const char *path, BPFProgram **ret) {
        _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
        struct bpf_prog_info prog_info = {};
        int r;

        assert(path);
        assert(ret);

        prog = new(BPFProgram, 1);
        if (!prog)
                return -ENOMEM;

        /* The type is not known until we have asked the kernel, start out with "unspecified". */
        *prog = (BPFProgram) {
                .kernel_fd = -1,
                .prog_type = BPF_PROG_TYPE_UNSPEC,
        };

        r = bpf_program_load_from_bpf_fs(prog, path);
        if (r >= 0)
                r = bpf_program_get_info_by_fd(prog->kernel_fd, &prog_info, sizeof(prog_info));
        if (r < 0)
                return r;

        prog->prog_type = prog_info.type;

        *ret = TAKE_PTR(prog);
        return 0;
}
140 
141 
/* Appends count instructions to the in-memory instruction buffer of p.
 *
 * Returns 0 on success, -EBUSY if the program already has a kernel fd, -ENOMEM if the buffer cannot
 * be grown. */
int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *instructions, size_t count) {
        size_t n_total;

        assert(p);

        /* Once something has been uploaded to the kernel the instruction list is frozen. */
        if (p->kernel_fd >= 0)
                return -EBUSY;

        n_total = p->n_instructions + count;
        if (!GREEDY_REALLOC(p->instructions, n_total))
                return -ENOMEM;

        /* Copy the new instructions right behind the ones we already have. */
        memcpy(p->instructions + p->n_instructions, instructions, count * sizeof(struct bpf_insn));
        p->n_instructions = n_total;

        return 0;
}
157 
/* Uploads the program's instructions to the kernel with BPF_PROG_LOAD and stores the resulting fd in
 * p->kernel_fd. The program is always submitted with a "GPL" license string.
 *
 * log_buf/log_size: optional buffer handed to the kernel for its load log; the log level is set to 1
 *                   if and only if log_buf is non-NULL.
 *
 * Calling this on a program that already has a kernel fd does nothing except clearing the log buffer.
 *
 * Returns 0 on success (or if already loaded), -errno if the bpf() call fails. */
int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size) {
        union bpf_attr attr;

        assert(p);

        /* Already loaded? Then behave idempotently and hand back an empty log. */
        if (p->kernel_fd >= 0) {
                memzero(log_buf, log_size);
                return 0;
        }

        /* FIXME: Clang doesn't 0-pad with structured initialization, causing the kernel to reject the
         * bpf_attr as invalid. See:
         * https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65
         * Ideally it should behave like GCC, so that we can remove these workarounds. */
        zero(attr);
        attr.prog_type = p->prog_type;
        attr.license = PTR_TO_UINT64("GPL");
        attr.insns = PTR_TO_UINT64(p->instructions);
        attr.insn_cnt = p->n_instructions;
        attr.log_level = !!log_buf;
        attr.log_buf = PTR_TO_UINT64(log_buf);
        attr.log_size = log_size;

        /* attr was zeroed above, hence copying at most BPF_OBJ_NAME_LEN - 1 bytes leaves the name
         * field NUL terminated. */
        if (p->prog_name)
                strncpy(attr.prog_name, p->prog_name, BPF_OBJ_NAME_LEN - 1);

        p->kernel_fd = bpf(BPF_PROG_LOAD, &attr, sizeof(attr));

        return p->kernel_fd < 0 ? -errno : 0;
}
189 
/* Obtains a kernel fd for the BPF object pinned at path (BPF_OBJ_GET) and stores it in p->kernel_fd.
 *
 * Returns 0 on success, -EBUSY if p already has a kernel fd, -errno if the bpf() call fails. */
int bpf_program_load_from_bpf_fs(BPFProgram *p, const char *path) {
        union bpf_attr attr;

        assert(p);

        /* Never replace a program that has already been assembled or loaded. */
        if (p->kernel_fd >= 0)
                return -EBUSY;

        zero(attr);
        attr.pathname = PTR_TO_UINT64(path);

        p->kernel_fd = bpf(BPF_OBJ_GET, &attr, sizeof(attr));

        return p->kernel_fd < 0 ? -errno : 0;
}
207 
/* Attaches the program to the cgroup directory at path, loading it into the kernel first if needed,
 * and records the attachment (path, type, flags) in p.
 *
 * type:  attach type, must be >= 0.
 * path:  directory to attach to; opened with O_DIRECTORY.
 * flags: must be exactly 0, BPF_F_ALLOW_OVERRIDE or BPF_F_ALLOW_MULTI.
 *
 * Returns 0 on success, -EINVAL for unsupported flags, -EBUSY if the program is already recorded as
 * attached with a different path, type or flags, -ENOMEM on allocation failure, or a negative error
 * from loading the program, opening the directory or the attach call itself. */
int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_t flags) {
        _cleanup_free_ char *path_copy = NULL;
        _cleanup_close_ int cgroup_fd = -1;
        union bpf_attr attr;
        int r;

        assert(p);
        assert(type >= 0);
        assert(path);

        if (!IN_SET(flags, 0, BPF_F_ALLOW_OVERRIDE, BPF_F_ALLOW_MULTI))
                return -EINVAL;

        if (p->attached_path) {
                /* Only a single attachment can be tracked per program, so anything that doesn't match
                 * the recorded one is refused right away. */
                if (!path_equal(p->attached_path, path) ||
                    p->attached_type != type ||
                    p->attached_flags != flags)
                        return -EBUSY;

                /* Identical to what we attached before, so normally there is nothing left to do. The
                 * exception is BPF_F_ALLOW_OVERRIDE: in that mode somebody else may have replaced our
                 * program in the meantime, hence attach again to be on the safe side. With flags == 0
                 * nobody else can replace our program, and with BPF_F_ALLOW_MULTI other programs are
                 * installed next to ours, so in both cases ours is still in effect. */
                if (flags != BPF_F_ALLOW_OVERRIDE)
                        return 0;
        }

        /* Make sure a kernel object exists for this program. */
        r = bpf_program_load_kernel(p, NULL, 0);
        if (r < 0)
                return r;

        /* Prepare the copy of the path up front, so that recording the attachment below cannot fail
         * once the kernel has accepted it. */
        path_copy = strdup(path);
        if (!path_copy)
                return -ENOMEM;

        cgroup_fd = open(path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
        if (cgroup_fd < 0)
                return -errno;

        zero(attr);
        attr.attach_bpf_fd = p->kernel_fd;
        attr.target_fd = cgroup_fd;
        attr.attach_type = type;
        attr.attach_flags = flags;

        if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0)
                return -errno;

        /* Remember where and how we attached, so that we can detach again later. */
        free_and_replace(p->attached_path, path_copy);
        p->attached_flags = flags;
        p->attached_type = type;

        return 0;
}
269 
/* Detaches the program from the cgroup it is recorded as attached to and forgets the attachment.
 *
 * Returns 0 on success (including the case where the recorded directory no longer exists), -EUNATCH
 * if no attachment is recorded, or -errno if opening the directory or the detach call fails. On
 * failure the recorded attachment is left in place. */
int bpf_program_cgroup_detach(BPFProgram *p) {
        _cleanup_close_ int cgroup_fd = -1;

        assert(p);

        if (!p->attached_path)
                return -EUNATCH;

        cgroup_fd = open(p->attached_path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
        if (cgroup_fd >= 0) {
                union bpf_attr attr;

                zero(attr);
                attr.attach_bpf_fd = p->kernel_fd;
                attr.target_fd = cgroup_fd;
                attr.attach_type = p->attached_type;

                if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0)
                        return -errno;

        } else if (errno != ENOENT)
                return -errno;

        /* If we get here with ENOENT the cgroup is gone already. Its removal detached the program
         * implicitly, so there is nothing to do for us and nothing to complain about. */

        p->attached_path = mfree(p->attached_path);

        return 0;
}
302 
bpf_map_new(enum bpf_map_type type,size_t key_size,size_t value_size,size_t max_entries,uint32_t flags)303 int bpf_map_new(enum bpf_map_type type, size_t key_size, size_t value_size, size_t max_entries, uint32_t flags) {
304         union bpf_attr attr;
305 
306         zero(attr);
307         attr.map_type = type;
308         attr.key_size = key_size;
309         attr.value_size = value_size;
310         attr.max_entries = max_entries;
311         attr.map_flags = flags;
312 
313         return RET_NERRNO(bpf(BPF_MAP_CREATE, &attr, sizeof(attr)));
314 }
315 
bpf_map_update_element(int fd,const void * key,void * value)316 int bpf_map_update_element(int fd, const void *key, void *value) {
317         union bpf_attr attr;
318 
319         zero(attr);
320         attr.map_fd = fd;
321         attr.key = PTR_TO_UINT64(key);
322         attr.value = PTR_TO_UINT64(value);
323 
324         return RET_NERRNO(bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)));
325 }
326 
bpf_map_lookup_element(int fd,const void * key,void * value)327 int bpf_map_lookup_element(int fd, const void *key, void *value) {
328         union bpf_attr attr;
329 
330         zero(attr);
331         attr.map_fd = fd;
332         attr.key = PTR_TO_UINT64(key);
333         attr.value = PTR_TO_UINT64(value);
334 
335         return RET_NERRNO(bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)));
336 }
337 
bpf_program_pin(int prog_fd,const char * bpffs_path)338 int bpf_program_pin(int prog_fd, const char *bpffs_path) {
339         union bpf_attr attr;
340 
341         zero(attr);
342         attr.pathname = PTR_TO_UINT64((void *) bpffs_path);
343         attr.bpf_fd = prog_fd;
344 
345         return RET_NERRNO(bpf(BPF_OBJ_PIN, &attr, sizeof(attr)));
346 }
347 
bpf_program_get_id_by_fd(int prog_fd,uint32_t * ret_id)348 int bpf_program_get_id_by_fd(int prog_fd, uint32_t *ret_id) {
349         struct bpf_prog_info info = {};
350         int r;
351 
352         assert(ret_id);
353 
354         r = bpf_program_get_info_by_fd(prog_fd, &info, sizeof(info));
355         if (r < 0)
356                 return r;
357 
358         *ret_id = info.id;
359 
360         return 0;
361 };
362 
/* Writes the cgroup attachment of p to the serialization stream f under the given key, as
 * "<fd> <attach type name> <escaped path>", where <fd> is a duplicate of the program's kernel fd that
 * has been added to fds.
 *
 * Does nothing and returns 0 if p is NULL or has no recorded attachment. On success the recorded
 * attachment path is dropped from p (see below). Returns 0 on success, -ENOMEM if escaping the path
 * fails, or the negative error from duplicating the fd or writing the item. */
int bpf_program_serialize_attachment(
                FILE *f,
                FDSet *fds,
                const char *key,
                BPFProgram *p) {

        _cleanup_free_ char *path_escaped = NULL;
        int dup_fd, r;

        if (!p)
                return 0;
        if (!p->attached_path)
                return 0;

        assert(p->kernel_fd >= 0);

        path_escaped = cescape(p->attached_path);
        if (!path_escaped)
                return -ENOMEM;

        dup_fd = fdset_put_dup(fds, p->kernel_fd);
        if (dup_fd < 0)
                return log_error_errno(dup_fd, "Failed to add BPF kernel fd to serialize: %m");

        r = serialize_item_format(
                        f,
                        key,
                        "%i %s %s",
                        dup_fd,
                        bpf_cgroup_attach_type_to_string(p->attached_type),
                        path_escaped);
        if (r < 0)
                return r;

        /* From here on the attachment belongs to the serialization rather than to this object, so we
         * forget about it. The reason is BPF's lifecycle handling: a program is not detached from its
         * cgroup on close(), it has to be detached explicitly, and bpf_program_free() does exactly
         * that whenever attached_path is set. Since the attachment is supposed to survive until it is
         * deserialized again, clear attached_path so that freeing this object leaves it alone. */
        p->attached_path = mfree(p->attached_path);

        return 0;
}
406 
/* Serializes the attachment of every BPFProgram in set by calling bpf_program_serialize_attachment()
 * with the same stream, fd set and key for each. Stops at the first failure and returns its negative
 * error; returns 0 if all entries were processed. */
int bpf_program_serialize_attachment_set(FILE *f, FDSet *fds, const char *key, Set *set) {
        BPFProgram *prog;

        SET_FOREACH(prog, set) {
                int r;

                r = bpf_program_serialize_attachment(f, fds, key, prog);
                if (r < 0)
                        return r;
        }

        return 0;
}
419 
/* Parses a value in the "<fd> <attach type name> <escaped path>" format written by
 * bpf_program_serialize_attachment() and turns it into a BPFProgram object that owns the fd (taken
 * out of fds) and is recorded as attached to the given path with the given type.
 *
 * v:    serialized value; must not be NULL.
 * fds:  fd set the serialized fd number is removed from.
 * bpfp: in/out; a program already stored there is freed and replaced. Must not be NULL.
 *
 * Returns 0 on success, -EINVAL if a field is missing, -EBADF for a negative fd number, -ENOMEM on
 * allocation failure, or the negative error of whichever parsing or lookup step failed. *bpfp is
 * only touched on success. */
int bpf_program_deserialize_attachment(const char *v, FDSet *fds, BPFProgram **bpfp) {
        _cleanup_free_ char *fd_word = NULL, *type_word = NULL, *path = NULL;
        _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
        _cleanup_close_ int fd = -1;
        int fd_nr, attach_type, r;
        ssize_t n;

        assert(v);
        assert(bpfp);

        /* Field 1: the fd number */
        r = extract_first_word(&v, &fd_word, NULL, 0);
        if (r <= 0)
                return r < 0 ? r : -EINVAL;

        r = safe_atoi(fd_word, &fd_nr);
        if (r < 0)
                return r;
        if (fd_nr < 0)
                return -EBADF;

        /* Field 2: the attach type, by name */
        r = extract_first_word(&v, &type_word, NULL, 0);
        if (r <= 0)
                return r < 0 ? r : -EINVAL;

        attach_type = bpf_cgroup_attach_type_from_string(type_word);
        if (attach_type < 0)
                return attach_type;

        /* Field 3: whatever remains is the escaped path */
        n = cunescape(v, 0, &path);
        if (n < 0)
                return n;

        /* Only take the fd out of the set once everything else has parsed fine. */
        fd = fdset_remove(fds, fd_nr);
        if (fd < 0)
                return fd;

        prog = new(BPFProgram, 1);
        if (!prog)
                return -ENOMEM;

        /* The program type and attach flags are not part of the serialized data, hence they are left
         * at BPF_PROG_TYPE_UNSPEC and zero respectively. */
        *prog = (BPFProgram) {
                .attached_type = attach_type,
                .attached_path = TAKE_PTR(path),
                .prog_type = BPF_PROG_TYPE_UNSPEC,
                .kernel_fd = TAKE_FD(fd),
        };

        /* Drop whatever was stored before; bpf_program_free() copes with NULL. */
        bpf_program_free(*bpfp);

        *bpfp = TAKE_PTR(prog);
        return 0;
}
480 
/* Deserializes a single attachment from v (see bpf_program_deserialize_attachment()) and hands the
 * resulting program to set_ensure_consume() for insertion into *bpfsetp, using bpf_program_hash_ops.
 *
 * NOTE(review): going by its name, set_ensure_consume() presumably takes ownership of the program
 * whether or not the insertion succeeds — confirm in set.h.
 *
 * Returns 0 on success, or the negative error of the step that failed. */
int bpf_program_deserialize_attachment_set(const char *v, FDSet *fds, Set **bpfsetp) {
        BPFProgram *prog = NULL;
        int r;

        assert(v);
        assert(bpfsetp);

        r = bpf_program_deserialize_attachment(v, fds, &prog);
        if (r < 0)
                return r;

        r = set_ensure_consume(bpfsetp, &bpf_program_hash_ops, prog);

        /* Non-negative results all count as success for our callers. */
        return r < 0 ? r : 0;
}
498