1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <fcntl.h>
4 #include <sys/stat.h>
5 #include <sys/types.h>
6 #include <unistd.h>
7
8 #include "alloc-util.h"
9 #include "bpf-program.h"
10 #include "errno-util.h"
11 #include "escape.h"
12 #include "fd-util.h"
13 #include "memory-util.h"
14 #include "missing_syscall.h"
15 #include "path-util.h"
16 #include "serialize.h"
17 #include "string-table.h"
18
/* String names of the cgroup BPF attach types, indexed by the BPF_CGROUP_* attach type constants.
 * Indexes below __MAX_BPF_ATTACH_TYPE that are not listed here are implicitly NULL. */
static const char *const bpf_cgroup_attach_type_table[__MAX_BPF_ATTACH_TYPE] = {
        [BPF_CGROUP_INET_INGRESS] = "ingress",
        [BPF_CGROUP_INET_EGRESS] = "egress",
        [BPF_CGROUP_INET_SOCK_CREATE] = "sock_create",
        [BPF_CGROUP_SOCK_OPS] = "sock_ops",
        [BPF_CGROUP_DEVICE] = "device",
        [BPF_CGROUP_INET4_BIND] = "bind4",
        [BPF_CGROUP_INET6_BIND] = "bind6",
        [BPF_CGROUP_INET4_CONNECT] = "connect4",
        [BPF_CGROUP_INET6_CONNECT] = "connect6",
        [BPF_CGROUP_INET4_POST_BIND] = "post_bind4",
        [BPF_CGROUP_INET6_POST_BIND] = "post_bind6",
        [BPF_CGROUP_UDP4_SENDMSG] = "sendmsg4",
        [BPF_CGROUP_UDP6_SENDMSG] = "sendmsg6",
        [BPF_CGROUP_SYSCTL] = "sysctl",
        [BPF_CGROUP_UDP4_RECVMSG] = "recvmsg4",
        [BPF_CGROUP_UDP6_RECVMSG] = "recvmsg6",
        [BPF_CGROUP_GETSOCKOPT] = "getsockopt",
        [BPF_CGROUP_SETSOCKOPT] = "setsockopt",
};

/* Presumably expands to bpf_cgroup_attach_type_to_string() and bpf_cgroup_attach_type_from_string(), both
 * of which the attachment (de)serialization code in this file calls — confirm against string-table.h. */
DEFINE_STRING_TABLE_LOOKUP(bpf_cgroup_attach_type, int);

/* Hash ops passed to set_ensure_consume() in bpf_program_deserialize_attachment_set().
 * NOTE(review): going by the macro name, bpf_program_free() is registered as the key destructor, so that
 * sets using these ops release their programs — confirm against the hashmap headers. */
DEFINE_HASH_OPS_WITH_KEY_DESTRUCTOR(bpf_program_hash_ops, void, trivial_hash_func, trivial_compare_func, bpf_program_free);
43
/* Releases a BPFProgram object and everything it owns. Accepts NULL. Always returns NULL, so that the
 * result can be assigned back to the pointer that was freed. */
BPFProgram *bpf_program_free(BPFProgram *p) {
        if (p) {
                /* The kernel does not implicitly detach a BPF program from its cgroup when the last fd
                 * referring to the program is closed. A process that terminates abnormally after
                 * attaching a program hence leaves it pinned to the cgroup, with no way to recover it
                 * until the cgroup itself goes away. That is particularly bad for the root cgroup, or
                 * for the cgroup of a service that cannot be restarted at runtime (such as dbus), since
                 * the memory of the program can then only be reclaimed by rebooting. As a countermeasure
                 * we remember which cgroup a program was attached to, and detach it ourselves before we
                 * close the BPF fd. Failures are ignored on purpose. */
                (void) bpf_program_cgroup_detach(p);

                /* The detach call above still needs the fd, hence close it only now. */
                safe_close(p->kernel_fd);

                free(p->attached_path);
                free(p->instructions);
                free(p->prog_name);
                free(p);
        }

        return NULL;
}
64
65 /* struct bpf_prog_info info must be initialized since its value is both input and output
66 * for BPF_OBJ_GET_INFO_BY_FD syscall. */
bpf_program_get_info_by_fd(int prog_fd,struct bpf_prog_info * info,uint32_t info_len)67 static int bpf_program_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, uint32_t info_len) {
68 union bpf_attr attr;
69
70 /* Explicitly memset to zero since some compilers may produce non-zero-initialized padding when
71 * structured initialization is used.
72 * Refer to https://github.com/systemd/systemd/issues/18164
73 */
74 zero(attr);
75 attr.info.bpf_fd = prog_fd;
76 attr.info.info_len = info_len;
77 attr.info.info = PTR_TO_UINT64(info);
78
79 return RET_NERRNO(bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)));
80 }
81
/* Allocates a new, empty BPFProgram of the given program type that is not loaded into the kernel yet
 * (kernel_fd is set to -1).
 *
 * prog_name is optional. If given, it is copied and must be shorter than BPF_OBJ_NAME_LEN.
 *
 * On success 0 is returned and the new object is stored in *ret; the caller owns it and releases it
 * with bpf_program_free(). Returns -ENAMETOOLONG if prog_name is too long, -ENOMEM on allocation
 * failure. */
int bpf_program_new(uint32_t prog_type, const char *prog_name, BPFProgram **ret) {
        _cleanup_(bpf_program_freep) BPFProgram *p = NULL;
        _cleanup_free_ char *name = NULL;

        /* Validate the output parameter up front, like bpf_program_new_from_bpffs_path() does, rather
         * than crashing on the final store after all the work was done. */
        assert(ret);

        if (prog_name) {
                if (strlen(prog_name) >= BPF_OBJ_NAME_LEN)
                        return -ENAMETOOLONG;

                name = strdup(prog_name);
                if (!name)
                        return -ENOMEM;
        }

        p = new(BPFProgram, 1);
        if (!p)
                return -ENOMEM;

        /* The compound literal zero-initializes all fields that are not listed. */
        *p = (BPFProgram) {
                .prog_type = prog_type,
                .kernel_fd = -1,
                .prog_name = TAKE_PTR(name),
        };

        *ret = TAKE_PTR(p);

        return 0;
}
109
/* Creates a BPFProgram object for a program that is opened via the bpffs path 'path'. The program type
 * is taken from the information the kernel reports for the opened fd.
 *
 * Returns 0 and stores the new object in *ret on success, -ENOMEM on allocation failure, or the
 * negative error of the open/query step that failed. */
int bpf_program_new_from_bpffs_path(const char *path, BPFProgram **ret) {
        _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
        struct bpf_prog_info info = {};
        int r;

        assert(path);
        assert(ret);

        prog = new(BPFProgram, 1);
        if (!prog)
                return -ENOMEM;

        /* The real type is only known once we queried the kernel, below. */
        *prog = (BPFProgram) {
                .prog_type = BPF_PROG_TYPE_UNSPEC,
                .kernel_fd = -1,
        };

        r = bpf_program_load_from_bpf_fs(prog, path);
        if (r >= 0)
                r = bpf_program_get_info_by_fd(prog->kernel_fd, &info, sizeof(info));
        if (r < 0)
                return r;

        prog->prog_type = info.type;

        *ret = TAKE_PTR(prog);
        return 0;
}
140
141
/* Appends 'count' instructions taken from 'instructions' to the instruction buffer of 'p'.
 *
 * Returns 0 on success, -EBUSY if the program already has a kernel fd (after which it may not be
 * modified anymore), -ENOMEM if the buffer cannot be enlarged or the total instruction count would
 * overflow. */
int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *instructions, size_t count) {
        size_t total;

        assert(p);
        assert(instructions || count == 0);

        if (p->kernel_fd >= 0) /* don't allow modification after we uploaded things to the kernel */
                return -EBUSY;

        /* Nothing to append. Leave early, so that a NULL source pointer is never handed to memcpy(),
         * which is undefined behaviour even when the size is zero. */
        if (count == 0)
                return 0;

        /* Unsigned addition wraps around on overflow, in which case the sum is smaller than either
         * operand. NOTE(review): assumes n_instructions is a size_t — confirm against bpf-program.h. */
        total = p->n_instructions + count;
        if (total < count)
                return -ENOMEM;

        if (!GREEDY_REALLOC(p->instructions, total))
                return -ENOMEM;

        memcpy(p->instructions + p->n_instructions, instructions, sizeof(struct bpf_insn) * count);
        p->n_instructions = total;

        return 0;
}
157
/* Uploads the assembled instructions of 'p' to the kernel via BPF_PROG_LOAD, using "GPL" as license
 * string, and stores the resulting fd in p->kernel_fd. If log_buf is non-NULL it is passed to the
 * kernel as log buffer of log_size bytes, with the log level set to 1.
 *
 * If the program already has a kernel fd nothing is loaded; the log buffer is zeroed and 0 is returned.
 * Returns 0 on success, -errno if the bpf() call fails. */
int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size) {
        union bpf_attr attr;

        assert(p);

        if (p->kernel_fd >= 0) {
                /* Already loaded, make this idempotent. */
                memzero(log_buf, log_size);
                return 0;
        }

        /* FIXME: Clang doesn't 0-pad with structured initialization, causing the kernel to reject the
         * bpf_attr as invalid. See:
         * https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65
         * Ideally it should behave like GCC, so that we can remove these workarounds. */
        zero(attr);

        attr.prog_type = p->prog_type;
        attr.license = PTR_TO_UINT64("GPL");

        attr.insns = PTR_TO_UINT64(p->instructions);
        attr.insn_cnt = p->n_instructions;

        attr.log_buf = PTR_TO_UINT64(log_buf);
        attr.log_size = log_size;
        attr.log_level = log_buf ? 1 : 0;

        /* attr was zeroed above and at most BPF_OBJ_NAME_LEN - 1 bytes are copied, hence the name is
         * always NUL terminated. */
        if (p->prog_name)
                strncpy(attr.prog_name, p->prog_name, BPF_OBJ_NAME_LEN - 1);

        p->kernel_fd = bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
        return p->kernel_fd < 0 ? -errno : 0;
}
189
/* Opens the BPF object at bpffs path 'path' with BPF_OBJ_GET and stores the resulting fd in
 * p->kernel_fd.
 *
 * Returns 0 on success, -EBUSY if 'p' already has a kernel fd, -errno if the bpf() call fails. */
int bpf_program_load_from_bpf_fs(BPFProgram *p, const char *path) {
        union bpf_attr req;

        assert(p);

        /* Don't overwrite an assembled or loaded program. */
        if (p->kernel_fd >= 0)
                return -EBUSY;

        zero(req);
        req.pathname = PTR_TO_UINT64(path);

        p->kernel_fd = bpf(BPF_OBJ_GET, &req, sizeof(req));
        return p->kernel_fd < 0 ? -errno : 0;
}
207
/* Attaches program 'p' to the cgroup directory 'path' using attach type 'type' and attach flags
 * 'flags', loading the program into the kernel first if that has not happened yet. On success the
 * attachment (path, type, flags) is recorded in 'p'.
 *
 * Returns 0 on success, -EINVAL for unsupported flags, -EBUSY if the program is already attached with
 * different parameters, -ENOMEM on allocation failure, or -errno/the error of the failing step. */
int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_t flags) {
        _cleanup_free_ char *path_copy = NULL;
        _cleanup_close_ int cgroup_fd = -1;
        union bpf_attr attr;
        int r;

        assert(p);
        assert(type >= 0);
        assert(path);

        if (!IN_SET(flags, 0, BPF_F_ALLOW_OVERRIDE, BPF_F_ALLOW_MULTI))
                return -EINVAL;

        if (p->attached_path) {
                /* Only a single attachment can be tracked per program, hence refuse early if this
                 * request differs in any way from the attachment we already know about. */
                if (!path_equal(p->attached_path, path) ||
                    p->attached_type != type ||
                    p->attached_flags != flags)
                        return -EBUSY;

                /* Shortcut: the very same attachment was established before, so usually there is
                 * nothing left to do. The exception is BPF_F_ALLOW_OVERRIDE mode, where somebody else
                 * might have replaced our program in the meantime; in that case attach again, to be
                 * safe. With flags == 0 nobody else can replace our program, and with
                 * BPF_F_ALLOW_MULTI other programs are installed in addition to ours, so in both of
                 * those modes ours remains in effect. */
                if (flags != BPF_F_ALLOW_OVERRIDE)
                        return 0;
        }

        /* Make sure there is a kernel object we can attach. */
        r = bpf_program_load_kernel(p, NULL, 0);
        if (r < 0)
                return r;

        path_copy = strdup(path);
        if (!path_copy)
                return -ENOMEM;

        cgroup_fd = open(path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
        if (cgroup_fd < 0)
                return -errno;

        zero(attr);
        attr.attach_bpf_fd = p->kernel_fd;
        attr.target_fd = cgroup_fd;
        attr.attach_type = type;
        attr.attach_flags = flags;

        if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0)
                return -errno;

        /* Remember what we attached to, so that we can detach again later on. */
        free_and_replace(p->attached_path, path_copy);
        p->attached_flags = flags;
        p->attached_type = type;

        return 0;
}
269
/* Detaches program 'p' from the cgroup it was recorded as being attached to, and forgets the recorded
 * path.
 *
 * Returns 0 on success (also if the cgroup directory is gone already), -EUNATCH if no attachment is
 * recorded, -errno if opening the cgroup or the BPF_PROG_DETACH call fails. */
int bpf_program_cgroup_detach(BPFProgram *p) {
        _cleanup_close_ int cgroup_fd = -1;

        assert(p);

        if (!p->attached_path)
                return -EUNATCH;

        cgroup_fd = open(p->attached_path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
        if (cgroup_fd >= 0) {
                union bpf_attr attr;

                zero(attr);
                attr.attach_bpf_fd = p->kernel_fd;
                attr.target_fd = cgroup_fd;
                attr.attach_type = p->attached_type;

                if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0)
                        return -errno;

        } else if (errno != ENOENT)
                return -errno;

        /* In the ENOENT case the cgroup does not exist anymore: the program got detached implicitly by
         * the removal, so there is nothing to detach explicitly and nothing to complain about. */

        p->attached_path = mfree(p->attached_path);

        return 0;
}
302
bpf_map_new(enum bpf_map_type type,size_t key_size,size_t value_size,size_t max_entries,uint32_t flags)303 int bpf_map_new(enum bpf_map_type type, size_t key_size, size_t value_size, size_t max_entries, uint32_t flags) {
304 union bpf_attr attr;
305
306 zero(attr);
307 attr.map_type = type;
308 attr.key_size = key_size;
309 attr.value_size = value_size;
310 attr.max_entries = max_entries;
311 attr.map_flags = flags;
312
313 return RET_NERRNO(bpf(BPF_MAP_CREATE, &attr, sizeof(attr)));
314 }
315
bpf_map_update_element(int fd,const void * key,void * value)316 int bpf_map_update_element(int fd, const void *key, void *value) {
317 union bpf_attr attr;
318
319 zero(attr);
320 attr.map_fd = fd;
321 attr.key = PTR_TO_UINT64(key);
322 attr.value = PTR_TO_UINT64(value);
323
324 return RET_NERRNO(bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)));
325 }
326
bpf_map_lookup_element(int fd,const void * key,void * value)327 int bpf_map_lookup_element(int fd, const void *key, void *value) {
328 union bpf_attr attr;
329
330 zero(attr);
331 attr.map_fd = fd;
332 attr.key = PTR_TO_UINT64(key);
333 attr.value = PTR_TO_UINT64(value);
334
335 return RET_NERRNO(bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)));
336 }
337
bpf_program_pin(int prog_fd,const char * bpffs_path)338 int bpf_program_pin(int prog_fd, const char *bpffs_path) {
339 union bpf_attr attr;
340
341 zero(attr);
342 attr.pathname = PTR_TO_UINT64((void *) bpffs_path);
343 attr.bpf_fd = prog_fd;
344
345 return RET_NERRNO(bpf(BPF_OBJ_PIN, &attr, sizeof(attr)));
346 }
347
bpf_program_get_id_by_fd(int prog_fd,uint32_t * ret_id)348 int bpf_program_get_id_by_fd(int prog_fd, uint32_t *ret_id) {
349 struct bpf_prog_info info = {};
350 int r;
351
352 assert(ret_id);
353
354 r = bpf_program_get_info_by_fd(prog_fd, &info, sizeof(info));
355 if (r < 0)
356 return r;
357
358 *ret_id = info.id;
359
360 return 0;
361 };
362
/* Serializes the cgroup attachment of 'p' to 'f' under 'key', as a line of the form
 * "<fd> <attach type> <escaped cgroup path>", where <fd> is a duplicate of the program's kernel fd that
 * was added to 'fds'. Does nothing if 'p' is NULL or not attached.
 *
 * On success the attachment is forgotten by 'p'. Returns 0 on success or if there was nothing to do,
 * a negative error otherwise. */
int bpf_program_serialize_attachment(
                FILE *f,
                FDSet *fds,
                const char *key,
                BPFProgram *p) {

        _cleanup_free_ char *escaped_path = NULL;
        int dup_fd, r;

        if (!p)
                return 0;
        if (!p->attached_path)
                return 0;

        assert(p->kernel_fd >= 0);

        escaped_path = cescape(p->attached_path);
        if (!escaped_path)
                return -ENOMEM;

        dup_fd = fdset_put_dup(fds, p->kernel_fd);
        if (dup_fd < 0)
                return log_error_errno(dup_fd, "Failed to add BPF kernel fd to serialize: %m");

        r = serialize_item_format(
                        f,
                        key,
                        "%i %s %s",
                        dup_fd,
                        bpf_cgroup_attach_type_to_string(p->attached_type),
                        escaped_path);
        if (r < 0)
                return r;

        /* From here on the attachment is 'owned' by the serialization, not by us anymore, hence forget
         * about it. This matters because of BPF's less-than-ideal lifecycle handling: a program is not
         * detached from its cgroup implicitly on close(), it has to be detached explicitly.
         * bpf_program_free() does exactly that whenever attached_path is non-NULL. Since we want the
         * attachment to survive until it is deserialized again, reset attached_path to NULL, so that
         * freeing the program does not detach it. */
        p->attached_path = mfree(p->attached_path);

        return 0;
}
406
/* Calls bpf_program_serialize_attachment() with the same 'f', 'fds' and 'key' for every program in
 * 'set'. Stops at the first failure and returns its error; returns 0 if all calls succeeded. */
int bpf_program_serialize_attachment_set(FILE *f, FDSet *fds, const char *key, Set *set) {
        BPFProgram *prog;

        SET_FOREACH(prog, set) {
                int r;

                r = bpf_program_serialize_attachment(f, fds, key, prog);
                if (r < 0)
                        return r;
        }

        return 0;
}
419
/* Parses a string of the form "<fd> <attach type> <escaped cgroup path>", as written by
 * bpf_program_serialize_attachment(), and turns it into a BPFProgram object that takes the referenced
 * fd out of 'fds' and has the attachment recorded. The program type of the new object is
 * BPF_PROG_TYPE_UNSPEC.
 *
 * On success 0 is returned, any program previously stored in *bpfp is freed and replaced by the new
 * object. On failure a negative error is returned and *bpfp is left untouched. */
int bpf_program_deserialize_attachment(const char *v, FDSet *fds, BPFProgram **bpfp) {
        _cleanup_free_ char *fd_word = NULL, *type_word = NULL, *cgroup_path = NULL;
        _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
        _cleanup_close_ int fd = -1;
        int fd_nr, attach_type, r;
        ssize_t n;

        assert(v);
        assert(bpfp);

        /* First word: the fd number */
        r = extract_first_word(&v, &fd_word, NULL, 0);
        if (r < 0)
                return r;
        if (r == 0)
                return -EINVAL;

        r = safe_atoi(fd_word, &fd_nr);
        if (r < 0)
                return r;
        if (fd_nr < 0)
                return -EBADF;

        /* Second word: the attach type */
        r = extract_first_word(&v, &type_word, NULL, 0);
        if (r < 0)
                return r;
        if (r == 0)
                return -EINVAL;

        attach_type = bpf_cgroup_attach_type_from_string(type_word);
        if (attach_type < 0)
                return attach_type;

        /* Everything that remains is the escaped cgroup path */
        n = cunescape(v, 0, &cgroup_path);
        if (n < 0)
                return n;

        /* Take the fd out of the set only after all parsing succeeded. */
        fd = fdset_remove(fds, fd_nr);
        if (fd < 0)
                return fd;

        prog = new(BPFProgram, 1);
        if (!prog)
                return -ENOMEM;

        *prog = (BPFProgram) {
                .kernel_fd = TAKE_FD(fd),
                .prog_type = BPF_PROG_TYPE_UNSPEC,
                .attached_path = TAKE_PTR(cgroup_path),
                .attached_type = attach_type,
        };

        /* bpf_program_free() is a NOP for NULL, hence no need to check whether there is something to
         * replace. */
        bpf_program_free(*bpfp);
        *bpfp = TAKE_PTR(prog);

        return 0;
}
480
/* Deserializes one attachment from 'v' via bpf_program_deserialize_attachment() and hands the resulting
 * program to set_ensure_consume() for the set *bpfsetp, using bpf_program_hash_ops.
 *
 * Returns 0 on success, a negative error if deserialization or the set operation fails. */
int bpf_program_deserialize_attachment_set(const char *v, FDSet *fds, Set **bpfsetp) {
        BPFProgram *prog = NULL;
        int r;

        assert(v);
        assert(bpfsetp);

        r = bpf_program_deserialize_attachment(v, fds, &prog);
        if (r < 0)
                return r;

        /* NOTE(review): no cleanup of 'prog' is done here on failure; going by its name
         * set_ensure_consume() takes ownership in all cases — confirm against set.h. */
        r = set_ensure_consume(bpfsetp, &bpf_program_hash_ops, prog);

        return r < 0 ? r : 0;
}
498