/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <arpa/inet.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/bpf_insn.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include "alloc-util.h"
#include "bpf-firewall.h"
#include "bpf-program.h"
#include "fd-util.h"
#include "in-addr-prefix-util.h"
#include "memory-util.h"
#include "missing_syscall.h"
#include "strv.h"
#include "unit.h"
#include "virt.h"

enum {
        MAP_KEY_PACKETS,
        MAP_KEY_BYTES,
};

enum {
        ACCESS_ALLOWED = 1,
        ACCESS_DENIED  = 2,
};

/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */

static int add_lookup_instructions(
                BPFProgram *p,
                int map_fd,
                int protocol,
                bool is_ingress,
                int verdict) {

        int r, addr_offset, addr_size;

        assert(p);
        assert(map_fd >= 0);

        switch (protocol) {

        case ETH_P_IP:
                addr_size = sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct iphdr, saddr) :
                        offsetof(struct iphdr, daddr);
                break;

        case ETH_P_IPV6:
                addr_size = 4 * sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct ip6_hdr, ip6_src.s6_addr) :
                        offsetof(struct ip6_hdr, ip6_dst.s6_addr);
                break;

        default:
                return -EAFNOSUPPORT;
        }

        do {
                /* Load the relevant address (one 32-bit word for IPv4, four words for IPv6) onto the stack
                 * and look it up in the LPM trie map. */
                struct bpf_insn insn[] = {
                        /* If skb->protocol doesn't match the protocol this block handles, skip the whole
                         * block. The jump offset will be set later. */
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),

                        /*
                         * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
                         *
                         * R1: Pointer to the skb
                         * R2: Data offset
                         * R3: Destination buffer on the stack (r10 - addr_size)
                         * R4: Number of bytes to read (addr_size)
                         */

                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
                        BPF_MOV32_IMM(BPF_REG_2, addr_offset),

                        BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),

                        BPF_MOV32_IMM(BPF_REG_4, addr_size),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),

                        /*
                         * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
                         * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
                         * has to be set to the maximum possible value.
                         *
                         * On success, the looked up value is stored in R0. For this application, the actual
                         * value doesn't matter, however; we just OR the @verdict bits into R8 if we found
                         * any matching entry.
                         */

                        BPF_LD_MAP_FD(BPF_REG_1, map_fd),
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
                        BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),

                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
                        BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;

        } while (false);

        return 0;
}

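/* Emit a single instruction that unconditionally ORs @verdict into R8, used when an allow or deny list
 * covers any address, so that no map lookup is needed. */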
static int add_instructions_for_ip_any(
                BPFProgram *p,
                int verdict) {
        int r;

        assert(p);

        const struct bpf_insn insn[] = {
                BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
        };

        r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
        if (r < 0)
                return r;

        return 0;
}

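/* Compile one BPF_PROG_TYPE_CGROUP_SKB program for one direction: a prologue that caches the skb and its
 * protocol, the access-check blocks, the verdict computation, optional packet/byte accounting, and the
 * final exit instruction. Returns 0 with *ret set to NULL if there is nothing to do. */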
static int bpf_firewall_compile_bpf(
                Unit *u,
                const char *prog_name,
                bool is_ingress,
                BPFProgram **ret,
                bool ip_allow_any,
                bool ip_deny_any) {

        const struct bpf_insn pre_insn[] = {
                /*
                 * When the eBPF program is entered, R1 contains the address of the skb.
                 * However, R1-R5 are scratch registers that are not preserved when calling
                 * into kernel functions, so we need to save anything that's supposed to
                 * stay around to R6-R9. Save the skb to R6.
                 */
                BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),

                /*
                 * Although we cannot access the skb data directly from eBPF programs used in this
                 * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
                 * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
                 * for later use.
                 */
                BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),

                /*
                 * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
                 * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
                 */
                BPF_MOV32_IMM(BPF_REG_8, 0),
        };

        /*
         * The access checkers compiled for the configured allow and deny lists
         * write to R8 at runtime. The following code prepares for an early exit that
         * skips the accounting if the packet is denied.
         *
         * R0 = 1
         * if (R8 == ACCESS_DENIED)
         *     R0 = 0
         *
         * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
         * is allowed to pass.
         */
        const struct bpf_insn post_insn[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
                BPF_MOV64_IMM(BPF_REG_0, 0),
        };

        _cleanup_(bpf_program_freep) BPFProgram *p = NULL;
        int accounting_map_fd, r;
        bool access_enabled;

        assert(u);
        assert(ret);

        accounting_map_fd = is_ingress ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;

        access_enabled =
                u->ipv4_allow_map_fd >= 0 ||
                u->ipv6_allow_map_fd >= 0 ||
                u->ipv4_deny_map_fd >= 0 ||
                u->ipv6_deny_map_fd >= 0 ||
                ip_allow_any ||
                ip_deny_any;

        if (accounting_map_fd < 0 && !access_enabled) {
                *ret = NULL;
                return 0;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, prog_name, &p);
        if (r < 0)
                return r;

        r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
        if (r < 0)
                return r;

        if (access_enabled) {
                /*
                 * The simple rule this function translates into eBPF instructions is:
                 *
                 * - Access will be granted when an address matches an entry in @list_allow
                 * - Otherwise, access will be denied when an address matches an entry in @list_deny
                 * - Otherwise, access will be granted
                 */

                if (u->ipv4_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv4_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (ip_allow_any) {
                        r = add_instructions_for_ip_any(p, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (ip_deny_any) {
                        r = add_instructions_for_ip_any(p, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }
        }

        r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
        if (r < 0)
                return r;

        if (accounting_map_fd >= 0) {
                struct bpf_insn insn[] = {
                        /*
                         * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
                         * The jump label will be fixed up later.
                         */
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),

                        /* Count packets */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* atomic: *(u64 *)r0 += r1 */

                        /* Count bytes */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* atomic: *(u64 *)r0 += r1 */

                        /* Allow the packet to pass */
                        BPF_MOV64_IMM(BPF_REG_0, 1),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        }

        do {
                /*
                 * Exit from the eBPF program; R0 contains the verdict.
                 * 0 means the packet is denied, 1 means the packet may pass.
                 */
                const struct bpf_insn insn[] = {
                        BPF_EXIT_INSN()
                };

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        } while (false);

        *ret = TAKE_PTR(p);

        return 0;
}

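/* Count how many IPv4 and IPv6 prefixes are in the set, so that the LPM trie maps can be sized
 * accordingly. */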
static int bpf_firewall_count_access_items(Set *prefixes, size_t *n_ipv4, size_t *n_ipv6) {
        struct in_addr_prefix *a;

        assert(n_ipv4);
        assert(n_ipv6);

        SET_FOREACH(a, prefixes)
                switch (a->family) {

                case AF_INET:
                        (*n_ipv4)++;
                        break;

                case AF_INET6:
                        (*n_ipv6)++;
                        break;

                default:
                        return -EAFNOSUPPORT;
                }

        return 0;
}

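/* Insert each prefix into the LPM trie map of the matching address family, mapping it to @verdict. */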
static int bpf_firewall_add_access_items(
                Set *prefixes,
                int ipv4_map_fd,
                int ipv6_map_fd,
                int verdict) {

        struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
        struct in_addr_prefix *a;
        uint64_t value = verdict;
        int r;

        key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
        key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);

        SET_FOREACH(a, prefixes)
                switch (a->family) {

                case AF_INET:
                        key_ipv4->prefixlen = a->prefixlen;
                        memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));

                        r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
                        if (r < 0)
                                return r;

                        break;

                case AF_INET6:
                        key_ipv6->prefixlen = a->prefixlen;
                        memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));

                        r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
                        if (r < 0)
                                return r;

                        break;

                default:
                        return -EAFNOSUPPORT;
                }

        return 0;
}

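/* Walk the unit and all its parent slices, reduce the configured address lists, and build the LPM trie
 * maps for one verdict. If any of the lists matches all addresses, no maps are created and *ret_has_any
 * is set instead. */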
static int bpf_firewall_prepare_access_maps(
                Unit *u,
                int verdict,
                int *ret_ipv4_map_fd,
                int *ret_ipv6_map_fd,
                bool *ret_has_any) {

        _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1;
        size_t n_ipv4 = 0, n_ipv6 = 0;
        Unit *p;
        int r;

        assert(ret_ipv4_map_fd);
        assert(ret_ipv6_map_fd);
        assert(ret_has_any);

        for (p = u; p; p = UNIT_GET_SLICE(p)) {
                CGroupContext *cc;
                Set *prefixes;
                bool *reduced;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                prefixes = verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny;
                reduced = verdict == ACCESS_ALLOWED ? &cc->ip_address_allow_reduced : &cc->ip_address_deny_reduced;

                if (!*reduced) {
                        r = in_addr_prefixes_reduce(prefixes);
                        if (r < 0)
                                return r;

                        *reduced = true;
                }

                r = bpf_firewall_count_access_items(prefixes, &n_ipv4, &n_ipv6);
                if (r < 0)
                        return r;

                /* Skip making the LPM trie map in cases where we are using "any" in order to hack around
                 * needing CAP_SYS_ADMIN for allocating an LPM trie map. */
                if (in_addr_prefixes_is_any(prefixes)) {
                        *ret_has_any = true;
                        return 0;
                }
        }

        if (n_ipv4 > 0) {
                ipv4_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
                                sizeof(uint64_t),
                                n_ipv4,
                                BPF_F_NO_PREALLOC);
                if (ipv4_map_fd < 0)
                        return ipv4_map_fd;
        }

        if (n_ipv6 > 0) {
                ipv6_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
                                sizeof(uint64_t),
                                n_ipv6,
                                BPF_F_NO_PREALLOC);
                if (ipv6_map_fd < 0)
                        return ipv6_map_fd;
        }

        for (p = u; p; p = UNIT_GET_SLICE(p)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
                                                  ipv4_map_fd, ipv6_map_fd, verdict);
                if (r < 0)
                        return r;
        }

        *ret_ipv4_map_fd = TAKE_FD(ipv4_map_fd);
        *ret_ipv6_map_fd = TAKE_FD(ipv6_map_fd);
        *ret_has_any = false;
        return 0;
}

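/* Create (or tear down) the ingress and egress accounting maps: arrays of two uint64_t counters, indexed
 * by MAP_KEY_PACKETS and MAP_KEY_BYTES. Existing maps are reused so that counters survive a firewall
 * recompilation. */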
static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
        int r;

        assert(u);
        assert(fd_ingress);
        assert(fd_egress);

        if (enabled) {
                if (*fd_ingress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_ingress = r;
                }

                if (*fd_egress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_egress = r;
                }

        } else {
                *fd_ingress = safe_close(*fd_ingress);
                *fd_egress = safe_close(*fd_egress);

                zero(u->ip_accounting_extra);
        }

        return 0;
}

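/* Compile fresh ingress and egress firewall programs for the unit from its current cgroup configuration.
 * The results are stored in u->ip_bpf_ingress/u->ip_bpf_egress; nothing is attached to the cgroup yet,
 * that happens in bpf_firewall_install(). */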
int bpf_firewall_compile(Unit *u) {
        const char *ingress_name = NULL, *egress_name = NULL;
        bool ip_allow_any = false, ip_deny_any = false;
        CGroupContext *cc;
        int r, supported;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "BPF firewalling not supported on this manager, proceeding without.");
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
                /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
                 * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
                 * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
                 * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
                 * all, either. */
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");

        /* If the BPF_F_ALLOW_MULTI flag is supported, program names are supported too (both were added in
         * the v4.15 kernel). */
        if (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI) {
                ingress_name = "sd_fw_ingress";
                egress_name = "sd_fw_egress";
        }

        /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
         * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
         * configuration, but we don't flush out the accounting unnecessarily. */

        u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress);
        u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress);

        u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
        u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);

        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);

        if (u->type != UNIT_SLICE) {
                /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
                 * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
                 * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this
                 * means that all configured IP access rules *will* take effect on processes, even though we never
                 * compile them for inner nodes. */

                r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd, &ip_allow_any);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Preparation of eBPF allow maps failed: %m");

                r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd, &ip_deny_any);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Preparation of eBPF deny maps failed: %m");
        }

        r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
        if (r < 0)
                return log_unit_error_errno(u, r, "Preparation of eBPF accounting maps failed: %m");

        r = bpf_firewall_compile_bpf(u, ingress_name, true, &u->ip_bpf_ingress, ip_allow_any, ip_deny_any);
        if (r < 0)
                return log_unit_error_errno(u, r, "Compilation of ingress BPF program failed: %m");

        r = bpf_firewall_compile_bpf(u, egress_name, false, &u->ip_bpf_egress, ip_allow_any, ip_deny_any);
        if (r < 0)
                return log_unit_error_errno(u, r, "Compilation of egress BPF program failed: %m");

        return 0;
}

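/* Replace the contents of *set with CGROUP SKB programs loaded from the given BPF file system paths. */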
static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set) {
        assert(u);
        assert(set);

        set_clear(*set);

        STRV_FOREACH(bpf_fs_path, filter_paths) {
                _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
                int r;

                r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &prog);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Can't allocate CGROUP SKB BPF program: %m");

                r = bpf_program_load_from_bpf_fs(prog, *bpf_fs_path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Loading of BPF program %s failed: %m", *bpf_fs_path);

                r = set_ensure_consume(set, &bpf_program_hash_ops, TAKE_PTR(prog));
                if (r < 0)
                        return log_unit_error_errno(u, r, "Can't add program to BPF program set: %m");
        }

        return 0;
}

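/* Load the custom BPF programs configured via IPIngressFilterPath=/IPEgressFilterPath= from the BPF file
 * system. */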
int bpf_firewall_load_custom(Unit *u) {
        CGroupContext *cc;
        int r, supported;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return 0;

        if (!(cc->ip_filters_ingress || cc->ip_filters_egress))
                return 0;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;

        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "BPF_F_ALLOW_MULTI not supported on this manager, cannot attach custom BPF programs.");

        r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &u->ip_bpf_custom_ingress);
        if (r < 0)
                return r;
        r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &u->ip_bpf_custom_egress);
        if (r < 0)
                return r;

        return 0;
}

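/* Attach all programs in *set to the cgroup and move them into *set_installed, so that the installed set
 * always reflects what is actually attached. */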
static int attach_custom_bpf_progs(Unit *u, const char *path, int attach_type, Set **set, Set **set_installed) {
        BPFProgram *prog;
        int r;

        assert(u);

        set_clear(*set_installed);
        r = set_ensure_allocated(set_installed, &bpf_program_hash_ops);
        if (r < 0)
                return log_oom();

        SET_FOREACH_MOVE(prog, *set_installed, *set) {
                r = bpf_program_cgroup_attach(prog, attach_type, path, BPF_F_ALLOW_MULTI);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Attaching custom BPF program to cgroup %s failed: %m", path);
        }
        return 0;
}

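/* Attach the previously compiled firewall programs and any custom programs to the unit's cgroup,
 * replacing whatever was installed before. */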
int bpf_firewall_install(Unit *u) {
        _cleanup_(bpf_program_freep) BPFProgram *ip_bpf_ingress_uninstall = NULL, *ip_bpf_egress_uninstall = NULL;
        _cleanup_free_ char *path = NULL;
        CGroupContext *cc;
        int r, supported;
        uint32_t flags;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;
        if (!u->cgroup_path)
                return -EINVAL;
        if (!u->cgroup_realized)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "BPF firewalling not supported on this manager, proceeding without.");
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
            (!set_isempty(u->ip_bpf_custom_ingress) || !set_isempty(u->ip_bpf_custom_egress)))
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "BPF_F_ALLOW_MULTI not supported on this manager, cannot attach custom BPF programs.");

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to determine cgroup path: %m");

        flags = supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI ? BPF_F_ALLOW_MULTI : 0;

        if (FLAGS_SET(flags, BPF_F_ALLOW_MULTI)) {
                /* If we have BPF_F_ALLOW_MULTI, then let's clear the fields, but destroy the programs only
                 * after attaching the new programs, so that there's no time window where neither program is
                 * attached. (There will be a time window where both are attached, but that's OK, since this
                 * is a security feature where we'd rather lock down too much than too little.) */
                ip_bpf_egress_uninstall = TAKE_PTR(u->ip_bpf_egress_installed);
                ip_bpf_ingress_uninstall = TAKE_PTR(u->ip_bpf_ingress_installed);
        } else {
                /* If we don't have BPF_F_ALLOW_MULTI then unref the old BPF programs (which will implicitly
                 * detach them) right before attaching the new program, to minimize the time window when we
                 * don't account for IP traffic. */
                u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed);
                u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed);
        }

        if (u->ip_bpf_egress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Attaching egress BPF program to cgroup %s failed: %m", path);

                /* Remember that this BPF program is installed now. */
                u->ip_bpf_egress_installed = TAKE_PTR(u->ip_bpf_egress);
        }

        if (u->ip_bpf_ingress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Attaching ingress BPF program to cgroup %s failed: %m", path);

                u->ip_bpf_ingress_installed = TAKE_PTR(u->ip_bpf_ingress);
        }

        /* And now, definitely get rid of the old programs, and detach them */
        ip_bpf_egress_uninstall = bpf_program_free(ip_bpf_egress_uninstall);
        ip_bpf_ingress_uninstall = bpf_program_free(ip_bpf_ingress_uninstall);

        r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &u->ip_bpf_custom_egress, &u->ip_bpf_custom_egress_installed);
        if (r < 0)
                return r;

        r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &u->ip_bpf_custom_ingress, &u->ip_bpf_custom_ingress_installed);
        if (r < 0)
                return r;

        return 0;
}

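/* Read the packet and/or byte counters from an accounting map. Either output pointer may be NULL. */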
int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
        uint64_t key, packets;
        int r;

        if (map_fd < 0)
                return -EBADF;

        if (ret_packets) {
                key = MAP_KEY_PACKETS;
                r = bpf_map_lookup_element(map_fd, &key, &packets);
                if (r < 0)
                        return r;
        }

        if (ret_bytes) {
                key = MAP_KEY_BYTES;
                r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
                if (r < 0)
                        return r;
        }

        if (ret_packets)
                *ret_packets = packets;

        return 0;
}

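/* Reset both counters of an accounting map to zero. */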
int bpf_firewall_reset_accounting(int map_fd) {
        uint64_t key, value = 0;
        int r;

        if (map_fd < 0)
                return -EBADF;

        key = MAP_KEY_PACKETS;
        r = bpf_map_update_element(map_fd, &key, &value);
        if (r < 0)
                return r;

        key = MAP_KEY_BYTES;
        return bpf_map_update_element(map_fd, &key, &value);
}

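/* Stores the negative errno of the first failed support probe, for emit_bpf_firewall_warning() to
 * report. */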
static int bpf_firewall_unsupported_reason = 0;

int bpf_firewall_supported(void) {
        const struct bpf_insn trivial[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_EXIT_INSN()
        };

        _cleanup_(bpf_program_freep) BPFProgram *program = NULL;
        static int supported = -1;
        union bpf_attr attr;
        int r;

        /* Checks whether BPF firewalling is supported. For this, we check the following things:
         *
         * - whether the unified hierarchy is being used
         * - the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
         * - the BPF implementation in the kernel supports the BPF_PROG_DETACH call, which we require
         */
        if (supported >= 0)
                return supported;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
        if (r == 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN),
                                        "Not running with unified cgroups, BPF firewalling is not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* prog_name is NULL since program names are only supported starting with the v4.15 kernel. */
        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &program);
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_load_kernel(program, NULL, 0);
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
         * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
         * program if we can't do a thing with it later?
         *
         * We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if
         * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
         * parameters are validated however, and that'll fail with EBADF then. */

        // FIXME: Clang doesn't 0-pad with structured initialization, causing
        // the kernel to reject the bpf_attr as invalid. See:
        // https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65
        // Ideally it should behave like GCC, so that we can remove these workarounds.
        zero(attr);
        attr.attach_type = BPF_CGROUP_INET_EGRESS;
        attr.target_fd = -1;
        attr.attach_bpf_fd = -1;

        if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) {
                if (errno != EBADF) {
                        bpf_firewall_unsupported_reason =
                                log_debug_errno(errno, "Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m");
                        return supported = BPF_FIREWALL_UNSUPPORTED;
                }

                /* YAY! */
        } else {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(SYNTHETIC_ERRNO(EBADE),
                                        "Wut? Kernel accepted our invalid BPF_PROG_DETACH call? "
                                        "Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
         * (which was added in kernel 4.15). We use similar logic as before, but this time we use the BPF_PROG_ATTACH
         * bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll
         * get EINVAL if it's not supported, and EBADF as before if it is available.
         * Use the probe result as the indicator that program names are also supported, since both were
         * added in kernel 4.15. */

        zero(attr);
        attr.attach_type = BPF_CGROUP_INET_EGRESS;
        attr.target_fd = -1;
        attr.attach_bpf_fd = -1;
        attr.attach_flags = BPF_F_ALLOW_MULTI;

        if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
                if (errno == EBADF) {
                        log_debug_errno(errno, "Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
                        return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
                }

                if (errno == EINVAL)
                        log_debug_errno(errno, "Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
                else
                        log_debug_errno(errno, "Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");

                return supported = BPF_FIREWALL_SUPPORTED;
        } else {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(SYNTHETIC_ERRNO(EBADE),
                                        "Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? "
                                        "Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }
}

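/* Warn (once per manager) when a unit configures an IP firewall that cannot be enforced. */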
void emit_bpf_firewall_warning(Unit *u) {
        static bool warned = false;

        assert(u);
        assert(u->manager);

        if (warned || MANAGER_IS_TEST_RUN(u->manager))
                return;

        bool quiet = ERRNO_IS_PRIVILEGE(bpf_firewall_unsupported_reason) && detect_container() > 0;

        log_unit_full_errno(u, quiet ? LOG_DEBUG : LOG_WARNING, bpf_firewall_unsupported_reason,
                            "unit configures an IP firewall, but %s.\n"
                            "(This warning is only shown for the first unit using IP firewalling.)",
                            getuid() != 0 ? "not running as root" :
                            "the local system does not support BPF/cgroup firewalling");
        warned = true;
}

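/* Close all firewall-related maps and free all firewall programs owned by the unit. */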
void bpf_firewall_close(Unit *u) {
        assert(u);

        u->ip_accounting_ingress_map_fd = safe_close(u->ip_accounting_ingress_map_fd);
        u->ip_accounting_egress_map_fd = safe_close(u->ip_accounting_egress_map_fd);

        u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
        u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);
        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);

        u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress);
        u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed);
        u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress);
        u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed);

        u->ip_bpf_custom_ingress = set_free(u->ip_bpf_custom_ingress);
        u->ip_bpf_custom_egress = set_free(u->ip_bpf_custom_egress);
        u->ip_bpf_custom_ingress_installed = set_free(u->ip_bpf_custom_ingress_installed);
        u->ip_bpf_custom_egress_installed = set_free(u->ip_bpf_custom_egress_installed);
}