1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <arpa/inet.h>
4 #include <endian.h>
5 #include <errno.h>
6 #include <stddef.h>
7 #include <string.h>
8 #include <linux/netfilter/nf_tables.h>
9 #include <linux/netfilter/nf_nat.h>
10 #include <linux/netfilter_ipv4.h>
11 #include <netinet/ip.h>
12 #include <netinet/ip6.h>
13
14 #include "sd-netlink.h"
15
16 #include "alloc-util.h"
17 #include "firewall-util.h"
18 #include "firewall-util-private.h"
19 #include "in-addr-util.h"
20 #include "macro.h"
21 #include "socket-util.h"
22 #include "time-util.h"
23
/* Names of the nftables objects used throughout this file: the map consulted by the DNAT rules, the
 * table that owns every chain/set/map, and the set consulted by the masquerade rule. */
#define NFT_SYSTEMD_DNAT_MAP_NAME "map_port_ipport"
#define NFT_SYSTEMD_TABLE_NAME "io.systemd.nat"
#define NFT_SYSTEMD_MASQ_SET_NAME "masq_saddr"

/* Timeout handed to sd_netlink_read() when waiting for the reply to a sent message. */
#define NFNL_DEFAULT_TIMEOUT_USECS (1ULL * USEC_PER_SEC)

/* Byte offset into the transport header from which the 16 bit destination port is loaded. */
#define UDP_DPORT_OFFSET 2
31
/* Sends all messages in one go and then reads the replies.
 *
 * Replies are only read for messages[1] .. messages[msgcount - 2]; the first and the last message
 * (the callers in this file put the batch begin/end markers there) are not waited for.
 *
 * Returns a negative errno-style value if sending failed. Otherwise returns the first negative value
 * reported by sd_netlink_read(), or 0 if no read failed. All replies are read even after a failure. */
static int nfnl_netlink_sendv(
                sd_netlink *nfnl,
                sd_netlink_message *messages[static 1],
                size_t msgcount) {

        _cleanup_free_ uint32_t *serial = NULL;
        int first_error = 0, r;
        size_t last;

        assert(nfnl);
        assert(messages);
        assert(msgcount > 0);

        r = sd_netlink_sendv(nfnl, messages, msgcount, &serial);
        if (r < 0)
                return r;

        last = msgcount - 1;
        for (size_t idx = 1; idx < last; idx++) {
                /* If the reply is an error message, the embedded errno is returned here. */
                int k = sd_netlink_read(nfnl, serial[idx], NFNL_DEFAULT_TIMEOUT_USECS, NULL);

                /* Remember only the first failure, but keep reading the remaining replies. */
                if (first_error == 0 && k < 0)
                        first_error = k;
        }

        return first_error;
}
60
/* Begins a new expression called 'name' in message m: opens an NFTA_LIST_ELEM array entry, appends
 * the expression name and opens the NFTA_EXPR_DATA container for that expression.
 *
 * On success both containers are left open; the caller appends the expression attributes and then
 * closes them again. Returns the result of the first failing call, otherwise that of the last one. */
static int nfnl_add_open_expr_container(sd_netlink_message *m, const char *name) {
        int r;

        r = sd_netlink_message_open_array(m, NFTA_LIST_ELEM);
        if (r >= 0)
                r = sd_netlink_message_append_string(m, NFTA_EXPR_NAME, name);
        if (r < 0)
                return r;

        return sd_netlink_message_open_container_union(m, NFTA_EXPR_DATA, name);
}
74
/* Appends a complete "fib" expression to m with the given flags, result type and destination
 * register. All values are written as big-endian 32 bit attributes.
 *
 * Returns the result of the first failing call, otherwise that of the final container close. */
static int nfnl_add_expr_fib(sd_netlink_message *m, uint32_t nft_fib_flags,
                             enum nft_fib_result result,
                             enum nft_registers dreg) {
        int r;

        r = nfnl_add_open_expr_container(m, "fib");
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_FIB_FLAGS, htobe32(nft_fib_flags));
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_FIB_RESULT, htobe32(result));
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_FIB_DREG, htobe32(dreg));
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return r;
}
100
/* Appends a complete "meta" expression to m that loads the meta key 'key' into register 'dreg'.
 *
 * Returns the result of the first failing call, otherwise that of the final container close. */
static int nfnl_add_expr_meta(sd_netlink_message *m, enum nft_meta_keys key,
                              enum nft_registers dreg) {
        int r;

        r = nfnl_add_open_expr_container(m, "meta");
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_META_KEY, htobe32(key));
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_META_DREG, htobe32(dreg));
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return r;
}
123
/* Appends a complete "payload" expression to m: load 'len' bytes found at 'offset' relative to the
 * header selected by 'pb' into register 'dreg'.
 *
 * Returns the result of the first failing call, otherwise that of the final container close. */
static int nfnl_add_expr_payload(sd_netlink_message *m, enum nft_payload_bases pb,
                                 uint32_t offset, uint32_t len, enum nft_registers dreg) {
        int r;

        r = nfnl_add_open_expr_container(m, "payload");
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_DREG, htobe32(dreg));
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_BASE, htobe32(pb));
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_OFFSET, htobe32(offset));
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_LEN, htobe32(len));
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return r;
}
150
/* Common first half of a "lookup" expression: opens the expression and appends the name of the
 * set/map to consult plus the source register holding the key.
 *
 * The expression containers are deliberately left open so that callers may append further
 * attributes before closing them. */
static int nfnl_add_expr_lookup_set_data(sd_netlink_message *m, const char *set_name,
                                         enum nft_registers sreg) {
        int r;

        r = nfnl_add_open_expr_container(m, "lookup");
        if (r >= 0)
                r = sd_netlink_message_append_string(m, NFTA_LOOKUP_SET, set_name);
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_LOOKUP_SREG, htobe32(sreg));

        return r;
}
165
/* Appends a complete "lookup" expression to m that looks up the key held in 'sreg' in the set
 * 'set_name'. No destination register is specified. */
static int nfnl_add_expr_lookup_set(sd_netlink_message *m, const char *set_name,
                                    enum nft_registers sreg) {
        int r;

        r = nfnl_add_expr_lookup_set_data(m, set_name, sreg);
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return r;
}
179
/* Appends a complete "lookup" expression to m that looks up the key held in 'sreg' in the map
 * 'set_name' and names 'dreg' as the destination register for the result. */
static int nfnl_add_expr_lookup_map(sd_netlink_message *m, const char *set_name,
                                    enum nft_registers sreg, enum nft_registers dreg) {
        int r;

        r = nfnl_add_expr_lookup_set_data(m, set_name, sreg);
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_LOOKUP_DREG, htobe32(dreg));
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return r;
}
198
/* Appends a nested attribute 'attr' to m which carries 'dlen' bytes of 'data' as NFTA_DATA_VALUE. */
static int nfnl_add_expr_data(sd_netlink_message *m, int attr, const void *data, uint32_t dlen) {
        int r;

        r = sd_netlink_message_open_container(m, attr);
        if (r >= 0)
                r = sd_netlink_message_append_data(m, NFTA_DATA_VALUE, data, dlen);
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* attr */

        return r;
}
211
/* Appends the comparison operand of a "cmp" expression: 'dlen' bytes of 'data' wrapped in an
 * NFTA_CMP_DATA attribute. */
static int nfnl_add_expr_cmp_data(sd_netlink_message *m, const void *data, uint32_t dlen) {
        int r = nfnl_add_expr_data(m, NFTA_CMP_DATA, data, dlen);

        return r;
}
215
/* Appends a complete "cmp" expression to m: compare the contents of register 'sreg' against the
 * 'dlen' bytes at 'data' using the operator 'cmp_op'. */
static int nfnl_add_expr_cmp(sd_netlink_message *m, enum nft_cmp_ops cmp_op,
                             enum nft_registers sreg, const void *data, uint32_t dlen) {
        int r;

        r = nfnl_add_open_expr_container(m, "cmp");
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_CMP_OP, htobe32(cmp_op));
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_CMP_SREG, htobe32(sreg));
        if (r >= 0)
                r = nfnl_add_expr_cmp_data(m, data, dlen);
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return r;
}
240
/* Appends a complete "bitwise" expression to m operating on 'len' bytes: source register 'sreg',
 * destination register 'dreg', with 'and' supplied as the mask operand and 'xor' as the xor operand.
 * Both operands must point to 'len' bytes. */
static int nfnl_add_expr_bitwise(sd_netlink_message *m,
                                 enum nft_registers sreg,
                                 enum nft_registers dreg,
                                 const void *and,
                                 const void *xor, uint32_t len) {
        int r;

        r = nfnl_add_open_expr_container(m, "bitwise");
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_BITWISE_SREG, htobe32(sreg));
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_BITWISE_DREG, htobe32(dreg));
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_BITWISE_LEN, htobe32(len));
        if (r >= 0)
                r = nfnl_add_expr_data(m, NFTA_BITWISE_MASK, and, len);
        if (r >= 0)
                r = nfnl_add_expr_data(m, NFTA_BITWISE_XOR, xor, len);
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return r;
}
275
/* Appends a complete "nat" expression of type NFT_NAT_DNAT to m for the given address family. The
 * new destination address is taken from register 'areg', the new port from register 'preg'. */
static int nfnl_add_expr_dnat(sd_netlink_message *m,
                              int family,
                              enum nft_registers areg,
                              enum nft_registers preg) {
        int r;

        r = nfnl_add_open_expr_container(m, "nat");
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_NAT_TYPE, htobe32(NFT_NAT_DNAT));
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_NAT_FAMILY, htobe32(family));
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_NAT_REG_ADDR_MIN, htobe32(areg));
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_NAT_REG_PROTO_MIN, htobe32(preg));
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return r;
}
306
/* Appends a "masq" expression to m. Only the expression name is written; no NFTA_EXPR_DATA
 * container is added for it. */
static int nfnl_add_expr_masq(sd_netlink_message *m) {
        int r;

        r = sd_netlink_message_open_array(m, NFTA_LIST_ELEM);
        if (r >= 0)
                r = sd_netlink_message_append_string(m, NFTA_EXPR_NAME, "masq");
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return r;
}
320
/* Builds the rule message "saddr in @masq_saddr -> masquerade" for the given chain.
 *
 * iptables equivalent:
 * -t nat -A POSTROUTING -p protocol -s source/pflen -o out_interface -d destination/pflen -j MASQUERADE
 *
 * On success the new message is returned in *ret and 0 is returned; on failure a negative value is
 * returned and *ret is left untouched. */
static int sd_nfnl_message_new_masq_rule(sd_netlink *nfnl, sd_netlink_message **ret, int family,
                                         const char *chain) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
        uint32_t saddr_offset, saddr_len;
        int r;

        r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain);
        if (r < 0)
                return r;

        r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS);
        if (r < 0)
                return r;

        /* Where the source address lives in the network header and how large it is. For IPv4 it
         * fits into reg1, for IPv6 it spans reg1..reg4. */
        if (family == AF_INET) {
                saddr_offset = offsetof(struct iphdr, saddr);
                saddr_len = sizeof(uint32_t);
        } else {
                saddr_offset = offsetof(struct ip6_hdr, ip6_src.s6_addr);
                saddr_len = sizeof(struct in6_addr);
        }

        /* 1st statement: ip saddr @masq_saddr. Load the source address ... */
        r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, saddr_offset, saddr_len, NFT_REG32_01);
        /* ... and look it up in the @masq_saddr set. */
        if (r >= 0)
                r = nfnl_add_expr_lookup_set(m, NFT_SYSTEMD_MASQ_SET_NAME, NFT_REG32_01);
        /* 2nd statement: masq. Only executed by the kernel if the preceding lookup was successful. */
        if (r >= 0)
                r = nfnl_add_expr_masq(m);
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */
        if (r < 0)
                return r;

        *ret = TAKE_PTR(m);
        return 0;
}
362
/* Builds the DNAT rule message for the prerouting chain.
 *
 * iptables equivalent:
 * -t nat -A PREROUTING -p protocol --dport local_port -i in_interface -s source/pflen
 * -d destination/pflen -j DNAT --to-destination remote_addr:remote_port
 *
 * On success the new message is returned in *ret and 0 is returned; on failure a negative value is
 * returned and *ret is left untouched. */
static int sd_nfnl_message_new_dnat_rule_pre(sd_netlink *nfnl, sd_netlink_message **ret, int family,
                                             const char *chain) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
        enum nft_registers port_reg;
        uint32_t local = RTN_LOCAL;
        int r;

        r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain);
        if (r < 0)
                return r;

        r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS);
        if (r < 0)
                return r;

        /* fib daddr type local: fetch the address type of the destination into reg1 ... */
        r = nfnl_add_expr_fib(m, NFTA_FIB_F_DADDR, NFT_FIB_RESULT_ADDRTYPE, NFT_REG32_01);
        /* ... and require it to be RTN_LOCAL. */
        if (r >= 0)
                r = nfnl_add_expr_cmp(m, NFT_CMP_EQ, NFT_REG32_01, &local, sizeof(local));

        /* Assemble the lookup key 'l4proto . dport' (e.g. 'tcp . 22'): protocol in reg1 ... */
        if (r >= 0)
                r = nfnl_add_expr_meta(m, NFT_META_L4PROTO, NFT_REG32_01);
        /* ... destination port in reg2. */
        if (r >= 0)
                r = nfnl_add_expr_payload(m, NFT_PAYLOAD_TRANSPORT_HEADER, UDP_DPORT_OFFSET,
                                          sizeof(uint16_t), NFT_REG32_02);

        /* Look the key up in the DNAT map. The mapped address and port are stored starting at
         * reg1, overwriting the key. */
        if (r >= 0)
                r = nfnl_add_expr_lookup_map(m, NFT_SYSTEMD_DNAT_MAP_NAME, NFT_REG32_01, NFT_REG32_01);
        if (r < 0)
                return r;

        /* The port follows the address: reg2 after a one-register IPv4 address, reg5 for IPv6. */
        port_reg = family == AF_INET ? NFT_REG32_02 : NFT_REG32_05;
        r = nfnl_add_expr_dnat(m, family, NFT_REG32_01, port_reg);
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */
        if (r < 0)
                return r;

        *ret = TAKE_PTR(m);
        return 0;
}
419
/* Builds the DNAT rule message for the output chain, i.e. for locally generated traffic. Compared
 * to the prerouting variant the rule first checks that the destination is not a loopback address
 * and that the outgoing interface has ifindex 1 (lo).
 *
 * On success the new message is returned in *ret and 0 is returned; on failure a negative value is
 * returned and *ret is left untouched. */
static int sd_nfnl_message_new_dnat_rule_out(sd_netlink *nfnl, sd_netlink_message **ret,
                                             int family, const char *chain) {
        static const uint32_t zero = 0, one = 1;

        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
        enum nft_registers port_reg;
        int r;

        r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain);
        if (r < 0)
                return r;

        r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS);
        if (r < 0)
                return r;

        /* 1st statement: exclude loopback destinations, i.e. ip daddr != 127.0.0.0/8 resp.
         * ip6 daddr != ::1. */
        if (family == AF_INET) {
                uint32_t lonet = htobe32(UINT32_C(0x7F000000)), lomask = htobe32(UINT32_C(0xff000000));

                /* Load the destination address into reg1 ... */
                r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct iphdr, daddr),
                                          sizeof(lonet), NFT_REG32_01);
                /* ... reduce it to its /8 prefix ... */
                if (r >= 0)
                        r = nfnl_add_expr_bitwise(m, NFT_REG32_01, NFT_REG32_01, &lomask, &zero, sizeof(lomask));
                /* ... and compare the result with 127/8. */
                if (r >= 0)
                        r = nfnl_add_expr_cmp(m, NFT_CMP_NEQ, NFT_REG32_01, &lonet, sizeof(lonet));
        } else {
                struct in6_addr loaddr = IN6ADDR_LOOPBACK_INIT;

                r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct ip6_hdr, ip6_dst.s6_addr),
                                          sizeof(loaddr), NFT_REG32_01);
                if (r >= 0)
                        r = nfnl_add_expr_cmp(m, NFT_CMP_NEQ, NFT_REG32_01, &loaddr, sizeof(loaddr));
        }
        if (r < 0)
                return r;

        /* 2nd statement: meta oif lo. Load the outgoing ifindex ... */
        r = nfnl_add_expr_meta(m, NFT_META_OIF, NFT_REG32_01);
        /* ... and compare it to the ifindex of lo (1). */
        if (r >= 0)
                r = nfnl_add_expr_cmp(m, NFT_CMP_EQ, NFT_REG32_01, &one, sizeof(one));

        /* 3rd statement: meta l4proto . th dport dnat ip . port to map @map_port_ipport.
         * The protocol goes into reg1 ... */
        if (r >= 0)
                r = nfnl_add_expr_meta(m, NFT_META_L4PROTO, NFT_REG32_01);
        /* ... and the port number into reg2. */
        if (r >= 0)
                r = nfnl_add_expr_payload(m, NFT_PAYLOAD_TRANSPORT_HEADER, UDP_DPORT_OFFSET,
                                          sizeof(uint16_t), NFT_REG32_02);

        /* Use reg1 and reg2 as key to retrieve the new destination address and port number. The
         * key registers are clobbered and hold the new address/port afterwards. */
        if (r >= 0)
                r = nfnl_add_expr_lookup_map(m, NFT_SYSTEMD_DNAT_MAP_NAME, NFT_REG32_01, NFT_REG32_01);
        if (r < 0)
                return r;

        /* 4th statement: dnat the connection to the address/port retrieved by the lookup. */
        port_reg = family == AF_INET ? NFT_REG32_02 : NFT_REG32_05;
        r = nfnl_add_expr_dnat(m, family, NFT_REG32_01, port_reg);
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */
        if (r < 0)
                return r;

        *ret = TAKE_PTR(m);
        return 0;
}
508
/* Creates the message that adds the set 'set_name' with key type 'type' and key length 'klen' to
 * the systemd table. NFTA_SET_FLAGS is only appended when 'flags' is non-zero.
 *
 * On success the message is returned in *ret together with the (non-negative) result of the last
 * append; on failure a negative value is returned and *ret is left untouched. */
static int nft_new_set(struct sd_netlink *nfnl,
                       sd_netlink_message **ret,
                       int family, const char *set_name,
                       uint32_t set_id,
                       uint32_t flags, uint32_t type, uint32_t klen) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
        int r;

        r = sd_nfnl_nft_message_new_set(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, set_name, set_id, klen);
        if (r >= 0 && flags != 0)
                r = sd_netlink_message_append_u32(m, NFTA_SET_FLAGS, htobe32(flags));
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_SET_KEY_TYPE, htobe32(type));
        if (r < 0)
                return r;

        *ret = TAKE_PTR(m);
        return r;
}
534
/* Creates the message that adds the map 'set_name' to the systemd table: a set with NFT_SET_MAP
 * added to its flags, plus the type and length of the mapped data.
 *
 * On success the message is returned in *ret and 0 is returned; on failure a negative value is
 * returned and *ret is left untouched. */
static int nft_new_map(struct sd_netlink *nfnl,
                       sd_netlink_message **ret,
                       int family, const char *set_name, uint32_t set_id,
                       uint32_t flags, uint32_t type, uint32_t klen, uint32_t dtype, uint32_t dlen) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
        int r;

        r = nft_new_set(nfnl, &m, family, set_name, set_id, flags | NFT_SET_MAP, type, klen);
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_SET_DATA_TYPE, htobe32(dtype));
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_SET_DATA_LEN, htobe32(dlen));
        if (r < 0)
                return r;

        *ret = TAKE_PTR(m);
        return 0;
}
556
/* Creates a message that adds a single element (key, plus optional data) to the set or map
 * 'set_name' of the systemd table.
 *
 * Ideally there would be an API that provides:
 *
 * 1) an init function to add the main ruleset skeleton,
 * 2) a function that populates the sets with all known address/port pairs to s/dnat for,
 * 3) a function that can remove address/port pairs again.
 *
 * For now the existing API is used, which is built on an 'add/delete a rule' paradigm. That
 * paradigm is replicated here, hence each element gets added to the set one by one.
 *
 * On success the message is returned in *ret and 0 is returned; on failure a negative value is
 * returned and *ret is left untouched. */
static int nft_add_element(sd_netlink *nfnl, sd_netlink_message **ret,
                           int family, const char *set_name,
                           const void *key, uint32_t klen,
                           const void *data, uint32_t dlen) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
        int r;

        r = sd_nfnl_nft_message_new_setelems_begin(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, set_name);
        if (r >= 0)
                r = sd_nfnl_nft_message_add_setelem(m, 0, key, klen, data, dlen);
        /* More set elements could theoretically be appended before ending the element list. */
        if (r >= 0)
                r = sd_nfnl_nft_message_add_setelem_end(m);
        if (r < 0)
                return r;

        *ret = TAKE_PTR(m);
        return 0;
}
592
/* Creates a message that deletes a single element (key, plus optional data) from the set or map
 * 'set_name' of the systemd table.
 *
 * On success the message is returned in *ret and 0 is returned; on failure a negative value is
 * returned and *ret is left untouched. */
static int nft_del_element(sd_netlink *nfnl,
                           sd_netlink_message **ret, int family, const char *set_name,
                           const void *key, uint32_t klen,
                           const void *data, uint32_t dlen) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
        int r;

        r = sd_nfnl_nft_message_del_setelems_begin(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, set_name);
        if (r >= 0)
                r = sd_nfnl_nft_message_add_setelem(m, 0, key, klen, data, dlen);
        if (r >= 0)
                r = sd_nfnl_nft_message_add_setelem_end(m);
        if (r < 0)
                return r;

        *ret = TAKE_PTR(m);
        return 0;
}
614
615 /* This is needed so 'nft' userspace tool can properly format the contents
616 * of the set/map when someone uses 'nft' to inspect their content.
617 *
618 * The values cannot be changed, they are part of the nft tool type identifier ABI.
619 */
/* Number of bits each type identifier occupies within a concatenated type. */
#define TYPE_BITS 6

enum nft_key_types {
        TYPE_IPADDR        = 7,
        TYPE_IP6ADDR       = 8,
        TYPE_INET_PROTOCOL = 12,
        TYPE_INET_SERVICE  = 13,
};

/* Returns the type identifier of the concatenation 'a . b': 'a' shifted left by TYPE_BITS, with
 * 'b' placed in the low bits. */
static uint32_t concat_types2(enum nft_key_types a, enum nft_key_types b) {
        return ((uint32_t) a << TYPE_BITS) | (uint32_t) b;
}
637
638 /* enough space to hold netlink messages for table skeleton */
639 #define NFT_INIT_MSGS 16
fw_nftables_init_family(sd_netlink * nfnl,int family)640 static int fw_nftables_init_family(sd_netlink *nfnl, int family) {
641 sd_netlink_message *batch[NFT_INIT_MSGS] = {};
642 size_t msgcnt = 0, i, ip_type_size;
643 uint32_t set_id = 0;
644 int ip_type, r;
645
646 assert(IN_SET(family, AF_INET, AF_INET6));
647
648 r = sd_nfnl_message_batch_begin(nfnl, &batch[msgcnt]);
649 if (r < 0)
650 goto out_unref;
651
652 msgcnt++;
653 assert(msgcnt < NFT_INIT_MSGS);
654 /* Set F_EXCL so table add fails if the table already exists. */
655 r = sd_nfnl_nft_message_new_table(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_TABLE_NAME);
656 if (r < 0)
657 goto out_unref;
658
659 msgcnt++;
660 assert(msgcnt < NFT_INIT_MSGS);
661
662 r = sd_nfnl_nft_message_new_basechain(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_TABLE_NAME,
663 "prerouting", "nat",
664 NF_INET_PRE_ROUTING, NF_IP_PRI_NAT_DST + 1);
665 if (r < 0)
666 goto out_unref;
667
668 msgcnt++;
669 assert(msgcnt < NFT_INIT_MSGS);
670 r = sd_nfnl_nft_message_new_basechain(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_TABLE_NAME,
671 "output", "nat",
672 NF_INET_LOCAL_OUT, NF_IP_PRI_NAT_DST + 1);
673 if (r < 0)
674 goto out_unref;
675
676 msgcnt++;
677 assert(msgcnt < NFT_INIT_MSGS);
678 r = sd_nfnl_nft_message_new_basechain(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_TABLE_NAME,
679 "postrouting", "nat",
680 NF_INET_POST_ROUTING, NF_IP_PRI_NAT_SRC + 1);
681 if (r < 0)
682 goto out_unref;
683
684 if (family == AF_INET) {
685 ip_type_size = sizeof(uint32_t);
686 ip_type = TYPE_IPADDR;
687 } else {
688 assert(family == AF_INET6);
689 ip_type_size = sizeof(struct in6_addr);
690 ip_type = TYPE_IP6ADDR;
691 }
692 msgcnt++;
693 assert(msgcnt < NFT_INIT_MSGS);
694 /* set to store ip address ranges we should masquerade for */
695 r = nft_new_set(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_MASQ_SET_NAME, ++set_id, NFT_SET_INTERVAL, ip_type, ip_type_size);
696 if (r < 0)
697 goto out_unref;
698
699 /*
700 * map to store ip address:port pair to dnat to. elements in concatenation
701 * are rounded up to 4 bytes.
702 *
703 * Example: ip protocol . tcp daddr is sizeof(uint32_t) + sizeof(uint32_t), not
704 * sizeof(uint8_t) + sizeof(uint16_t).
705 */
706 msgcnt++;
707 assert(msgcnt < NFT_INIT_MSGS);
708 r = nft_new_map(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_DNAT_MAP_NAME, ++set_id, 0,
709 concat_types2(TYPE_INET_PROTOCOL, TYPE_INET_SERVICE), sizeof(uint32_t) * 2,
710 concat_types2(ip_type, TYPE_INET_SERVICE), ip_type_size + sizeof(uint32_t));
711 if (r < 0)
712 goto out_unref;
713
714 msgcnt++;
715 assert(msgcnt < NFT_INIT_MSGS);
716 r = sd_nfnl_message_new_dnat_rule_pre(nfnl, &batch[msgcnt], family, "prerouting");
717 if (r < 0)
718 goto out_unref;
719
720 msgcnt++;
721 assert(msgcnt < NFT_INIT_MSGS);
722 r = sd_nfnl_message_new_dnat_rule_out(nfnl, &batch[msgcnt], family, "output");
723 if (r < 0)
724 goto out_unref;
725
726 msgcnt++;
727 r = sd_nfnl_message_new_masq_rule(nfnl, &batch[msgcnt], family, "postrouting");
728 if (r < 0)
729 goto out_unref;
730
731 msgcnt++;
732 assert(msgcnt < NFT_INIT_MSGS);
733 r = sd_nfnl_message_batch_end(nfnl, &batch[msgcnt]);
734 if (r < 0)
735 goto out_unref;
736
737 msgcnt++;
738 assert(msgcnt <= NFT_INIT_MSGS);
739 r = nfnl_netlink_sendv(nfnl, batch, msgcnt);
740 if (r == -EEXIST)
741 r = 0;
742
743 out_unref:
744 for (i = 0; i < msgcnt; i++)
745 sd_netlink_message_unref(batch[i]);
746
747 return r;
748 }
749
/* Opens a netfilter netlink socket, sets up the NAT skeleton and stores the socket in ctx->nfnl.
 *
 * Failure to set up the IPv4 skeleton is fatal and returned to the caller. IPv6 is only attempted
 * if socket_ipv6_is_supported() says so, and a failure there is merely logged at debug level. */
int fw_nftables_init(FirewallContext *ctx) {
        _cleanup_(sd_netlink_unrefp) sd_netlink *nfnl = NULL;
        int r;

        r = sd_nfnl_socket_open(&nfnl);
        if (r >= 0)
                r = fw_nftables_init_family(nfnl, AF_INET);
        if (r < 0)
                return r;

        if (socket_ipv6_is_supported()) {
                int r6 = fw_nftables_init_family(nfnl, AF_INET6);

                if (r6 < 0)
                        log_debug_errno(r6, "Failed to init ipv6 NAT: %m");
        }

        /* Hand the socket over to the context; it is released again by fw_nftables_exit(). */
        ctx->nfnl = TAKE_PTR(nfnl);
        return 0;
}
771
/* Drops the context's reference to the netlink socket and stores whatever sd_netlink_unref()
 * returns back into ctx->nfnl. */
void fw_nftables_exit(FirewallContext *ctx) {
        sd_netlink *released = sd_netlink_unref(ctx->nfnl);

        ctx->nfnl = released;
}
775
/* Appends the two elements describing the IPv4 network source/prefixlen to the set element
 * message m: the network address as start of the interval, and the first address after the
 * network as an element flagged NFT_SET_ELEM_INTERVAL_END.
 *
 * If the network reaches up to the very end of the address space, the end element wraps around to
 * 0.0.0.0. prefixlen must be in the range 0..32.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int nft_message_add_setelem_iprange(sd_netlink_message *m,
                                           const union in_addr_union *source,
                                           unsigned int prefixlen) {
        uint32_t hostmask, mask, start, end;
        uint64_t end64;
        unsigned int nplen;
        int r;

        assert(prefixlen <= 32);
        nplen = 32 - prefixlen;

        /* Host part of the address, in host byte order. Shifting a 32 bit value by 32 is
         * undefined behaviour, hence a /0 prefix (nplen == 32) is handled explicitly. */
        hostmask = nplen < 32 ? (UINT32_C(1) << nplen) - 1U : UINT32_MAX;
        mask = htobe32(~hostmask);
        start = source->in.s_addr & mask;

        r = sd_nfnl_nft_message_add_setelem(m, 0, &start, sizeof(start), NULL, 0);
        if (r < 0)
                return r;

        r = sd_nfnl_nft_message_add_setelem_end(m);
        if (r < 0)
                return r;

        /* First address past the network. Computed in 64 bit so that both the shift and the
         * addition are well defined; a result beyond the address space wraps to 0. */
        end64 = (uint64_t) be32toh(start) + (UINT64_C(1) << nplen);
        end = end64 > UINT32_MAX ? 0U : (uint32_t) end64;
        end = htobe32(end);

        r = sd_nfnl_nft_message_add_setelem(m, 1, &end, sizeof(end), NULL, 0);
        if (r < 0)
                return r;

        r = sd_netlink_message_append_u32(m, NFTA_SET_ELEM_FLAGS, htobe32(NFT_SET_ELEM_INTERVAL_END));
        if (r < 0)
                return r;

        r = sd_nfnl_nft_message_add_setelem_end(m);
        if (r < 0)
                return r;

        return 0;
}
817
/* IPv6 counterpart of nft_message_add_setelem_iprange(): appends two elements to the set element
 * message m, namely the start of the range covered by source/prefixlen and its end, the latter
 * flagged NFT_SET_ELEM_INTERVAL_END. The range boundaries are computed by in_addr_prefix_range(). */
static int nft_message_add_setelem_ip6range(
                sd_netlink_message *m,
                const union in_addr_union *source,
                unsigned int prefixlen) {

        union in_addr_union first, last;
        int r;

        r = in_addr_prefix_range(AF_INET6, source, prefixlen, &first, &last);
        if (r < 0)
                return r;

        /* Element 0: start of the interval. */
        r = sd_nfnl_nft_message_add_setelem(m, 0, &first.in6, sizeof(first.in6), NULL, 0);
        if (r >= 0)
                r = sd_nfnl_nft_message_add_setelem_end(m);

        /* Element 1: end of the interval. */
        if (r >= 0)
                r = sd_nfnl_nft_message_add_setelem(m, 1, &last.in6, sizeof(last.in6), NULL, 0);
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_SET_ELEM_FLAGS, htobe32(NFT_SET_ELEM_INTERVAL_END));
        if (r >= 0)
                r = sd_nfnl_nft_message_add_setelem_end(m);

        return r;
}
848
849 #define NFT_MASQ_MSGS 3
850
fw_nftables_add_masquerade_internal(FirewallContext * ctx,bool add,int af,const union in_addr_union * source,unsigned int source_prefixlen)851 static int fw_nftables_add_masquerade_internal(
852 FirewallContext *ctx,
853 bool add,
854 int af,
855 const union in_addr_union *source,
856 unsigned int source_prefixlen) {
857
858 sd_netlink_message *transaction[NFT_MASQ_MSGS] = {};
859 size_t tsize;
860 int r;
861
862 if (!source || source_prefixlen == 0)
863 return -EINVAL;
864
865 if (af == AF_INET6 && source_prefixlen < 8)
866 return -EINVAL;
867
868 r = sd_nfnl_message_batch_begin(ctx->nfnl, &transaction[0]);
869 if (r < 0)
870 return r;
871 tsize = 1;
872 if (add)
873 r = sd_nfnl_nft_message_new_setelems_begin(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_MASQ_SET_NAME);
874 else
875 r = sd_nfnl_nft_message_del_setelems_begin(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_MASQ_SET_NAME);
876 if (r < 0)
877 goto out_unref;
878
879 if (af == AF_INET)
880 r = nft_message_add_setelem_iprange(transaction[tsize], source, source_prefixlen);
881 else
882 r = nft_message_add_setelem_ip6range(transaction[tsize], source, source_prefixlen);
883 if (r < 0)
884 goto out_unref;
885
886 ++tsize;
887 assert(tsize < NFT_MASQ_MSGS);
888 r = sd_nfnl_message_batch_end(ctx->nfnl, &transaction[tsize]);
889 if (r < 0)
890 return r;
891
892 ++tsize;
893 r = nfnl_netlink_sendv(ctx->nfnl, transaction, tsize);
894
895 out_unref:
896 while (tsize > 0)
897 sd_netlink_message_unref(transaction[--tsize]);
898 return r < 0 ? r : 0;
899 }
900
/* Adds or removes a masquerading entry for source/source_prefixlen, recreating the systemd NAT
 * table first if it has vanished.
 *
 * Returns -EOPNOTSUPP for AF_INET6 if IPv6 is not supported, otherwise the result of the
 * underlying operation. */
int fw_nftables_add_masquerade(
                FirewallContext *ctx,
                bool add,
                int af,
                const union in_addr_union *source,
                unsigned int source_prefixlen) {

        int r;

        if (!socket_ipv6_is_supported() && af == AF_INET6)
                return -EOPNOTSUPP;

        r = fw_nftables_add_masquerade_internal(ctx, add, af, source, source_prefixlen);
        if (r == -ENOENT) {
                /* When someone runs 'nft flush ruleset' in the same network namespace, the
                 * systemd NAT table is torn down as well.
                 *
                 * This differs from 'iptables -t nat -F': that also removes all rules added by
                 * the systemd iptables backend, but iptables has builtin chains that cannot be
                 * deleted, so the next add operation 'just works'.
                 *
                 * In the nftables case everything is removed, and the next operation yields
                 * -ENOENT. If we see that, replay the initial table setup and, if that works,
                 * redo the operation.
                 *
                 * Note that this does not protect against external sabotage such as
                 * 'while true; nft flush ruleset; done'. Nothing can be done about that short of
                 * extending the kernel so that tables can be owned by systemd-networkd and only
                 * be deleted by the owning process. */
                r = fw_nftables_init_family(ctx->nfnl, af);
                if (r >= 0)
                        r = fw_nftables_add_masquerade_internal(ctx, add, af, source, source_prefixlen);
        }

        return r;
}
940
941 #define NFT_DNAT_MSGS 4
942
fw_nftables_add_local_dnat_internal(FirewallContext * ctx,bool add,int af,int protocol,uint16_t local_port,const union in_addr_union * remote,uint16_t remote_port,const union in_addr_union * previous_remote)943 static int fw_nftables_add_local_dnat_internal(
944 FirewallContext *ctx,
945 bool add,
946 int af,
947 int protocol,
948 uint16_t local_port,
949 const union in_addr_union *remote,
950 uint16_t remote_port,
951 const union in_addr_union *previous_remote) {
952
953 sd_netlink_message *transaction[NFT_DNAT_MSGS] = {};
954 static bool ipv6_supported = true;
955 uint32_t data[5], key[2], dlen;
956 size_t tsize;
957 int r;
958
959 assert(add || !previous_remote);
960
961 if (!ipv6_supported && af == AF_INET6)
962 return -EOPNOTSUPP;
963
964 if (!IN_SET(protocol, IPPROTO_TCP, IPPROTO_UDP))
965 return -EPROTONOSUPPORT;
966
967 if (local_port <= 0)
968 return -EINVAL;
969
970 key[0] = protocol;
971 key[1] = htobe16(local_port);
972
973 if (!remote)
974 return -EOPNOTSUPP;
975
976 if (remote_port <= 0)
977 return -EINVAL;
978
979 if (af == AF_INET) {
980 dlen = 8;
981 data[1] = htobe16(remote_port);
982 } else {
983 assert(af == AF_INET6);
984 dlen = sizeof(data);
985 data[4] = htobe16(remote_port);
986 }
987
988 r = sd_nfnl_message_batch_begin(ctx->nfnl, &transaction[0]);
989 if (r < 0)
990 return r;
991
992 tsize = 1;
993 /* If a previous remote is set, remove its entry */
994 if (add && previous_remote && !in_addr_equal(af, previous_remote, remote)) {
995 if (af == AF_INET)
996 data[0] = previous_remote->in.s_addr;
997 else
998 memcpy(data, &previous_remote->in6, sizeof(previous_remote->in6));
999
1000 r = nft_del_element(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_DNAT_MAP_NAME, key, sizeof(key), data, dlen);
1001 if (r < 0)
1002 goto out_unref;
1003
1004 tsize++;
1005 }
1006
1007 if (af == AF_INET)
1008 data[0] = remote->in.s_addr;
1009 else
1010 memcpy(data, &remote->in6, sizeof(remote->in6));
1011
1012 assert(tsize < NFT_DNAT_MSGS);
1013 if (add)
1014 r = nft_add_element(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_DNAT_MAP_NAME, key, sizeof(key), data, dlen);
1015 else
1016 r = nft_del_element(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_DNAT_MAP_NAME, key, sizeof(key), data, dlen);
1017 if (r < 0)
1018 goto out_unref;
1019
1020 tsize++;
1021 assert(tsize < NFT_DNAT_MSGS);
1022
1023 r = sd_nfnl_message_batch_end(ctx->nfnl, &transaction[tsize]);
1024 if (r < 0)
1025 goto out_unref;
1026
1027 tsize++;
1028 assert(tsize <= NFT_DNAT_MSGS);
1029
1030 r = nfnl_netlink_sendv(ctx->nfnl, transaction, tsize);
1031 if (r == -EOVERFLOW && af == AF_INET6) {
1032 /* The current implementation of DNAT in systemd requires kernel's
1033 * fdb9c405e35bdc6e305b9b4e20ebc141ed14fc81 (v5.8), and the older kernel returns
1034 * -EOVERFLOW. Let's treat the error as -EOPNOTSUPP. */
1035 log_debug_errno(r, "The current implementation of IPv6 DNAT in systemd requires kernel 5.8 or newer, ignoring: %m");
1036 ipv6_supported = false;
1037 r = -EOPNOTSUPP;
1038 }
1039
1040 out_unref:
1041 while (tsize > 0)
1042 sd_netlink_message_unref(transaction[--tsize]);
1043
1044 return r < 0 ? r : 0;
1045 }
1046
/* Adds or removes a local DNAT mapping, recreating the systemd NAT table first if it has vanished.
 *
 * Returns -EOPNOTSUPP for AF_INET6 if IPv6 is not supported, otherwise the result of the
 * underlying operation. */
int fw_nftables_add_local_dnat(
                FirewallContext *ctx,
                bool add,
                int af,
                int protocol,
                uint16_t local_port,
                const union in_addr_union *remote,
                uint16_t remote_port,
                const union in_addr_union *previous_remote) {

        int r;

        if (!socket_ipv6_is_supported() && af == AF_INET6)
                return -EOPNOTSUPP;

        r = fw_nftables_add_local_dnat_internal(ctx, add, af, protocol, local_port, remote, remote_port, previous_remote);
        if (r == -ENOENT) {
                /* The table is gone, e.g. because someone flushed the whole ruleset. Replay the
                 * initial setup and retry, just like fw_nftables_add_masquerade() does. */
                r = fw_nftables_init_family(ctx->nfnl, af);
                if (r >= 0)
                        /* table created anew; previous address already gone */
                        r = fw_nftables_add_local_dnat_internal(ctx, add, af, protocol, local_port, remote, remote_port, NULL);
        }

        return r;
}
1074