1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2 
3 #include <arpa/inet.h>
4 #include <endian.h>
5 #include <errno.h>
6 #include <stddef.h>
7 #include <string.h>
8 #include <linux/netfilter/nf_tables.h>
9 #include <linux/netfilter/nf_nat.h>
10 #include <linux/netfilter_ipv4.h>
11 #include <netinet/ip.h>
12 #include <netinet/ip6.h>
13 
14 #include "sd-netlink.h"
15 
16 #include "alloc-util.h"
17 #include "firewall-util.h"
18 #include "firewall-util-private.h"
19 #include "in-addr-util.h"
20 #include "macro.h"
21 #include "socket-util.h"
22 #include "time-util.h"
23 
/* Names of the nftables objects this file works with: the table, and inside it the map used by the
 * DNAT rules and the set used by the masquerade rule. */
#define NFT_SYSTEMD_DNAT_MAP_NAME "map_port_ipport"
#define NFT_SYSTEMD_TABLE_NAME   "io.systemd.nat"
#define NFT_SYSTEMD_MASQ_SET_NAME "masq_saddr"

/* Timeout handed to sd_netlink_read() when waiting for the reply to a sent message. */
#define NFNL_DEFAULT_TIMEOUT_USECS (1ULL * USEC_PER_SEC)

/* Byte offset into the transport header at which the destination port is loaded by the DNAT rules. */
#define UDP_DPORT_OFFSET 2
31 
/* Sends msgcount netfilter netlink messages in one go and collects the results.
 *
 * After a successful sd_netlink_sendv(), a reply is read for every message except the first and
 * the last one (the caller visible in this file stores the batch begin/end messages in those two
 * slots). Reading continues after a failure; the first negative result is returned, 0 if none. */
static int nfnl_netlink_sendv(
                sd_netlink *nfnl,
                sd_netlink_message *messages[static 1],
                size_t msgcount) {

        _cleanup_free_ uint32_t *serial = NULL;
        int first_error = 0, k;

        assert(nfnl);
        assert(messages);
        assert(msgcount > 0);

        k = sd_netlink_sendv(nfnl, messages, msgcount, &serial);
        if (k < 0)
                return k;

        for (size_t idx = 1; idx < msgcount - 1; idx++) {
                /* If message is an error, this returns embedded errno */
                k = sd_netlink_read(nfnl, serial[idx], NFNL_DEFAULT_TIMEOUT_USECS, NULL);
                if (k < 0 && first_error == 0)
                        first_error = k;
        }

        return first_error;
}
60 
/* Begins a new expression inside the rule being built in m: opens an NFTA_LIST_ELEM array entry,
 * stores the expression name as NFTA_EXPR_NAME and opens the NFTA_EXPR_DATA container for it.
 *
 * Both the array entry and the data container are still open on return; the callers in this file
 * append the expression attributes and then close the two containers themselves. */
static int nfnl_add_open_expr_container(sd_netlink_message *m, const char *name) {
        int r;

        r = sd_netlink_message_open_array(m, NFTA_LIST_ELEM);
        if (r >= 0)
                r = sd_netlink_message_append_string(m, NFTA_EXPR_NAME, name);
        if (r < 0)
                return r;

        return sd_netlink_message_open_container_union(m, NFTA_EXPR_DATA, name);
}
74 
/* Appends a complete "fib" expression to the expression list being built in m.
 *
 * nft_fib_flags, result and dreg are stored big-endian as NFTA_FIB_FLAGS, NFTA_FIB_RESULT and
 * NFTA_FIB_DREG, in that order. Returns a negative value as soon as one of the netlink calls fails. */
static int nfnl_add_expr_fib(sd_netlink_message *m, uint32_t nft_fib_flags,
                             enum nft_fib_result result,
                             enum nft_registers dreg) {
        const struct {
                int type;
                uint32_t value;
        } attrs[] = {
                { NFTA_FIB_FLAGS,  nft_fib_flags },
                { NFTA_FIB_RESULT, result        },
                { NFTA_FIB_DREG,   dreg          },
        };
        int r;

        r = nfnl_add_open_expr_container(m, "fib");
        if (r < 0)
                return r;

        for (size_t i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++) {
                r = sd_netlink_message_append_u32(m, attrs[i].type, htobe32(attrs[i].value));
                if (r < 0)
                        return r;
        }

        r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (r < 0)
                return r;

        return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
}
100 
/* Appends a complete "meta" expression to m: the meta key to read (NFTA_META_KEY) and the
 * register that receives it (NFTA_META_DREG), both stored big-endian.
 *
 * Every step only runs if all previous ones succeeded; the first negative result is returned. */
static int nfnl_add_expr_meta(sd_netlink_message *m, enum nft_meta_keys key,
                              enum nft_registers dreg) {
        int r;

        r = nfnl_add_open_expr_container(m, "meta");
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_META_KEY, htobe32(key));
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_META_DREG, htobe32(dreg));
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return r;
}
123 
/* Appends a complete "payload" expression to m.
 *
 * The attributes are written big-endian in this order: destination register (NFTA_PAYLOAD_DREG),
 * payload base pb (NFTA_PAYLOAD_BASE), offset (NFTA_PAYLOAD_OFFSET) and len (NFTA_PAYLOAD_LEN).
 * Returns a negative value as soon as one of the netlink calls fails. */
static int nfnl_add_expr_payload(sd_netlink_message *m, enum nft_payload_bases pb,
                                 uint32_t offset, uint32_t len, enum nft_registers dreg) {
        const struct {
                int type;
                uint32_t value;
        } attrs[] = {
                { NFTA_PAYLOAD_DREG,   dreg   },
                { NFTA_PAYLOAD_BASE,   pb     },
                { NFTA_PAYLOAD_OFFSET, offset },
                { NFTA_PAYLOAD_LEN,    len    },
        };
        int r;

        r = nfnl_add_open_expr_container(m, "payload");
        if (r < 0)
                return r;

        for (size_t i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++) {
                r = sd_netlink_message_append_u32(m, attrs[i].type, htobe32(attrs[i].value));
                if (r < 0)
                        return r;
        }

        r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (r < 0)
                return r;

        return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
}
150 
/* Starts a "lookup" expression in m and fills in the part shared by set and map lookups: the name
 * of the set (NFTA_LOOKUP_SET) and the register holding the key (NFTA_LOOKUP_SREG, big-endian).
 *
 * The expression containers are NOT closed here; that is left to the caller. */
static int nfnl_add_expr_lookup_set_data(sd_netlink_message *m, const char *set_name,
                                         enum nft_registers sreg) {
        int r;

        r = nfnl_add_open_expr_container(m, "lookup");
        if (r >= 0)
                r = sd_netlink_message_append_string(m, NFTA_LOOKUP_SET, set_name);
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_LOOKUP_SREG, htobe32(sreg));

        return r;
}
165 
/* Appends a complete "lookup" expression that looks up the content of sreg in the set set_name. */
static int nfnl_add_expr_lookup_set(sd_netlink_message *m, const char *set_name,
                                    enum nft_registers sreg) {
        int r;

        r = nfnl_add_expr_lookup_set_data(m, set_name, sreg);

        /* Two containers are open at this point and are closed innermost first:
         * NFTA_EXPR_DATA, then NFTA_LIST_ELEM. Stop at the first failure. */
        for (unsigned depth = 0; depth < 2 && r >= 0; depth++)
                r = sd_netlink_message_close_container(m);

        return r;
}
179 
/* Appends a complete "lookup" expression for a map: like the plain set lookup, but additionally
 * names the register (NFTA_LOOKUP_DREG, big-endian) that receives the data of the matching element.
 *
 * Every step only runs if all previous ones succeeded; the first negative result is returned. */
static int nfnl_add_expr_lookup_map(sd_netlink_message *m, const char *set_name,
                                    enum nft_registers sreg, enum nft_registers dreg) {
        int r;

        r = nfnl_add_expr_lookup_set_data(m, set_name, sreg);
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_LOOKUP_DREG, htobe32(dreg));
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return r;
}
198 
/* Wraps dlen bytes of data into a container of type attr: the container is opened, the bytes are
 * stored in it as NFTA_DATA_VALUE, and the container is closed again. */
static int nfnl_add_expr_data(sd_netlink_message *m, int attr, const void *data, uint32_t dlen) {
        int r;

        r = sd_netlink_message_open_container(m, attr);
        if (r >= 0)
                r = sd_netlink_message_append_data(m, NFTA_DATA_VALUE, data, dlen);
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* attr */

        return r;
}
211 
/* Adds the value a "cmp" expression compares against: dlen bytes of data inside NFTA_CMP_DATA. */
static int nfnl_add_expr_cmp_data(sd_netlink_message *m, const void *data, uint32_t dlen) {
        const int attr = NFTA_CMP_DATA;

        return nfnl_add_expr_data(m, attr, data, dlen);
}
215 
/* Appends a complete "cmp" expression to m: the comparison operator (NFTA_CMP_OP) and the source
 * register (NFTA_CMP_SREG), both big-endian, followed by the dlen bytes at data to compare against.
 *
 * Returns a negative value as soon as one of the netlink calls fails. */
static int nfnl_add_expr_cmp(sd_netlink_message *m, enum nft_cmp_ops cmp_op,
                             enum nft_registers sreg, const void *data, uint32_t dlen) {
        const struct {
                int type;
                uint32_t value;
        } attrs[] = {
                { NFTA_CMP_OP,   cmp_op },
                { NFTA_CMP_SREG, sreg   },
        };
        int r;

        r = nfnl_add_open_expr_container(m, "cmp");
        if (r < 0)
                return r;

        for (size_t i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++) {
                r = sd_netlink_message_append_u32(m, attrs[i].type, htobe32(attrs[i].value));
                if (r < 0)
                        return r;
        }

        r = nfnl_add_expr_cmp_data(m, data, dlen);
        if (r < 0)
                return r;

        r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (r < 0)
                return r;

        return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
}
240 
/* Appends a complete "bitwise" expression to m.
 *
 * Source register, destination register and operand length are written big-endian first
 * (NFTA_BITWISE_SREG, NFTA_BITWISE_DREG, NFTA_BITWISE_LEN), followed by the two operands of len
 * bytes each: 'and' as NFTA_BITWISE_MASK and 'xor' as NFTA_BITWISE_XOR.
 * Returns a negative value as soon as one of the netlink calls fails. */
static int nfnl_add_expr_bitwise(sd_netlink_message *m,
                                 enum nft_registers sreg,
                                 enum nft_registers dreg,
                                 const void *and,
                                 const void *xor, uint32_t len) {
        const struct {
                int type;
                uint32_t value;
        } attrs[] = {
                { NFTA_BITWISE_SREG, sreg },
                { NFTA_BITWISE_DREG, dreg },
                { NFTA_BITWISE_LEN,  len  },
        };
        const struct {
                int type;
                const void *bytes;
        } operands[] = {
                { NFTA_BITWISE_MASK, and },
                { NFTA_BITWISE_XOR,  xor },
        };
        int r;

        r = nfnl_add_open_expr_container(m, "bitwise");
        if (r < 0)
                return r;

        for (size_t i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++) {
                r = sd_netlink_message_append_u32(m, attrs[i].type, htobe32(attrs[i].value));
                if (r < 0)
                        return r;
        }

        for (size_t i = 0; i < sizeof(operands) / sizeof(operands[0]); i++) {
                r = nfnl_add_expr_data(m, operands[i].type, operands[i].bytes, len);
                if (r < 0)
                        return r;
        }

        r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (r < 0)
                return r;

        return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
}
275 
/* Appends a complete "nat" expression of type NFT_NAT_DNAT to m.
 *
 * family is stored as NFTA_NAT_FAMILY; areg and preg name the registers holding the new
 * destination address (NFTA_NAT_REG_ADDR_MIN) and port (NFTA_NAT_REG_PROTO_MIN). All values are
 * written big-endian. Returns a negative value as soon as one of the netlink calls fails. */
static int nfnl_add_expr_dnat(sd_netlink_message *m,
                              int family,
                              enum nft_registers areg,
                              enum nft_registers preg) {
        const struct {
                int type;
                uint32_t value;
        } attrs[] = {
                { NFTA_NAT_TYPE,          NFT_NAT_DNAT },
                { NFTA_NAT_FAMILY,        family       },
                { NFTA_NAT_REG_ADDR_MIN,  areg         },
                { NFTA_NAT_REG_PROTO_MIN, preg         },
        };
        int r;

        r = nfnl_add_open_expr_container(m, "nat");
        if (r < 0)
                return r;

        for (size_t i = 0; i < sizeof(attrs) / sizeof(attrs[0]); i++) {
                r = sd_netlink_message_append_u32(m, attrs[i].type, htobe32(attrs[i].value));
                if (r < 0)
                        return r;
        }

        r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
        if (r < 0)
                return r;

        return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
}
306 
/* Appends a "masq" expression to m. Only the expression name is written: no NFTA_EXPR_DATA
 * container is opened, so the list element is closed right after the name. */
static int nfnl_add_expr_masq(sd_netlink_message *m) {
        int r;

        r = sd_netlink_message_open_array(m, NFTA_LIST_ELEM);
        if (r >= 0)
                r = sd_netlink_message_append_string(m, NFTA_EXPR_NAME, "masq");
        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */

        return r;
}
320 
/* Builds the masquerade rule for the given chain of the systemd NAT table and hands it out in *ret.
 *
 * The rule loads the packet's source address into reg1 (IPv4: 4 bytes; otherwise the 16 byte IPv6
 * address), looks it up in the masq_saddr set and masquerades on a match.
 * *ret is only written on success (return value 0); on failure a negative value is returned. */
static int sd_nfnl_message_new_masq_rule(sd_netlink *nfnl, sd_netlink_message **ret, int family,
                                         const char *chain) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
        uint32_t saddr_offset, saddr_len;
        int r;

        /* -t nat -A POSTROUTING -p protocol -s source/pflen -o out_interface -d destination/pflen -j MASQUERADE */

        r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain);
        if (r < 0)
                return r;

        r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS);
        if (r < 0)
                return r;

        /* Where the source address lives in the network header, and how large it is. */
        if (family == AF_INET) {
                saddr_offset = offsetof(struct iphdr, saddr);
                saddr_len = sizeof(uint32_t);
        } else {
                saddr_offset = offsetof(struct ip6_hdr, ip6_src.s6_addr);
                saddr_len = sizeof(struct in6_addr);
        }

        /* 1st statement: ip saddr @masq_saddr. Place iph->saddr in reg1, resp. ipv6 in reg1..reg4. */
        r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, saddr_offset, saddr_len, NFT_REG32_01);
        if (r < 0)
                return r;

        /* 1st statement (cont.): use reg1 content to make lookup in @masq_saddr set. */
        r = nfnl_add_expr_lookup_set(m, NFT_SYSTEMD_MASQ_SET_NAME, NFT_REG32_01);
        if (r < 0)
                return r;

        /* 2nd statement: masq.  Only executed by kernel if the previous lookup was successful. */
        r = nfnl_add_expr_masq(m);
        if (r < 0)
                return r;

        r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */
        if (r < 0)
                return r;

        *ret = TAKE_PTR(m);
        return 0;
}
362 
/* Builds the DNAT rule for the prerouting path of the systemd NAT table and hands it out in *ret.
 *
 * The rule only continues for packets whose destination address is of type local, then looks up
 * 'l4proto . dport' in the map_port_ipport map and DNATs to the address/port found there.
 * Every step only runs if all previous ones succeeded. *ret is only written on success (return
 * value 0); on failure the first negative result is returned. */
static int sd_nfnl_message_new_dnat_rule_pre(sd_netlink *nfnl, sd_netlink_message **ret, int family,
                                             const char *chain) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
        /* Register the looked-up port ends up in: reg2 for IPv4, reg5 otherwise. */
        const enum nft_registers proto_reg = family == AF_INET ? NFT_REG32_02 : NFT_REG32_05;
        uint32_t local = RTN_LOCAL;
        int r;

        /* -t nat -A PREROUTING -p protocol --dport local_port -i in_interface -s source/pflen
         * -d destination/pflen -j DNAT --to-destination remote_addr:remote_port */

        r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain);
        if (r >= 0)
                r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS);

        /* 1st statement: fib daddr type local */
        if (r >= 0)
                r = nfnl_add_expr_fib(m, NFTA_FIB_F_DADDR, NFT_FIB_RESULT_ADDRTYPE, NFT_REG32_01);

        /* 1st statement (cont.): compare RTN_LOCAL */
        if (r >= 0)
                r = nfnl_add_expr_cmp(m, NFT_CMP_EQ, NFT_REG32_01, &local, sizeof(local));

        /* 2nd statement: lookup local port in map, fetch address:dport to map to.
         * The l4 protocol goes to reg1, the destination port to reg2. */
        if (r >= 0)
                r = nfnl_add_expr_meta(m, NFT_META_L4PROTO, NFT_REG32_01);
        if (r >= 0)
                r = nfnl_add_expr_payload(m, NFT_PAYLOAD_TRANSPORT_HEADER, UDP_DPORT_OFFSET,
                                          sizeof(uint16_t), NFT_REG32_02);

        /* 3rd statement: lookup 'l4proto . dport', e.g. 'tcp . 22' as key and
         * store address and port for the dnat mapping in REG1/REG2. */
        if (r >= 0)
                r = nfnl_add_expr_lookup_map(m, NFT_SYSTEMD_DNAT_MAP_NAME, NFT_REG32_01, NFT_REG32_01);

        /* 4th statement: dnat to the address/port delivered by the lookup. */
        if (r >= 0)
                r = nfnl_add_expr_dnat(m, family, NFT_REG32_01, proto_reg);

        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */
        if (r < 0)
                return r;

        *ret = TAKE_PTR(m);
        return 0;
}
419 
/* Builds the DNAT rule for the output path of the systemd NAT table and hands it out in *ret.
 *
 * The rule skips packets addressed to loopback (127/8 for IPv4, ::1 otherwise), requires the
 * outgoing interface index to be 1, then looks up 'l4proto . dport' in the map_port_ipport map and
 * DNATs to the address/port found there.
 * Every step only runs if all previous ones succeeded. *ret is only written on success (return
 * value 0); on failure the first negative result is returned. */
static int sd_nfnl_message_new_dnat_rule_out(sd_netlink *nfnl, sd_netlink_message **ret,
                                             int family, const char *chain) {
        static const uint32_t zero = 0, one = 1;

        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
        /* Register the looked-up port ends up in: reg2 for IPv4, reg5 otherwise. */
        const enum nft_registers proto_reg = family == AF_INET ? NFT_REG32_02 : NFT_REG32_05;
        int r;

        r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain);
        if (r >= 0)
                r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS);
        if (r < 0)
                return r;

        /* 1st statement: exclude 127.0.0.1/8: ip daddr != 127.0.0.1/8, resp. avoid ::1 */
        if (family == AF_INET) {
                uint32_t lonet = htobe32(UINT32_C(0x7F000000)), lomask = htobe32(UINT32_C(0xff000000));

                r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct iphdr, daddr),
                                          sizeof(lonet), NFT_REG32_01);

                /* 1st statement (cont.): bitops/prefix */
                if (r >= 0)
                        r = nfnl_add_expr_bitwise(m, NFT_REG32_01, NFT_REG32_01, &lomask, &zero, sizeof(lomask));

                /* 1st statement (cont.): compare reg1 with 127/8 */
                if (r >= 0)
                        r = nfnl_add_expr_cmp(m, NFT_CMP_NEQ, NFT_REG32_01, &lonet, sizeof(lonet));
        } else {
                struct in6_addr loaddr = IN6ADDR_LOOPBACK_INIT;

                r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct ip6_hdr, ip6_dst.s6_addr),
                                          sizeof(loaddr), NFT_REG32_01);
                if (r >= 0)
                        r = nfnl_add_expr_cmp(m, NFT_CMP_NEQ, NFT_REG32_01, &loaddr, sizeof(loaddr));
        }

        /* 2nd statement: meta oif lo */
        if (r >= 0)
                r = nfnl_add_expr_meta(m, NFT_META_OIF, NFT_REG32_01);

        /* 2nd statement (cont.): compare to lo ifindex (1) */
        if (r >= 0)
                r = nfnl_add_expr_cmp(m, NFT_CMP_EQ, NFT_REG32_01, &one, sizeof(one));

        /* 3rd statement: meta l4proto . th dport dnat ip . port to map @map_port_ipport */
        if (r >= 0)
                r = nfnl_add_expr_meta(m, NFT_META_L4PROTO, NFT_REG32_01);

        /* 3rd statement (cont): store the port number in reg2 */
        if (r >= 0)
                r = nfnl_add_expr_payload(m, NFT_PAYLOAD_TRANSPORT_HEADER, UDP_DPORT_OFFSET,
                                          sizeof(uint16_t), NFT_REG32_02);

        /* 3rd statement (cont): use reg1 and reg2 and retrieve
         * the new destination ip and port number.
         *
         * reg1 and reg2 are clobbered and will then contain the new
         * address/port number.
         */
        if (r >= 0)
                r = nfnl_add_expr_lookup_map(m, NFT_SYSTEMD_DNAT_MAP_NAME, NFT_REG32_01, NFT_REG32_01);

        /* 4th statement: dnat connection to address/port retrieved by the
         * preceding expression. */
        if (r >= 0)
                r = nfnl_add_expr_dnat(m, family, NFT_REG32_01, proto_reg);

        if (r >= 0)
                r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */
        if (r < 0)
                return r;

        *ret = TAKE_PTR(m);
        return 0;
}
508 
/* Creates the netlink message that adds a set named set_name to the systemd NAT table.
 *
 * set_id and klen are passed through to sd_nfnl_nft_message_new_set(). flags is only written
 * (NFTA_SET_FLAGS) when non-zero; type is always written as NFTA_SET_KEY_TYPE. Both are stored
 * big-endian. *ret is only written on success; on failure a negative value is returned. */
static int nft_new_set(struct sd_netlink *nfnl,
                       sd_netlink_message **ret,
                       int family, const char *set_name,
                       uint32_t set_id,
                       uint32_t flags, uint32_t type, uint32_t klen) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
        int r;

        r = sd_nfnl_nft_message_new_set(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, set_name, set_id, klen);
        if (r >= 0 && flags != 0)
                r = sd_netlink_message_append_u32(m, NFTA_SET_FLAGS, htobe32(flags));
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_SET_KEY_TYPE, htobe32(type));
        if (r < 0)
                return r;

        *ret = TAKE_PTR(m);
        return r;
}
534 
/* Creates the netlink message that adds a map named set_name to the systemd NAT table.
 *
 * This is nft_new_set() with NFT_SET_MAP added to flags, plus the type and length of the data
 * stored per element (NFTA_SET_DATA_TYPE, NFTA_SET_DATA_LEN; both big-endian).
 * *ret is only written on success (return value 0); on failure a negative value is returned. */
static int nft_new_map(struct sd_netlink *nfnl,
                       sd_netlink_message **ret,
                       int family, const char *set_name, uint32_t set_id,
                       uint32_t flags, uint32_t type, uint32_t klen, uint32_t dtype, uint32_t dlen) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
        int r;

        r = nft_new_set(nfnl, &m, family, set_name, set_id, flags | NFT_SET_MAP, type, klen);
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_SET_DATA_TYPE, htobe32(dtype));
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_SET_DATA_LEN, htobe32(dlen));
        if (r < 0)
                return r;

        *ret = TAKE_PTR(m);
        return 0;
}
556 
/* Creates the netlink message that adds one element to the set or map set_name of the systemd
 * NAT table: key/klen describe the element's key, data/dlen the value passed along with it.
 *
 * *ret is only written on success (return value 0); on failure a negative value is returned. */
static int nft_add_element(sd_netlink *nfnl, sd_netlink_message **ret,
                           int family, const char *set_name,
                           const void *key, uint32_t klen,
                           const void *data, uint32_t dlen) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
        int r;

        /*
         * Ideally there would be an API that provides:
         *
         * 1) an init function to add the main ruleset skeleton
         * 2) a function that populates the sets with all known address/port pairs to s/dnat for
         * 3) a function that can remove address/port pairs again.
         *
         * At this time, the existing API is used which is built on a
         * 'add/delete a rule' paradigm.
         *
         * This is replicated here and each element gets added to the set
         * one-by-one.
         */
        r = sd_nfnl_nft_message_new_setelems_begin(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, set_name);
        if (r >= 0)
                r = sd_nfnl_nft_message_add_setelem(m, 0, key, klen, data, dlen);

        /* could theoretically append more set elements to add here */
        if (r >= 0)
                r = sd_nfnl_nft_message_add_setelem_end(m);
        if (r < 0)
                return r;

        *ret = TAKE_PTR(m);
        return 0;
}
592 
/* Creates the netlink message that removes one element from the set or map set_name of the
 * systemd NAT table. The element is described exactly as in nft_add_element(): key/klen plus
 * data/dlen.
 *
 * *ret is only written on success (return value 0); on failure a negative value is returned. */
static int nft_del_element(sd_netlink *nfnl,
                           sd_netlink_message **ret, int family, const char *set_name,
                           const void *key, uint32_t klen,
                           const void *data, uint32_t dlen) {
        _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
        int r;

        r = sd_nfnl_nft_message_del_setelems_begin(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, set_name);
        if (r >= 0)
                r = sd_nfnl_nft_message_add_setelem(m, 0, key, klen, data, dlen);
        if (r >= 0)
                r = sd_nfnl_nft_message_add_setelem_end(m);
        if (r < 0)
                return r;

        *ret = TAKE_PTR(m);
        return 0;
}
614 
615 /* This is needed so 'nft' userspace tool can properly format the contents
616  * of the set/map when someone uses 'nft' to inspect their content.
617  *
618  * The values cannot be changed, they are part of the nft tool type identifier ABI.
619  */
/* Number of bits each type identifier occupies inside a concatenated type. */
#define TYPE_BITS 6

enum nft_key_types {
        TYPE_IPADDR = 7,
        TYPE_IP6ADDR = 8,
        TYPE_INET_PROTOCOL = 12,
        TYPE_INET_SERVICE = 13,
};

/* Packs two type identifiers into one value: a goes into the bits above TYPE_BITS, b into the
 * low TYPE_BITS bits. */
static uint32_t concat_types2(enum nft_key_types a, enum nft_key_types b) {
        return ((uint32_t) a << TYPE_BITS) | (uint32_t) b;
}
637 
638 /* enough space to hold netlink messages for table skeleton */
639 #define NFT_INIT_MSGS 16
fw_nftables_init_family(sd_netlink * nfnl,int family)640 static int fw_nftables_init_family(sd_netlink *nfnl, int family) {
641         sd_netlink_message *batch[NFT_INIT_MSGS] = {};
642         size_t msgcnt = 0, i, ip_type_size;
643         uint32_t set_id = 0;
644         int ip_type, r;
645 
646         assert(IN_SET(family, AF_INET, AF_INET6));
647 
648         r = sd_nfnl_message_batch_begin(nfnl, &batch[msgcnt]);
649         if (r < 0)
650                 goto out_unref;
651 
652         msgcnt++;
653         assert(msgcnt < NFT_INIT_MSGS);
654         /* Set F_EXCL so table add fails if the table already exists. */
655         r = sd_nfnl_nft_message_new_table(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_TABLE_NAME);
656         if (r < 0)
657                 goto out_unref;
658 
659         msgcnt++;
660         assert(msgcnt < NFT_INIT_MSGS);
661 
662         r = sd_nfnl_nft_message_new_basechain(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_TABLE_NAME,
663                                               "prerouting", "nat",
664                                               NF_INET_PRE_ROUTING, NF_IP_PRI_NAT_DST + 1);
665         if (r < 0)
666                 goto out_unref;
667 
668         msgcnt++;
669         assert(msgcnt < NFT_INIT_MSGS);
670         r = sd_nfnl_nft_message_new_basechain(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_TABLE_NAME,
671                                               "output", "nat",
672                                               NF_INET_LOCAL_OUT, NF_IP_PRI_NAT_DST + 1);
673         if (r < 0)
674                 goto out_unref;
675 
676         msgcnt++;
677         assert(msgcnt < NFT_INIT_MSGS);
678         r = sd_nfnl_nft_message_new_basechain(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_TABLE_NAME,
679                                               "postrouting", "nat",
680                                               NF_INET_POST_ROUTING, NF_IP_PRI_NAT_SRC + 1);
681         if (r < 0)
682                 goto out_unref;
683 
684         if (family == AF_INET) {
685                 ip_type_size = sizeof(uint32_t);
686                 ip_type = TYPE_IPADDR;
687         } else {
688                 assert(family == AF_INET6);
689                 ip_type_size = sizeof(struct in6_addr);
690                 ip_type = TYPE_IP6ADDR;
691         }
692         msgcnt++;
693         assert(msgcnt < NFT_INIT_MSGS);
694         /* set to store ip address ranges we should masquerade for */
695         r = nft_new_set(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_MASQ_SET_NAME, ++set_id, NFT_SET_INTERVAL, ip_type, ip_type_size);
696         if (r < 0)
697                 goto out_unref;
698 
699         /*
700          * map to store ip address:port pair to dnat to.  elements in concatenation
701          * are rounded up to 4 bytes.
702          *
703          * Example: ip protocol . tcp daddr is sizeof(uint32_t) + sizeof(uint32_t), not
704          * sizeof(uint8_t) + sizeof(uint16_t).
705          */
706         msgcnt++;
707         assert(msgcnt < NFT_INIT_MSGS);
708         r = nft_new_map(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_DNAT_MAP_NAME, ++set_id, 0,
709                         concat_types2(TYPE_INET_PROTOCOL, TYPE_INET_SERVICE), sizeof(uint32_t) * 2,
710                         concat_types2(ip_type, TYPE_INET_SERVICE), ip_type_size + sizeof(uint32_t));
711         if (r < 0)
712                 goto out_unref;
713 
714         msgcnt++;
715         assert(msgcnt < NFT_INIT_MSGS);
716         r = sd_nfnl_message_new_dnat_rule_pre(nfnl, &batch[msgcnt], family, "prerouting");
717         if (r < 0)
718                 goto out_unref;
719 
720         msgcnt++;
721         assert(msgcnt < NFT_INIT_MSGS);
722         r = sd_nfnl_message_new_dnat_rule_out(nfnl, &batch[msgcnt], family, "output");
723         if (r < 0)
724                 goto out_unref;
725 
726         msgcnt++;
727         r = sd_nfnl_message_new_masq_rule(nfnl, &batch[msgcnt], family, "postrouting");
728         if (r < 0)
729                 goto out_unref;
730 
731         msgcnt++;
732         assert(msgcnt < NFT_INIT_MSGS);
733         r = sd_nfnl_message_batch_end(nfnl, &batch[msgcnt]);
734         if (r < 0)
735                 goto out_unref;
736 
737         msgcnt++;
738         assert(msgcnt <= NFT_INIT_MSGS);
739         r = nfnl_netlink_sendv(nfnl, batch, msgcnt);
740         if (r == -EEXIST)
741                 r = 0;
742 
743 out_unref:
744         for (i = 0; i < msgcnt; i++)
745                 sd_netlink_message_unref(batch[i]);
746 
747         return r;
748 }
749 
/* Opens a netfilter netlink socket, sets up the NAT skeleton and stores the socket in ctx->nfnl.
 *
 * Failure to open the socket or to set up IPv4 is returned to the caller. IPv6 setup is only
 * attempted when socket_ipv6_is_supported() says so, and a failure there is merely logged at
 * debug level. Returns 0 once the socket has been handed over to ctx. */
int fw_nftables_init(FirewallContext *ctx) {
        _cleanup_(sd_netlink_unrefp) sd_netlink *nfnl = NULL;
        int r;

        r = sd_nfnl_socket_open(&nfnl);
        if (r >= 0)
                r = fw_nftables_init_family(nfnl, AF_INET);
        if (r < 0)
                return r;

        if (socket_ipv6_is_supported()) {
                int r6;

                r6 = fw_nftables_init_family(nfnl, AF_INET6);
                if (r6 < 0)
                        log_debug_errno(r6, "Failed to init ipv6 NAT: %m");
        }

        ctx->nfnl = TAKE_PTR(nfnl);
        return 0;
}
771 
/* Counterpart of fw_nftables_init(): drops the context's reference to its nfnetlink socket and stores
 * the value returned by sd_netlink_unref() back into ctx->nfnl (presumably NULL, so that no stale
 * pointer is kept -- confirm against sd_netlink_unref()). */
void fw_nftables_exit(FirewallContext *ctx) {
        ctx->nfnl = sd_netlink_unref(ctx->nfnl);
}
775 
/* Appends an IPv4 prefix to set-element message 'm' as two elements:
 *
 *   element 0: the network address of source/prefixlen (in network byte order),
 *   element 1: the first address after the prefix, flagged NFT_SET_ELEM_INTERVAL_END. If that address
 *              does not fit into 32 bits, 0 is used instead.
 *
 * prefixlen must be at most 32 (asserted). Callers are expected to pass a non-zero prefix length: a
 * /0 would require shifting a 32-bit value by 32 bits below.
 *
 * Returns 0 on success, or the negative error of the first netlink helper that failed. */
static int nft_message_add_setelem_iprange(sd_netlink_message *m,
                                           const union in_addr_union *source,
                                           unsigned int prefixlen) {
        uint32_t host_span, net_start, net_end, first, past_last;
        unsigned int host_bits;
        int r;

        assert(prefixlen <= 32);
        host_bits = 32 - prefixlen;

        /* Number of addresses covered by the prefix; host_span - 1 is the host part of the mask. */
        host_span = 1U << host_bits;
        net_start = source->in.s_addr & htobe32(~(host_span - 1U));

        r = sd_nfnl_nft_message_add_setelem(m, 0, &net_start, sizeof(net_start), NULL, 0);
        if (r < 0)
                return r;

        r = sd_nfnl_nft_message_add_setelem_end(m);
        if (r < 0)
                return r;

        /* Do the arithmetic in host byte order, and clamp to 0 if it wrapped around. */
        first = be32toh(net_start);
        past_last = first + host_span;
        if (past_last < first)
                past_last = 0U;
        net_end = htobe32(past_last);

        r = sd_nfnl_nft_message_add_setelem(m, 1, &net_end, sizeof(net_end), NULL, 0);
        if (r < 0)
                return r;

        r = sd_netlink_message_append_u32(m, NFTA_SET_ELEM_FLAGS, htobe32(NFT_SET_ELEM_INTERVAL_END));
        if (r < 0)
                return r;

        r = sd_nfnl_nft_message_add_setelem_end(m);
        return r < 0 ? r : 0;
}
817 
/* IPv6 counterpart of nft_message_add_setelem_iprange(): asks in_addr_prefix_range() for the two
 * boundary addresses of source/prefixlen and appends them to 'm' as element 0 and as element 1, the
 * latter flagged NFT_SET_ELEM_INTERVAL_END.
 *
 * Returns the negative error of the first step that failed; otherwise the result of the final
 * sd_nfnl_nft_message_add_setelem_end() call is passed through unchanged. */
static int nft_message_add_setelem_ip6range(
                sd_netlink_message *m,
                const union in_addr_union *source,
                unsigned int prefixlen) {

        union in_addr_union range_start, range_end;
        int r;

        /* Each step runs only if all previous ones succeeded. */
        r = in_addr_prefix_range(AF_INET6, source, prefixlen, &range_start, &range_end);
        if (r >= 0)
                r = sd_nfnl_nft_message_add_setelem(m, 0, &range_start.in6, sizeof(range_start.in6), NULL, 0);
        if (r >= 0)
                r = sd_nfnl_nft_message_add_setelem_end(m);
        if (r >= 0)
                r = sd_nfnl_nft_message_add_setelem(m, 1, &range_end.in6, sizeof(range_end.in6), NULL, 0);
        if (r >= 0)
                r = sd_netlink_message_append_u32(m, NFTA_SET_ELEM_FLAGS, htobe32(NFT_SET_ELEM_INTERVAL_END));
        if (r < 0)
                return r;

        return sd_nfnl_nft_message_add_setelem_end(m);
}
848 
849 #define NFT_MASQ_MSGS   3
850 
/* Adds ('add' is true) or removes ('add' is false) one source prefix to/from the masquerade set
 * NFT_SYSTEMD_MASQ_SET_NAME in table NFT_SYSTEMD_TABLE_NAME of address family 'af'.
 *
 * The change is sent as one netlink batch of NFT_MASQ_MSGS messages: batch begin, one set-element
 * message carrying the prefix as an interval, batch end.
 *
 * Returns 0 on success, -EINVAL if 'source' is NULL, the prefix length is 0, or an IPv6 prefix is
 * shorter than /8, and otherwise the negative error reported while building or sending the batch.
 *
 * Every message that was successfully created is released again before returning, on all paths. */
static int fw_nftables_add_masquerade_internal(
                FirewallContext *ctx,
                bool add,
                int af,
                const union in_addr_union *source,
                unsigned int source_prefixlen) {

        sd_netlink_message *transaction[NFT_MASQ_MSGS] = {};
        sd_netlink_message *setelems;
        size_t tsize = 0; /* number of messages in 'transaction' that exist and must be released */
        int r;

        if (!source || source_prefixlen == 0)
                return -EINVAL;

        if (af == AF_INET6 && source_prefixlen < 8)
                return -EINVAL;

        r = sd_nfnl_message_batch_begin(ctx->nfnl, &transaction[tsize]);
        if (r < 0)
                return r;
        tsize++;

        if (add)
                r = sd_nfnl_nft_message_new_setelems_begin(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_MASQ_SET_NAME);
        else
                r = sd_nfnl_nft_message_del_setelems_begin(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_MASQ_SET_NAME);
        if (r < 0)
                goto out_unref;

        /* Count the message as soon as it exists, so that a failure while filling it in still
         * releases it in the cleanup below. */
        setelems = transaction[tsize++];

        if (af == AF_INET)
                r = nft_message_add_setelem_iprange(setelems, source, source_prefixlen);
        else
                r = nft_message_add_setelem_ip6range(setelems, source, source_prefixlen);
        if (r < 0)
                goto out_unref;

        assert(tsize < NFT_MASQ_MSGS);
        r = sd_nfnl_message_batch_end(ctx->nfnl, &transaction[tsize]);
        if (r < 0)
                goto out_unref; /* must not return directly: the earlier messages need releasing */
        tsize++;

        r = nfnl_netlink_sendv(ctx->nfnl, transaction, tsize);

out_unref:
        while (tsize > 0)
                sd_netlink_message_unref(transaction[--tsize]);
        return r < 0 ? r : 0;
}
900 
/* Public entry point for adding or removing a masquerade source prefix.
 *
 * Returns -EOPNOTSUPP for AF_INET6 when socket_ipv6_is_supported() reports no IPv6 support. Otherwise
 * the request is passed to fw_nftables_add_masquerade_internal(); if that reports -ENOENT, the table
 * for 'af' is set up again and the request is retried once. The result of the last step performed is
 * returned. */
int fw_nftables_add_masquerade(
                FirewallContext *ctx,
                bool add,
                int af,
                const union in_addr_union *source,
                unsigned int source_prefixlen) {

        int r;

        if (!socket_ipv6_is_supported() && af == AF_INET6)
                return -EOPNOTSUPP;

        r = fw_nftables_add_masquerade_internal(ctx, add, af, source, source_prefixlen);
        if (r == -ENOENT) {
                /* Running 'nft flush ruleset' in this network namespace also tears down the systemd
                 * NAT table.
                 *
                 * This differs from 'iptables -t nat -F': that removes all rules added by the systemd
                 * iptables backend as well, but iptables has builtin chains that cannot be deleted,
                 * so the next add operation 'just works'.
                 *
                 * With nftables everything gets removed, and the next operation yields -ENOENT. When
                 * we see -ENOENT, replay the initial table setup and, if that works, redo the
                 * operation.
                 *
                 * Note that this doesn't protect against external sabotage such as a
                 * 'while true; nft flush ruleset; done'. Nothing can be done about that short of
                 * extending the kernel so that tables can be owned by systemd-networkd and cannot be
                 * deleted except by the owning process. */
                r = fw_nftables_init_family(ctx->nfnl, af);
                if (r >= 0)
                        r = fw_nftables_add_masquerade_internal(ctx, add, af, source, source_prefixlen);
        }

        return r;
}
940 
941 #define NFT_DNAT_MSGS   4
942 
/* Adds ('add' is true) or removes ('add' is false) one entry of the map NFT_SYSTEMD_DNAT_MAP_NAME for
 * address family 'af'.
 *
 * The map key is built from 'protocol' and 'local_port', the map value from the 'remote' address and
 * 'remote_port'. When adding and 'previous_remote' is given and differs from 'remote', the entry for
 * the previous address is deleted in the same batch. 'previous_remote' must be NULL when removing
 * (asserted).
 *
 * Returns 0 on success, or a negative error:
 *   -EOPNOTSUPP       'remote' is NULL, or AF_INET6 was requested after the kernel rejected IPv6 DNAT
 *   -EPROTONOSUPPORT  'protocol' is neither IPPROTO_TCP nor IPPROTO_UDP
 *   -EINVAL           'local_port' or 'remote_port' is 0
 *   anything else     reported while building or sending the netlink batch */
static int fw_nftables_add_local_dnat_internal(
                FirewallContext *ctx,
                bool add,
                int af,
                int protocol,
                uint16_t local_port,
                const union in_addr_union *remote,
                uint16_t remote_port,
                const union in_addr_union *previous_remote) {

        /* Sticky across calls: cleared once the kernel answers an IPv6 request with -EOVERFLOW. */
        static bool ipv6_supported = true;
        sd_netlink_message *transaction[NFT_DNAT_MSGS] = {};
        uint32_t key[2], data[5], dlen;
        size_t n_msgs;
        int r;

        assert(add || !previous_remote);

        if (af == AF_INET6 && !ipv6_supported)
                return -EOPNOTSUPP;

        if (!IN_SET(protocol, IPPROTO_TCP, IPPROTO_UDP))
                return -EPROTONOSUPPORT;

        if (local_port == 0)
                return -EINVAL;

        if (!remote)
                return -EOPNOTSUPP;

        if (remote_port == 0)
                return -EINVAL;

        key[0] = protocol;
        key[1] = htobe16(local_port);

        /* The value is the address followed by one 32-bit word holding the port: two words for IPv4,
         * the whole array for IPv6. The address words are filled in further down. */
        if (af == AF_INET) {
                dlen = 2 * sizeof(uint32_t);
                data[1] = htobe16(remote_port);
        } else {
                assert(af == AF_INET6);
                dlen = sizeof(data);
                data[4] = htobe16(remote_port);
        }

        r = sd_nfnl_message_batch_begin(ctx->nfnl, &transaction[0]);
        if (r < 0)
                return r;
        n_msgs = 1;

        /* Replacing an entry: first queue the removal of the one pointing at the previous address. */
        if (add && previous_remote && !in_addr_equal(af, previous_remote, remote)) {
                if (af == AF_INET)
                        data[0] = previous_remote->in.s_addr;
                else
                        memcpy(data, &previous_remote->in6, sizeof(previous_remote->in6));

                r = nft_del_element(ctx->nfnl, &transaction[n_msgs], af, NFT_SYSTEMD_DNAT_MAP_NAME, key, sizeof(key), data, dlen);
                if (r < 0)
                        goto out_unref;

                n_msgs++;
        }

        /* Now (re)fill the address part with the address this call is actually about. */
        if (af == AF_INET)
                data[0] = remote->in.s_addr;
        else
                memcpy(data, &remote->in6, sizeof(remote->in6));

        assert(n_msgs < NFT_DNAT_MSGS);
        if (add)
                r = nft_add_element(ctx->nfnl, &transaction[n_msgs], af, NFT_SYSTEMD_DNAT_MAP_NAME, key, sizeof(key), data, dlen);
        else
                r = nft_del_element(ctx->nfnl, &transaction[n_msgs], af, NFT_SYSTEMD_DNAT_MAP_NAME, key, sizeof(key), data, dlen);
        if (r < 0)
                goto out_unref;

        n_msgs++;
        assert(n_msgs < NFT_DNAT_MSGS);

        r = sd_nfnl_message_batch_end(ctx->nfnl, &transaction[n_msgs]);
        if (r < 0)
                goto out_unref;

        n_msgs++;
        assert(n_msgs <= NFT_DNAT_MSGS);

        r = nfnl_netlink_sendv(ctx->nfnl, transaction, n_msgs);
        if (af == AF_INET6 && r == -EOVERFLOW) {
                /* IPv6 DNAT as implemented here needs kernel commit
                 * fdb9c405e35bdc6e305b9b4e20ebc141ed14fc81 (v5.8); older kernels answer with
                 * -EOVERFLOW. Report that as -EOPNOTSUPP and do not try IPv6 again. */
                log_debug_errno(r, "The current implementation of IPv6 DNAT in systemd requires kernel 5.8 or newer, ignoring: %m");
                ipv6_supported = false;
                r = -EOPNOTSUPP;
        }

out_unref:
        /* Release the messages created so far, newest first. */
        for (; n_msgs > 0; n_msgs--)
                sd_netlink_message_unref(transaction[n_msgs - 1]);

        return r < 0 ? r : 0;
}
1046 
/* Public entry point for adding or removing a local DNAT entry.
 *
 * Returns -EOPNOTSUPP for AF_INET6 when socket_ipv6_is_supported() reports no IPv6 support. Otherwise
 * the request is passed to fw_nftables_add_local_dnat_internal(); if that reports -ENOENT, the table
 * for 'af' is set up again and the request is retried once, this time without a previous remote
 * address. The result of the last step performed is returned. */
int fw_nftables_add_local_dnat(
                FirewallContext *ctx,
                bool add,
                int af,
                int protocol,
                uint16_t local_port,
                const union in_addr_union *remote,
                uint16_t remote_port,
                const union in_addr_union *previous_remote) {

        int r;

        if (!socket_ipv6_is_supported() && af == AF_INET6)
                return -EOPNOTSUPP;

        r = fw_nftables_add_local_dnat_internal(ctx, add, af, protocol, local_port, remote, remote_port, previous_remote);
        if (r == -ENOENT) {
                /* See comment in fw_nftables_add_masquerade(). */
                r = fw_nftables_init_family(ctx->nfnl, af);
                if (r >= 0) {
                        /* The table was created anew, so the entry for the previous address is
                         * already gone and must not be deleted again. */
                        r = fw_nftables_add_local_dnat_internal(ctx, add, af, protocol, local_port, remote, remote_port, NULL);
                }
        }

        return r;
}
1074