1 // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
2
3 /*
4 * AF_XDP user-space access library.
5 *
6 * Copyright(c) 2018 - 2019 Intel Corporation.
7 *
8 * Author(s): Magnus Karlsson <magnus.karlsson@intel.com>
9 */
10
#include <errno.h>
#include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <unistd.h>
15 #include <arpa/inet.h>
16 #include <asm/barrier.h>
17 #include <linux/compiler.h>
18 #include <linux/ethtool.h>
19 #include <linux/filter.h>
20 #include <linux/if_ether.h>
21 #include <linux/if_packet.h>
22 #include <linux/if_xdp.h>
23 #include <linux/kernel.h>
24 #include <linux/list.h>
25 #include <linux/sockios.h>
26 #include <net/if.h>
27 #include <sys/ioctl.h>
28 #include <sys/mman.h>
29 #include <sys/socket.h>
30 #include <sys/types.h>
31 #include <linux/if_link.h>
32
33 #include <bpf/bpf.h>
34 #include <bpf/libbpf.h>
35 #include "xsk.h"
36
37 #ifndef SOL_XDP
38 #define SOL_XDP 283
39 #endif
40
41 #ifndef AF_XDP
42 #define AF_XDP 44
43 #endif
44
45 #ifndef PF_XDP
46 #define PF_XDP AF_XDP
47 #endif
48
49 #define pr_warn(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__)
50
51 enum xsk_prog {
52 XSK_PROG_FALLBACK,
53 XSK_PROG_REDIRECT_FLAGS,
54 };
55
56 struct xsk_umem {
57 struct xsk_ring_prod *fill_save;
58 struct xsk_ring_cons *comp_save;
59 char *umem_area;
60 struct xsk_umem_config config;
61 int fd;
62 int refcount;
63 struct list_head ctx_list;
64 bool rx_ring_setup_done;
65 bool tx_ring_setup_done;
66 };
67
68 struct xsk_ctx {
69 struct xsk_ring_prod *fill;
70 struct xsk_ring_cons *comp;
71 __u32 queue_id;
72 struct xsk_umem *umem;
73 int refcount;
74 int ifindex;
75 struct list_head list;
76 int prog_fd;
77 int link_fd;
78 int xsks_map_fd;
79 char ifname[IFNAMSIZ];
80 bool has_bpf_link;
81 };
82
83 struct xsk_socket {
84 struct xsk_ring_cons *rx;
85 struct xsk_ring_prod *tx;
86 __u64 outstanding_tx;
87 struct xsk_ctx *ctx;
88 struct xsk_socket_config config;
89 int fd;
90 };
91
92 struct xsk_nl_info {
93 bool xdp_prog_attached;
94 int ifindex;
95 int fd;
96 };
97
98 /* Up until and including Linux 5.3 */
99 struct xdp_ring_offset_v1 {
100 __u64 producer;
101 __u64 consumer;
102 __u64 desc;
103 };
104
105 /* Up until and including Linux 5.3 */
106 struct xdp_mmap_offsets_v1 {
107 struct xdp_ring_offset_v1 rx;
108 struct xdp_ring_offset_v1 tx;
109 struct xdp_ring_offset_v1 fr;
110 struct xdp_ring_offset_v1 cr;
111 };
112
int xsk_umem__fd(const struct xsk_umem *umem)
114 {
115 return umem ? umem->fd : -EINVAL;
116 }
117
int xsk_socket__fd(const struct xsk_socket *xsk)
119 {
120 return xsk ? xsk->fd : -EINVAL;
121 }
122
static bool xsk_page_aligned(void *buffer)
124 {
125 unsigned long addr = (unsigned long)buffer;
126
127 return !(addr & (getpagesize() - 1));
128 }
129
static void xsk_set_umem_config(struct xsk_umem_config *cfg,
				const struct xsk_umem_config *usr_cfg)
132 {
133 if (!usr_cfg) {
134 cfg->fill_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
135 cfg->comp_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
136 cfg->frame_size = XSK_UMEM__DEFAULT_FRAME_SIZE;
137 cfg->frame_headroom = XSK_UMEM__DEFAULT_FRAME_HEADROOM;
138 cfg->flags = XSK_UMEM__DEFAULT_FLAGS;
139 return;
140 }
141
142 cfg->fill_size = usr_cfg->fill_size;
143 cfg->comp_size = usr_cfg->comp_size;
144 cfg->frame_size = usr_cfg->frame_size;
145 cfg->frame_headroom = usr_cfg->frame_headroom;
146 cfg->flags = usr_cfg->flags;
147 }
148
static int xsk_set_xdp_socket_config(struct xsk_socket_config *cfg,
				     const struct xsk_socket_config *usr_cfg)
151 {
152 if (!usr_cfg) {
153 cfg->rx_size = XSK_RING_CONS__DEFAULT_NUM_DESCS;
154 cfg->tx_size = XSK_RING_PROD__DEFAULT_NUM_DESCS;
155 cfg->libbpf_flags = 0;
156 cfg->xdp_flags = 0;
157 cfg->bind_flags = 0;
158 return 0;
159 }
160
161 if (usr_cfg->libbpf_flags & ~XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)
162 return -EINVAL;
163
164 cfg->rx_size = usr_cfg->rx_size;
165 cfg->tx_size = usr_cfg->tx_size;
166 cfg->libbpf_flags = usr_cfg->libbpf_flags;
167 cfg->xdp_flags = usr_cfg->xdp_flags;
168 cfg->bind_flags = usr_cfg->bind_flags;
169
170 return 0;
171 }
172
static void xsk_mmap_offsets_v1(struct xdp_mmap_offsets *off)
174 {
175 struct xdp_mmap_offsets_v1 off_v1;
176
177 /* getsockopt on a kernel <= 5.3 has no flags fields.
178 * Copy over the offsets to the correct places in the >=5.4 format
179 * and put the flags where they would have been on that kernel.
180 */
181 memcpy(&off_v1, off, sizeof(off_v1));
182
183 off->rx.producer = off_v1.rx.producer;
184 off->rx.consumer = off_v1.rx.consumer;
185 off->rx.desc = off_v1.rx.desc;
186 off->rx.flags = off_v1.rx.consumer + sizeof(__u32);
187
188 off->tx.producer = off_v1.tx.producer;
189 off->tx.consumer = off_v1.tx.consumer;
190 off->tx.desc = off_v1.tx.desc;
191 off->tx.flags = off_v1.tx.consumer + sizeof(__u32);
192
193 off->fr.producer = off_v1.fr.producer;
194 off->fr.consumer = off_v1.fr.consumer;
195 off->fr.desc = off_v1.fr.desc;
196 off->fr.flags = off_v1.fr.consumer + sizeof(__u32);
197
198 off->cr.producer = off_v1.cr.producer;
199 off->cr.consumer = off_v1.cr.consumer;
200 off->cr.desc = off_v1.cr.desc;
201 off->cr.flags = off_v1.cr.consumer + sizeof(__u32);
202 }
203
static int xsk_get_mmap_offsets(int fd, struct xdp_mmap_offsets *off)
205 {
206 socklen_t optlen;
207 int err;
208
209 optlen = sizeof(*off);
210 err = getsockopt(fd, SOL_XDP, XDP_MMAP_OFFSETS, off, &optlen);
211 if (err)
212 return err;
213
214 if (optlen == sizeof(*off))
215 return 0;
216
217 if (optlen == sizeof(struct xdp_mmap_offsets_v1)) {
218 xsk_mmap_offsets_v1(off);
219 return 0;
220 }
221
222 return -EINVAL;
223 }
224
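/* Create and mmap the fill and completion rings of a umem on the given
 * socket fd. Note that the fill ring's cached_cons is primed with the ring
 * size so that producer-side free-entry accounting works from the start.
 */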
static int xsk_create_umem_rings(struct xsk_umem *umem, int fd,
				 struct xsk_ring_prod *fill,
				 struct xsk_ring_cons *comp)
228 {
229 struct xdp_mmap_offsets off;
230 void *map;
231 int err;
232
233 err = setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING,
234 &umem->config.fill_size,
235 sizeof(umem->config.fill_size));
236 if (err)
237 return -errno;
238
239 err = setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING,
240 &umem->config.comp_size,
241 sizeof(umem->config.comp_size));
242 if (err)
243 return -errno;
244
245 err = xsk_get_mmap_offsets(fd, &off);
246 if (err)
247 return -errno;
248
249 map = mmap(NULL, off.fr.desc + umem->config.fill_size * sizeof(__u64),
250 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
251 XDP_UMEM_PGOFF_FILL_RING);
252 if (map == MAP_FAILED)
253 return -errno;
254
255 fill->mask = umem->config.fill_size - 1;
256 fill->size = umem->config.fill_size;
257 fill->producer = map + off.fr.producer;
258 fill->consumer = map + off.fr.consumer;
259 fill->flags = map + off.fr.flags;
260 fill->ring = map + off.fr.desc;
261 fill->cached_cons = umem->config.fill_size;
262
263 map = mmap(NULL, off.cr.desc + umem->config.comp_size * sizeof(__u64),
264 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
265 XDP_UMEM_PGOFF_COMPLETION_RING);
266 if (map == MAP_FAILED) {
267 err = -errno;
268 goto out_mmap;
269 }
270
271 comp->mask = umem->config.comp_size - 1;
272 comp->size = umem->config.comp_size;
273 comp->producer = map + off.cr.producer;
274 comp->consumer = map + off.cr.consumer;
275 comp->flags = map + off.cr.flags;
276 comp->ring = map + off.cr.desc;
277
278 return 0;
279
280 out_mmap:
281 munmap(map, off.fr.desc + umem->config.fill_size * sizeof(__u64));
282 return err;
283 }
284
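/* Register a umem area with the kernel and set up its fill and completion
 * rings. Illustrative usage sketch (not part of this file; the variable
 * names are made up for the example):
 *
 *	struct xsk_ring_prod fill;
 *	struct xsk_ring_cons comp;
 *	struct xsk_umem *umem;
 *	__u64 size = XSK_RING_PROD__DEFAULT_NUM_DESCS * 2 *
 *		     XSK_UMEM__DEFAULT_FRAME_SIZE;
 *	void *bufs;
 *	int err;
 *
 *	// The buffer area must be page aligned, e.g. anonymous mmap() memory.
 *	bufs = mmap(NULL, size, PROT_READ | PROT_WRITE,
 *		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	err = xsk_umem__create(&umem, bufs, size, &fill, &comp, NULL);
 *	// err is 0 on success or a negative errno on failure
 */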
int xsk_umem__create(struct xsk_umem **umem_ptr, void *umem_area,
		     __u64 size, struct xsk_ring_prod *fill,
		     struct xsk_ring_cons *comp,
		     const struct xsk_umem_config *usr_config)
289 {
290 struct xdp_umem_reg mr;
291 struct xsk_umem *umem;
292 int err;
293
294 if (!umem_area || !umem_ptr || !fill || !comp)
295 return -EFAULT;
296 if (!size && !xsk_page_aligned(umem_area))
297 return -EINVAL;
298
299 umem = calloc(1, sizeof(*umem));
300 if (!umem)
301 return -ENOMEM;
302
303 umem->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0);
304 if (umem->fd < 0) {
305 err = -errno;
306 goto out_umem_alloc;
307 }
308
309 umem->umem_area = umem_area;
310 INIT_LIST_HEAD(&umem->ctx_list);
311 xsk_set_umem_config(&umem->config, usr_config);
312
313 memset(&mr, 0, sizeof(mr));
314 mr.addr = (uintptr_t)umem_area;
315 mr.len = size;
316 mr.chunk_size = umem->config.frame_size;
317 mr.headroom = umem->config.frame_headroom;
318 mr.flags = umem->config.flags;
319
320 err = setsockopt(umem->fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
321 if (err) {
322 err = -errno;
323 goto out_socket;
324 }
325
326 err = xsk_create_umem_rings(umem, umem->fd, fill, comp);
327 if (err)
328 goto out_socket;
329
330 umem->fill_save = fill;
331 umem->comp_save = comp;
332 *umem_ptr = umem;
333 return 0;
334
335 out_socket:
336 close(umem->fd);
337 out_umem_alloc:
338 free(umem);
339 return err;
340 }
341
342 struct xsk_umem_config_v1 {
343 __u32 fill_size;
344 __u32 comp_size;
345 __u32 frame_size;
346 __u32 frame_headroom;
347 };
348
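/* Probe whether bpf_redirect_map() honours a default action passed in its
 * flags argument (kernel 5.3 and later). A minimal XDP program that calls
 * bpf_redirect_map() with XDP_PASS in flags against an empty XSKMAP is
 * test-run: if the lookup miss falls back to the flags value and XDP_PASS
 * comes back, the shorter redirect-flags program below can be used.
 */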
static enum xsk_prog get_xsk_prog(void)
350 {
351 enum xsk_prog detected = XSK_PROG_FALLBACK;
352 char data_in = 0, data_out;
353 struct bpf_insn insns[] = {
354 BPF_LD_MAP_FD(BPF_REG_1, 0),
355 BPF_MOV64_IMM(BPF_REG_2, 0),
356 BPF_MOV64_IMM(BPF_REG_3, XDP_PASS),
357 BPF_EMIT_CALL(BPF_FUNC_redirect_map),
358 BPF_EXIT_INSN(),
359 };
360 LIBBPF_OPTS(bpf_test_run_opts, opts,
361 .data_in = &data_in,
362 .data_size_in = 1,
363 .data_out = &data_out,
364 );
365
366 int prog_fd, map_fd, ret, insn_cnt = ARRAY_SIZE(insns);
367
368 map_fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, NULL, sizeof(int), sizeof(int), 1, NULL);
369 if (map_fd < 0)
370 return detected;
371
372 insns[0].imm = map_fd;
373
374 prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL);
375 if (prog_fd < 0) {
376 close(map_fd);
377 return detected;
378 }
379
380 ret = bpf_prog_test_run_opts(prog_fd, &opts);
381 if (!ret && opts.retval == XDP_PASS)
382 detected = XSK_PROG_REDIRECT_FLAGS;
383 close(prog_fd);
384 close(map_fd);
385 return detected;
386 }
387
static int xsk_load_xdp_prog(struct xsk_socket *xsk)
389 {
390 static const int log_buf_size = 16 * 1024;
391 struct xsk_ctx *ctx = xsk->ctx;
392 char log_buf[log_buf_size];
393 int prog_fd;
394
/* This is the fallback C-program:
 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
 * {
 *	int ret, index = ctx->rx_queue_index;
 *
 *	// A set entry here means that the corresponding queue_id
 *	// has an active AF_XDP socket bound to it.
 *	ret = bpf_redirect_map(&xsks_map, index, XDP_PASS);
 *	if (ret > 0)
 *		return ret;
 *
 *	// Fallback for pre-5.3 kernels, not supporting default
 *	// action in the flags parameter.
 *	if (bpf_map_lookup_elem(&xsks_map, &index))
 *		return bpf_redirect_map(&xsks_map, index, 0);
 *	return XDP_PASS;
 * }
 */
413 struct bpf_insn prog[] = {
414 /* r2 = *(u32 *)(r1 + 16) */
415 BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
416 /* *(u32 *)(r10 - 4) = r2 */
417 BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -4),
418 /* r1 = xskmap[] */
419 BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
420 /* r3 = XDP_PASS */
421 BPF_MOV64_IMM(BPF_REG_3, 2),
422 /* call bpf_redirect_map */
423 BPF_EMIT_CALL(BPF_FUNC_redirect_map),
/* if w0 > 0 goto pc+13 */
BPF_JMP32_IMM(BPF_JSGT, BPF_REG_0, 0, 13),
426 /* r2 = r10 */
427 BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
428 /* r2 += -4 */
429 BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),
430 /* r1 = xskmap[] */
431 BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
432 /* call bpf_map_lookup_elem */
433 BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem),
434 /* r1 = r0 */
435 BPF_MOV64_REG(BPF_REG_1, BPF_REG_0),
436 /* r0 = XDP_PASS */
437 BPF_MOV64_IMM(BPF_REG_0, 2),
438 /* if r1 == 0 goto pc+5 */
439 BPF_JMP_IMM(BPF_JEQ, BPF_REG_1, 0, 5),
440 /* r2 = *(u32 *)(r10 - 4) */
441 BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_10, -4),
442 /* r1 = xskmap[] */
443 BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
444 /* r3 = 0 */
445 BPF_MOV64_IMM(BPF_REG_3, 0),
446 /* call bpf_redirect_map */
447 BPF_EMIT_CALL(BPF_FUNC_redirect_map),
448 /* The jumps are to this instruction */
449 BPF_EXIT_INSN(),
450 };
451
452 /* This is the post-5.3 kernel C-program:
453 * SEC("xdp_sock") int xdp_sock_prog(struct xdp_md *ctx)
454 * {
455 * return bpf_redirect_map(&xsks_map, ctx->rx_queue_index, XDP_PASS);
456 * }
457 */
458 struct bpf_insn prog_redirect_flags[] = {
459 /* r2 = *(u32 *)(r1 + 16) */
460 BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 16),
461 /* r1 = xskmap[] */
462 BPF_LD_MAP_FD(BPF_REG_1, ctx->xsks_map_fd),
463 /* r3 = XDP_PASS */
464 BPF_MOV64_IMM(BPF_REG_3, 2),
465 /* call bpf_redirect_map */
466 BPF_EMIT_CALL(BPF_FUNC_redirect_map),
467 BPF_EXIT_INSN(),
468 };
469 size_t insns_cnt[] = {ARRAY_SIZE(prog),
470 ARRAY_SIZE(prog_redirect_flags),
471 };
472 struct bpf_insn *progs[] = {prog, prog_redirect_flags};
473 enum xsk_prog option = get_xsk_prog();
474 LIBBPF_OPTS(bpf_prog_load_opts, opts,
475 .log_buf = log_buf,
476 .log_size = log_buf_size,
477 );
478
479 prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "LGPL-2.1 or BSD-2-Clause",
480 progs[option], insns_cnt[option], &opts);
481 if (prog_fd < 0) {
482 pr_warn("BPF log buffer:\n%s", log_buf);
483 return prog_fd;
484 }
485
486 ctx->prog_fd = prog_fd;
487 return 0;
488 }
489
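/* Attach the loaded XDP program to the interface via a bpf_link. A program
 * that was attached the legacy netlink way cannot be taken over here, so
 * bail out and ask the user to detach it first.
 */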
static int xsk_create_bpf_link(struct xsk_socket *xsk)
491 {
492 DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts);
493 struct xsk_ctx *ctx = xsk->ctx;
494 __u32 prog_id = 0;
495 int link_fd;
496 int err;
497
498 err = bpf_xdp_query_id(ctx->ifindex, xsk->config.xdp_flags, &prog_id);
499 if (err) {
500 pr_warn("getting XDP prog id failed\n");
501 return err;
502 }
503
/* If there is a netlink-based XDP prog loaded on the interface, bail out
 * and ask the user to remove it first.
 */
507 if (prog_id) {
508 pr_warn("Netlink-based XDP prog detected, please unload it in order to launch AF_XDP prog\n");
509 return -EINVAL;
510 }
511
512 opts.flags = xsk->config.xdp_flags & ~(XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_REPLACE);
513
514 link_fd = bpf_link_create(ctx->prog_fd, ctx->ifindex, BPF_XDP, &opts);
515 if (link_fd < 0) {
516 pr_warn("bpf_link_create failed: %s\n", strerror(errno));
517 return link_fd;
518 }
519
520 ctx->link_fd = link_fd;
521 return 0;
522 }
523
/* Copy up to sz - 1 bytes from the zero-terminated string src and ensure
 * that dst is zero-terminated no matter what (unless sz == 0, in which case
 * it is a no-op). It is conceptually close to FreeBSD's strlcpy(), but
 * differs in what is returned. Given that this is an internal helper, it is
 * trivial to extend when necessary. Use this instead of strncpy inside
 * libbpf source code.
 */
static inline void libbpf_strlcpy(char *dst, const char *src, size_t sz)
531 {
532 size_t i;
533
534 if (sz == 0)
535 return;
536
537 sz--;
538 for (i = 0; i < sz && src[i]; i++)
539 dst[i] = src[i];
540 dst[i] = '\0';
541 }
542
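/* Query the maximum number of queues on the interface with the
 * ETHTOOL_GCHANNELS ioctl. Drivers report channel counts in different ways,
 * so the max of rx, tx and combined is used; devices that do not support the
 * ioctl are treated as single-queue.
 */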
static int xsk_get_max_queues(struct xsk_socket *xsk)
544 {
545 struct ethtool_channels channels = { .cmd = ETHTOOL_GCHANNELS };
546 struct xsk_ctx *ctx = xsk->ctx;
547 struct ifreq ifr = {};
548 int fd, err, ret;
549
550 fd = socket(AF_LOCAL, SOCK_DGRAM | SOCK_CLOEXEC, 0);
551 if (fd < 0)
552 return -errno;
553
554 ifr.ifr_data = (void *)&channels;
555 libbpf_strlcpy(ifr.ifr_name, ctx->ifname, IFNAMSIZ);
556 err = ioctl(fd, SIOCETHTOOL, &ifr);
557 if (err && errno != EOPNOTSUPP) {
558 ret = -errno;
559 goto out;
560 }
561
562 if (err) {
563 /* If the device says it has no channels, then all traffic
564 * is sent to a single stream, so max queues = 1.
565 */
566 ret = 1;
567 } else {
568 /* Take the max of rx, tx, combined. Drivers return
569 * the number of channels in different ways.
570 */
571 ret = max(channels.max_rx, channels.max_tx);
572 ret = max(ret, (int)channels.max_combined);
573 }
574
575 out:
576 close(fd);
577 return ret;
578 }
579
static int xsk_create_bpf_maps(struct xsk_socket *xsk)
581 {
582 struct xsk_ctx *ctx = xsk->ctx;
583 int max_queues;
584 int fd;
585
586 max_queues = xsk_get_max_queues(xsk);
587 if (max_queues < 0)
588 return max_queues;
589
590 fd = bpf_map_create(BPF_MAP_TYPE_XSKMAP, "xsks_map",
591 sizeof(int), sizeof(int), max_queues, NULL);
592 if (fd < 0)
593 return fd;
594
595 ctx->xsks_map_fd = fd;
596
597 return 0;
598 }
599
static void xsk_delete_bpf_maps(struct xsk_socket *xsk)
601 {
602 struct xsk_ctx *ctx = xsk->ctx;
603
604 bpf_map_delete_elem(ctx->xsks_map_fd, &ctx->queue_id);
605 close(ctx->xsks_map_fd);
606 }
607
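/* Walk the maps referenced by the already attached XDP program and grab an
 * fd to the one named "xsks_map". Returns -ENOENT if the program does not
 * reference such a map.
 */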
static int xsk_lookup_bpf_maps(struct xsk_socket *xsk)
609 {
610 __u32 i, *map_ids, num_maps, prog_len = sizeof(struct bpf_prog_info);
611 __u32 map_len = sizeof(struct bpf_map_info);
612 struct bpf_prog_info prog_info = {};
613 struct xsk_ctx *ctx = xsk->ctx;
614 struct bpf_map_info map_info;
615 int fd, err;
616
617 err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
618 if (err)
619 return err;
620
621 num_maps = prog_info.nr_map_ids;
622
623 map_ids = calloc(prog_info.nr_map_ids, sizeof(*map_ids));
624 if (!map_ids)
625 return -ENOMEM;
626
627 memset(&prog_info, 0, prog_len);
628 prog_info.nr_map_ids = num_maps;
629 prog_info.map_ids = (__u64)(unsigned long)map_ids;
630
631 err = bpf_obj_get_info_by_fd(ctx->prog_fd, &prog_info, &prog_len);
632 if (err)
633 goto out_map_ids;
634
635 ctx->xsks_map_fd = -1;
636
637 for (i = 0; i < prog_info.nr_map_ids; i++) {
638 fd = bpf_map_get_fd_by_id(map_ids[i]);
639 if (fd < 0)
640 continue;
641
642 memset(&map_info, 0, map_len);
643 err = bpf_obj_get_info_by_fd(fd, &map_info, &map_len);
644 if (err) {
645 close(fd);
646 continue;
647 }
648
649 if (!strncmp(map_info.name, "xsks_map", sizeof(map_info.name))) {
650 ctx->xsks_map_fd = fd;
651 break;
652 }
653
654 close(fd);
655 }
656
657 if (ctx->xsks_map_fd == -1)
658 err = -ENOENT;
659
660 out_map_ids:
661 free(map_ids);
662 return err;
663 }
664
static int xsk_set_bpf_maps(struct xsk_socket *xsk)
666 {
667 struct xsk_ctx *ctx = xsk->ctx;
668
669 return bpf_map_update_elem(ctx->xsks_map_fd, &ctx->queue_id,
670 &xsk->fd, 0);
671 }
672
static int xsk_link_lookup(int ifindex, __u32 *prog_id, int *link_fd)
674 {
675 struct bpf_link_info link_info;
676 __u32 link_len;
677 __u32 id = 0;
678 int err;
679 int fd;
680
681 while (true) {
682 err = bpf_link_get_next_id(id, &id);
683 if (err) {
684 if (errno == ENOENT) {
685 err = 0;
686 break;
687 }
688 pr_warn("can't get next link: %s\n", strerror(errno));
689 break;
690 }
691
692 fd = bpf_link_get_fd_by_id(id);
693 if (fd < 0) {
694 if (errno == ENOENT)
695 continue;
696 pr_warn("can't get link by id (%u): %s\n", id, strerror(errno));
697 err = -errno;
698 break;
699 }
700
701 link_len = sizeof(struct bpf_link_info);
702 memset(&link_info, 0, link_len);
703 err = bpf_obj_get_info_by_fd(fd, &link_info, &link_len);
704 if (err) {
705 pr_warn("can't get link info: %s\n", strerror(errno));
706 close(fd);
707 break;
708 }
709 if (link_info.type == BPF_LINK_TYPE_XDP) {
710 if (link_info.xdp.ifindex == ifindex) {
711 *link_fd = fd;
712 if (prog_id)
713 *prog_id = link_info.prog_id;
714 break;
715 }
716 }
717 close(fd);
718 }
719
720 return err;
721 }
722
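/* Probe whether the kernel supports attaching XDP programs through bpf_link.
 * If an XDP link already exists on loopback the answer is yes; otherwise try
 * to create one with a trivial XDP_PASS program and close it again.
 */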
static bool xsk_probe_bpf_link(void)
724 {
725 LIBBPF_OPTS(bpf_link_create_opts, opts, .flags = XDP_FLAGS_SKB_MODE);
726 struct bpf_insn insns[2] = {
727 BPF_MOV64_IMM(BPF_REG_0, XDP_PASS),
728 BPF_EXIT_INSN()
729 };
730 int prog_fd, link_fd = -1, insn_cnt = ARRAY_SIZE(insns);
731 int ifindex_lo = 1;
732 bool ret = false;
733 int err;
734
735 err = xsk_link_lookup(ifindex_lo, NULL, &link_fd);
736 if (err)
737 return ret;
738
739 if (link_fd >= 0)
740 return true;
741
742 prog_fd = bpf_prog_load(BPF_PROG_TYPE_XDP, NULL, "GPL", insns, insn_cnt, NULL);
743 if (prog_fd < 0)
744 return ret;
745
746 link_fd = bpf_link_create(prog_fd, ifindex_lo, BPF_XDP, &opts);
747 close(prog_fd);
748
749 if (link_fd >= 0) {
750 ret = true;
751 close(link_fd);
752 }
753
754 return ret;
755 }
756
static int xsk_create_xsk_struct(int ifindex, struct xsk_socket *xsk)
758 {
759 char ifname[IFNAMSIZ];
760 struct xsk_ctx *ctx;
761 char *interface;
762
763 ctx = calloc(1, sizeof(*ctx));
764 if (!ctx)
765 return -ENOMEM;
766
767 interface = if_indextoname(ifindex, &ifname[0]);
768 if (!interface) {
769 free(ctx);
770 return -errno;
771 }
772
773 ctx->ifindex = ifindex;
774 libbpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ);
775
776 xsk->ctx = ctx;
777 xsk->ctx->has_bpf_link = xsk_probe_bpf_link();
778
779 return 0;
780 }
781
static int xsk_init_xdp_res(struct xsk_socket *xsk,
			    int *xsks_map_fd)
784 {
785 struct xsk_ctx *ctx = xsk->ctx;
786 int err;
787
788 err = xsk_create_bpf_maps(xsk);
789 if (err)
790 return err;
791
792 err = xsk_load_xdp_prog(xsk);
793 if (err)
794 goto err_load_xdp_prog;
795
796 if (ctx->has_bpf_link)
797 err = xsk_create_bpf_link(xsk);
798 else
799 err = bpf_xdp_attach(xsk->ctx->ifindex, ctx->prog_fd,
800 xsk->config.xdp_flags, NULL);
801
802 if (err)
803 goto err_attach_xdp_prog;
804
805 if (!xsk->rx)
806 return err;
807
808 err = xsk_set_bpf_maps(xsk);
809 if (err)
810 goto err_set_bpf_maps;
811
812 return err;
813
814 err_set_bpf_maps:
815 if (ctx->has_bpf_link)
816 close(ctx->link_fd);
817 else
818 bpf_xdp_detach(ctx->ifindex, 0, NULL);
819 err_attach_xdp_prog:
820 close(ctx->prog_fd);
821 err_load_xdp_prog:
822 xsk_delete_bpf_maps(xsk);
823 return err;
824 }
825
static int xsk_lookup_xdp_res(struct xsk_socket *xsk, int *xsks_map_fd, int prog_id)
827 {
828 struct xsk_ctx *ctx = xsk->ctx;
829 int err;
830
831 ctx->prog_fd = bpf_prog_get_fd_by_id(prog_id);
832 if (ctx->prog_fd < 0) {
833 err = -errno;
834 goto err_prog_fd;
835 }
836 err = xsk_lookup_bpf_maps(xsk);
837 if (err)
838 goto err_lookup_maps;
839
840 if (!xsk->rx)
841 return err;
842
843 err = xsk_set_bpf_maps(xsk);
844 if (err)
845 goto err_set_maps;
846
847 return err;
848
849 err_set_maps:
850 close(ctx->xsks_map_fd);
851 err_lookup_maps:
852 close(ctx->prog_fd);
853 err_prog_fd:
854 if (ctx->has_bpf_link)
855 close(ctx->link_fd);
856 return err;
857 }
858
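/* Set up the XDP resources for this socket's interface. If no program (or
 * XDP link) is attached yet, load the default program and create the
 * xsks_map; otherwise reuse the attached program and look up its xsks_map.
 * Either way, a socket with an RX ring is inserted into the map.
 */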
static int __xsk_setup_xdp_prog(struct xsk_socket *_xdp, int *xsks_map_fd)
860 {
861 struct xsk_socket *xsk = _xdp;
862 struct xsk_ctx *ctx = xsk->ctx;
863 __u32 prog_id = 0;
864 int err;
865
866 if (ctx->has_bpf_link)
867 err = xsk_link_lookup(ctx->ifindex, &prog_id, &ctx->link_fd);
868 else
869 err = bpf_xdp_query_id(ctx->ifindex, xsk->config.xdp_flags, &prog_id);
870
871 if (err)
872 return err;
873
874 err = !prog_id ? xsk_init_xdp_res(xsk, xsks_map_fd) :
875 xsk_lookup_xdp_res(xsk, xsks_map_fd, prog_id);
876
877 if (!err && xsks_map_fd)
878 *xsks_map_fd = ctx->xsks_map_fd;
879
880 return err;
881 }
882
int xsk_setup_xdp_prog_xsk(struct xsk_socket *xsk, int *xsks_map_fd)
884 {
885 return __xsk_setup_xdp_prog(xsk, xsks_map_fd);
886 }
887
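/* A context ties a umem to one (ifindex, queue_id) pair and is refcounted so
 * that sockets sharing the same umem and queue reuse the same fill and
 * completion rings and XDP resources. Look up an existing context and take a
 * reference, or return NULL if none exists yet.
 */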
static struct xsk_ctx *xsk_get_ctx(struct xsk_umem *umem, int ifindex,
				   __u32 queue_id)
890 {
891 struct xsk_ctx *ctx;
892
893 if (list_empty(&umem->ctx_list))
894 return NULL;
895
896 list_for_each_entry(ctx, &umem->ctx_list, list) {
897 if (ctx->ifindex == ifindex && ctx->queue_id == queue_id) {
898 ctx->refcount++;
899 return ctx;
900 }
901 }
902
903 return NULL;
904 }
905
static void xsk_put_ctx(struct xsk_ctx *ctx, bool unmap)
907 {
908 struct xsk_umem *umem = ctx->umem;
909 struct xdp_mmap_offsets off;
910 int err;
911
912 if (--ctx->refcount)
913 return;
914
915 if (!unmap)
916 goto out_free;
917
918 err = xsk_get_mmap_offsets(umem->fd, &off);
919 if (err)
920 goto out_free;
921
922 munmap(ctx->fill->ring - off.fr.desc, off.fr.desc + umem->config.fill_size *
923 sizeof(__u64));
924 munmap(ctx->comp->ring - off.cr.desc, off.cr.desc + umem->config.comp_size *
925 sizeof(__u64));
926
927 out_free:
928 list_del(&ctx->list);
929 free(ctx);
930 }
931
static struct xsk_ctx *xsk_create_ctx(struct xsk_socket *xsk,
				      struct xsk_umem *umem, int ifindex,
				      const char *ifname, __u32 queue_id,
				      struct xsk_ring_prod *fill,
				      struct xsk_ring_cons *comp)
937 {
938 struct xsk_ctx *ctx;
939 int err;
940
941 ctx = calloc(1, sizeof(*ctx));
942 if (!ctx)
943 return NULL;
944
945 if (!umem->fill_save) {
946 err = xsk_create_umem_rings(umem, xsk->fd, fill, comp);
947 if (err) {
948 free(ctx);
949 return NULL;
950 }
951 } else if (umem->fill_save != fill || umem->comp_save != comp) {
952 /* Copy over rings to new structs. */
953 memcpy(fill, umem->fill_save, sizeof(*fill));
954 memcpy(comp, umem->comp_save, sizeof(*comp));
955 }
956
957 ctx->ifindex = ifindex;
958 ctx->refcount = 1;
959 ctx->umem = umem;
960 ctx->queue_id = queue_id;
961 libbpf_strlcpy(ctx->ifname, ifname, IFNAMSIZ);
962
963 ctx->fill = fill;
964 ctx->comp = comp;
965 list_add(&ctx->list, &umem->ctx_list);
966 ctx->has_bpf_link = xsk_probe_bpf_link();
967 return ctx;
968 }
969
static void xsk_destroy_xsk_struct(struct xsk_socket *xsk)
971 {
972 free(xsk->ctx);
973 free(xsk);
974 }
975
int xsk_socket__update_xskmap(struct xsk_socket *xsk, int fd)
977 {
978 xsk->ctx->xsks_map_fd = fd;
979 return xsk_set_bpf_maps(xsk);
980 }
981
int xsk_setup_xdp_prog(int ifindex, int *xsks_map_fd)
983 {
984 struct xsk_socket *xsk;
985 int res;
986
987 xsk = calloc(1, sizeof(*xsk));
988 if (!xsk)
989 return -ENOMEM;
990
991 res = xsk_create_xsk_struct(ifindex, xsk);
992 if (res) {
993 free(xsk);
994 return -EINVAL;
995 }
996
997 res = __xsk_setup_xdp_prog(xsk, xsks_map_fd);
998
999 xsk_destroy_xsk_struct(xsk);
1000
1001 return res;
1002 }
1003
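/* Create an AF_XDP socket that shares a umem with other sockets. The rings
 * created at xsk_umem__create() time are reused for the first context; for a
 * new (ifindex, queue_id) context the caller must supply dedicated fill and
 * comp rings. Illustrative sketch (not part of this file; "eth0" and the
 * variable names are made up), binding a second socket on queue 1 of the
 * same device:
 *
 *	struct xsk_ring_prod fill1, tx1;
 *	struct xsk_ring_cons comp1, rx1;
 *	struct xsk_socket *xsk1;
 *	int err;
 *
 *	err = xsk_socket__create_shared(&xsk1, "eth0", 1, umem, &rx1, &tx1,
 *					&fill1, &comp1, NULL);
 */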
int xsk_socket__create_shared(struct xsk_socket **xsk_ptr,
			      const char *ifname,
			      __u32 queue_id, struct xsk_umem *umem,
			      struct xsk_ring_cons *rx,
			      struct xsk_ring_prod *tx,
			      struct xsk_ring_prod *fill,
			      struct xsk_ring_cons *comp,
			      const struct xsk_socket_config *usr_config)
1012 {
1013 bool unmap, rx_setup_done = false, tx_setup_done = false;
1014 void *rx_map = NULL, *tx_map = NULL;
1015 struct sockaddr_xdp sxdp = {};
1016 struct xdp_mmap_offsets off;
1017 struct xsk_socket *xsk;
1018 struct xsk_ctx *ctx;
1019 int err, ifindex;
1020
1021 if (!umem || !xsk_ptr || !(rx || tx))
1022 return -EFAULT;
1023
1024 unmap = umem->fill_save != fill;
1025
1026 xsk = calloc(1, sizeof(*xsk));
1027 if (!xsk)
1028 return -ENOMEM;
1029
1030 err = xsk_set_xdp_socket_config(&xsk->config, usr_config);
1031 if (err)
1032 goto out_xsk_alloc;
1033
1034 xsk->outstanding_tx = 0;
1035 ifindex = if_nametoindex(ifname);
1036 if (!ifindex) {
1037 err = -errno;
1038 goto out_xsk_alloc;
1039 }
1040
1041 if (umem->refcount++ > 0) {
1042 xsk->fd = socket(AF_XDP, SOCK_RAW | SOCK_CLOEXEC, 0);
1043 if (xsk->fd < 0) {
1044 err = -errno;
1045 goto out_xsk_alloc;
1046 }
1047 } else {
1048 xsk->fd = umem->fd;
1049 rx_setup_done = umem->rx_ring_setup_done;
1050 tx_setup_done = umem->tx_ring_setup_done;
1051 }
1052
1053 ctx = xsk_get_ctx(umem, ifindex, queue_id);
1054 if (!ctx) {
1055 if (!fill || !comp) {
1056 err = -EFAULT;
1057 goto out_socket;
1058 }
1059
1060 ctx = xsk_create_ctx(xsk, umem, ifindex, ifname, queue_id,
1061 fill, comp);
1062 if (!ctx) {
1063 err = -ENOMEM;
1064 goto out_socket;
1065 }
1066 }
1067 xsk->ctx = ctx;
1068
1069 if (rx && !rx_setup_done) {
1070 err = setsockopt(xsk->fd, SOL_XDP, XDP_RX_RING,
1071 &xsk->config.rx_size,
1072 sizeof(xsk->config.rx_size));
1073 if (err) {
1074 err = -errno;
1075 goto out_put_ctx;
1076 }
1077 if (xsk->fd == umem->fd)
1078 umem->rx_ring_setup_done = true;
1079 }
1080 if (tx && !tx_setup_done) {
1081 err = setsockopt(xsk->fd, SOL_XDP, XDP_TX_RING,
1082 &xsk->config.tx_size,
1083 sizeof(xsk->config.tx_size));
1084 if (err) {
1085 err = -errno;
1086 goto out_put_ctx;
1087 }
1088 if (xsk->fd == umem->fd)
1089 umem->tx_ring_setup_done = true;
1090 }
1091
1092 err = xsk_get_mmap_offsets(xsk->fd, &off);
1093 if (err) {
1094 err = -errno;
1095 goto out_put_ctx;
1096 }
1097
1098 if (rx) {
1099 rx_map = mmap(NULL, off.rx.desc +
1100 xsk->config.rx_size * sizeof(struct xdp_desc),
1101 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
1102 xsk->fd, XDP_PGOFF_RX_RING);
1103 if (rx_map == MAP_FAILED) {
1104 err = -errno;
1105 goto out_put_ctx;
1106 }
1107
1108 rx->mask = xsk->config.rx_size - 1;
1109 rx->size = xsk->config.rx_size;
1110 rx->producer = rx_map + off.rx.producer;
1111 rx->consumer = rx_map + off.rx.consumer;
1112 rx->flags = rx_map + off.rx.flags;
1113 rx->ring = rx_map + off.rx.desc;
1114 rx->cached_prod = *rx->producer;
1115 rx->cached_cons = *rx->consumer;
1116 }
1117 xsk->rx = rx;
1118
1119 if (tx) {
1120 tx_map = mmap(NULL, off.tx.desc +
1121 xsk->config.tx_size * sizeof(struct xdp_desc),
1122 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
1123 xsk->fd, XDP_PGOFF_TX_RING);
1124 if (tx_map == MAP_FAILED) {
1125 err = -errno;
1126 goto out_mmap_rx;
1127 }
1128
1129 tx->mask = xsk->config.tx_size - 1;
1130 tx->size = xsk->config.tx_size;
1131 tx->producer = tx_map + off.tx.producer;
1132 tx->consumer = tx_map + off.tx.consumer;
1133 tx->flags = tx_map + off.tx.flags;
1134 tx->ring = tx_map + off.tx.desc;
1135 tx->cached_prod = *tx->producer;
/* cached_cons is r->size bigger than the real consumer pointer.
 * See xsk_prod_nb_free().
 */
1139 tx->cached_cons = *tx->consumer + xsk->config.tx_size;
1140 }
1141 xsk->tx = tx;
1142
1143 sxdp.sxdp_family = PF_XDP;
1144 sxdp.sxdp_ifindex = ctx->ifindex;
1145 sxdp.sxdp_queue_id = ctx->queue_id;
1146 if (umem->refcount > 1) {
1147 sxdp.sxdp_flags |= XDP_SHARED_UMEM;
1148 sxdp.sxdp_shared_umem_fd = umem->fd;
1149 } else {
1150 sxdp.sxdp_flags = xsk->config.bind_flags;
1151 }
1152
1153 err = bind(xsk->fd, (struct sockaddr *)&sxdp, sizeof(sxdp));
1154 if (err) {
1155 err = -errno;
1156 goto out_mmap_tx;
1157 }
1158
1159 if (!(xsk->config.libbpf_flags & XSK_LIBBPF_FLAGS__INHIBIT_PROG_LOAD)) {
1160 err = __xsk_setup_xdp_prog(xsk, NULL);
1161 if (err)
1162 goto out_mmap_tx;
1163 }
1164
1165 *xsk_ptr = xsk;
1166 umem->fill_save = NULL;
1167 umem->comp_save = NULL;
1168 return 0;
1169
1170 out_mmap_tx:
1171 if (tx)
1172 munmap(tx_map, off.tx.desc +
1173 xsk->config.tx_size * sizeof(struct xdp_desc));
1174 out_mmap_rx:
1175 if (rx)
1176 munmap(rx_map, off.rx.desc +
1177 xsk->config.rx_size * sizeof(struct xdp_desc));
1178 out_put_ctx:
1179 xsk_put_ctx(ctx, unmap);
1180 out_socket:
1181 if (--umem->refcount)
1182 close(xsk->fd);
1183 out_xsk_alloc:
1184 free(xsk);
1185 return err;
1186 }
1187
int xsk_socket__create(struct xsk_socket **xsk_ptr, const char *ifname,
		       __u32 queue_id, struct xsk_umem *umem,
		       struct xsk_ring_cons *rx, struct xsk_ring_prod *tx,
		       const struct xsk_socket_config *usr_config)
1192 {
1193 if (!umem)
1194 return -EFAULT;
1195
1196 return xsk_socket__create_shared(xsk_ptr, ifname, queue_id, umem,
1197 rx, tx, umem->fill_save,
1198 umem->comp_save, usr_config);
1199 }
1200
int xsk_umem__delete(struct xsk_umem *umem)
1202 {
1203 struct xdp_mmap_offsets off;
1204 int err;
1205
1206 if (!umem)
1207 return 0;
1208
1209 if (umem->refcount)
1210 return -EBUSY;
1211
1212 err = xsk_get_mmap_offsets(umem->fd, &off);
1213 if (!err && umem->fill_save && umem->comp_save) {
1214 munmap(umem->fill_save->ring - off.fr.desc,
1215 off.fr.desc + umem->config.fill_size * sizeof(__u64));
1216 munmap(umem->comp_save->ring - off.cr.desc,
1217 off.cr.desc + umem->config.comp_size * sizeof(__u64));
1218 }
1219
1220 close(umem->fd);
1221 free(umem);
1222
1223 return 0;
1224 }
1225
void xsk_socket__delete(struct xsk_socket *xsk)
1227 {
1228 size_t desc_sz = sizeof(struct xdp_desc);
1229 struct xdp_mmap_offsets off;
1230 struct xsk_umem *umem;
1231 struct xsk_ctx *ctx;
1232 int err;
1233
1234 if (!xsk)
1235 return;
1236
1237 ctx = xsk->ctx;
1238 umem = ctx->umem;
1239
1240 if (ctx->refcount == 1) {
1241 xsk_delete_bpf_maps(xsk);
1242 close(ctx->prog_fd);
1243 if (ctx->has_bpf_link)
1244 close(ctx->link_fd);
1245 }
1246
1247 xsk_put_ctx(ctx, true);
1248
1249 err = xsk_get_mmap_offsets(xsk->fd, &off);
1250 if (!err) {
1251 if (xsk->rx) {
1252 munmap(xsk->rx->ring - off.rx.desc,
1253 off.rx.desc + xsk->config.rx_size * desc_sz);
1254 }
1255 if (xsk->tx) {
1256 munmap(xsk->tx->ring - off.tx.desc,
1257 off.tx.desc + xsk->config.tx_size * desc_sz);
1258 }
1259 }
1260
1261 umem->refcount--;
1262 /* Do not close an fd that also has an associated umem connected
1263 * to it.
1264 */
1265 if (xsk->fd != umem->fd)
1266 close(xsk->fd);
1267 free(xsk);
1268 }
1269