1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 // Copyright (c) 2019, 2020 Cloudflare
3 
4 #include <stdbool.h>
5 #include <stddef.h>
6 #include <stdint.h>
7 #include <string.h>
8 
9 #include <linux/bpf.h>
10 #include <linux/icmp.h>
11 #include <linux/icmpv6.h>
12 #include <linux/if_ether.h>
13 #include <linux/in.h>
14 #include <linux/ip.h>
15 #include <linux/ipv6.h>
16 #include <linux/pkt_cls.h>
17 #include <linux/tcp.h>
18 #include <linux/udp.h>
19 
20 #include <bpf/bpf_helpers.h>
21 #include <bpf/bpf_endian.h>
22 
23 #include "test_cls_redirect.h"
24 
/* Linkage hint for the helper functions in this file: building with
 * SUBPROGS defined marks them __noinline, otherwise they are
 * __always_inline.
 */
#if defined(SUBPROGS)
#define INLINING __noinline
#else
#define INLINING __always_inline
#endif
30 
/* Offset of the first byte following MEMBER inside TYPE. */
#define offsetofend(TYPE, MEMBER) \
	(offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))
33 
/* Host-order masks for the IPv4 frag_off field (converted with
 * bpf_htons() before use): the 13 low bits holding the fragment offset,
 * and the flag bit directly above them.
 */
#define IP_OFFSET_MASK (0x1FFF)
#define IP_MF (0x2000)
36 
37 char _license[] SEC("license") = "Dual BSD/GPL";
38 
39 /**
40  * Destination port and IP used for UDP encapsulation.
41  */
42 volatile const __be16 ENCAPSULATION_PORT;
43 volatile const __be32 ENCAPSULATION_IP;
44 
/* Counters stored as the value of metrics_map.
 *
 * The code in this file only ever increments these fields. The field
 * order defines the layout of the map value, so it must not be changed
 * casually.
 */
typedef struct {
	/* Traffic seen, by layer. */
	uint64_t processed_packets_total;
	uint64_t l3_protocol_packets_total_ipv4;
	uint64_t l3_protocol_packets_total_ipv6;
	uint64_t l4_protocol_packets_total_tcp;
	uint64_t l4_protocol_packets_total_udp;

	/* Packets accepted locally, by reason. */
	uint64_t accepted_packets_total_syn;
	uint64_t accepted_packets_total_syn_cookies;
	uint64_t accepted_packets_total_last_hop;
	uint64_t accepted_packets_total_icmp_echo_request;
	uint64_t accepted_packets_total_established;

	/* Packets forwarded to the next hop, by encapsulation. */
	uint64_t forwarded_packets_total_gue;
	uint64_t forwarded_packets_total_gre;

	/* Error counters. */
	uint64_t errors_total_unknown_l3_proto;
	uint64_t errors_total_unknown_l4_proto;
	uint64_t errors_total_malformed_ip;
	uint64_t errors_total_fragmented_ip;
	uint64_t errors_total_malformed_icmp;
	uint64_t errors_total_unwanted_icmp;
	uint64_t errors_total_malformed_icmp_pkt_too_big;
	uint64_t errors_total_malformed_tcp;
	uint64_t errors_total_malformed_udp;
	uint64_t errors_total_icmp_echo_replies;
	uint64_t errors_total_malformed_encapsulation;
	uint64_t errors_total_encap_adjust_failed;
	uint64_t errors_total_encap_buffer_too_small;
	uint64_t errors_total_redirect_loop;
	uint64_t errors_total_encap_mtu_violate;
} metrics_t;
75 
/* Outcome of classifying the encapsulated packet; consumed by the
 * verdict switch at the end of cls_redirect().
 */
typedef enum {
	INVALID = 0,      /* dropped by cls_redirect() */
	UNKNOWN = 1,      /* forwarded to the next hop */
	ECHO_REQUEST = 2, /* accepted locally */
	SYN = 3,          /* forwarded if unigue.forward_syn, else accepted */
	SYN_COOKIE = 4,   /* accepted locally */
	ESTABLISHED = 5,  /* accepted locally */
} verdict_t;
84 
/* Source and destination port pair, stored back to back. */
typedef struct {
	uint16_t src;
	uint16_t dst;
} flow_ports_t;
88 
89 _Static_assert(
90 	sizeof(flow_ports_t) !=
91 		offsetofend(struct bpf_sock_tuple, ipv4.dport) -
92 			offsetof(struct bpf_sock_tuple, ipv4.sport) - 1,
93 	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
94 _Static_assert(
95 	sizeof(flow_ports_t) !=
96 		offsetofend(struct bpf_sock_tuple, ipv6.dport) -
97 			offsetof(struct bpf_sock_tuple, ipv6.sport) - 1,
98 	"flow_ports_t must match sport and dport in struct bpf_sock_tuple");
99 
/* Return type of helpers that may decide what happens to the packet. */
typedef int ret_t;

/* Sentinel returned by such helpers when they have not reached a
 * decision: the caller should carry on with its regular flow instead of
 * returning a TC_ACT_* code.
 */
static const ret_t CONTINUE_PROCESSING = -1;

/* Evaluate x (an expression of type ret_t) exactly once and return its
 * value from the enclosing function unless it is CONTINUE_PROCESSING.
 */
#define MAYBE_RETURN(x)                                      \
	do {                                                 \
		const ret_t maybe_return_rc_ = (x);          \
		if (maybe_return_rc_ != CONTINUE_PROCESSING) \
			return maybe_return_rc_;             \
	} while (0)
116 
/* Pointer into packet data, annotated with an 8 byte alignment.
 *
 * Linux aligns packet pointers to NET_IP_ALIGN (2 bytes), or not at all
 * on architectures with efficient unaligned access. The verifier already
 * enforces those rules for eBPF packet accesses, so LLVM can be told to
 * generate code as if the alignment were larger. LLVM will complain on a
 * platform where that is not valid.
 */
typedef uint8_t *net_ptr __attribute__((align_value(8)));
125 
126 typedef struct buf {
127 	struct __sk_buff *skb;
128 	net_ptr head;
129 	/* NB: tail musn't have alignment other than 1, otherwise
130 	* LLVM will go and eliminate code, e.g. when checking packet lengths.
131 	*/
132 	uint8_t *const tail;
133 } buf_t;
134 
buf_off(const buf_t * buf)135 static __always_inline size_t buf_off(const buf_t *buf)
136 {
137 	/* Clang seems to optimize constructs like
138 	 *    a - b + c
139 	 * if c is known:
140 	 *    r? = c
141 	 *    r? -= b
142 	 *    r? += a
143 	 *
144 	 * This is a problem if a and b are packet pointers,
145 	 * since the verifier allows subtracting two pointers to
146 	 * get a scalar, but not a scalar and a pointer.
147 	 *
148 	 * Use inline asm to break this optimization.
149 	 */
150 	size_t off = (size_t)buf->head;
151 	asm("%0 -= %1" : "+r"(off) : "r"(buf->skb->data));
152 	return off;
153 }
154 
buf_copy(buf_t * buf,void * dst,size_t len)155 static __always_inline bool buf_copy(buf_t *buf, void *dst, size_t len)
156 {
157 	if (bpf_skb_load_bytes(buf->skb, buf_off(buf), dst, len)) {
158 		return false;
159 	}
160 
161 	buf->head += len;
162 	return true;
163 }
164 
buf_skip(buf_t * buf,const size_t len)165 static __always_inline bool buf_skip(buf_t *buf, const size_t len)
166 {
167 	/* Check whether off + len is valid in the non-linear part. */
168 	if (buf_off(buf) + len > buf->skb->len) {
169 		return false;
170 	}
171 
172 	buf->head += len;
173 	return true;
174 }
175 
176 /* Returns a pointer to the start of buf, or NULL if len is
177  * larger than the remaining data. Consumes len bytes on a successful
178  * call.
179  *
180  * If scratch is not NULL, the function will attempt to load non-linear
181  * data via bpf_skb_load_bytes. On success, scratch is returned.
182  */
buf_assign(buf_t * buf,const size_t len,void * scratch)183 static __always_inline void *buf_assign(buf_t *buf, const size_t len, void *scratch)
184 {
185 	if (buf->head + len > buf->tail) {
186 		if (scratch == NULL) {
187 			return NULL;
188 		}
189 
190 		return buf_copy(buf, scratch, len) ? scratch : NULL;
191 	}
192 
193 	void *ptr = buf->head;
194 	buf->head += len;
195 	return ptr;
196 }
197 
pkt_skip_ipv4_options(buf_t * buf,const struct iphdr * ipv4)198 static INLINING bool pkt_skip_ipv4_options(buf_t *buf, const struct iphdr *ipv4)
199 {
200 	if (ipv4->ihl <= 5) {
201 		return true;
202 	}
203 
204 	return buf_skip(buf, (ipv4->ihl - 5) * 4);
205 }
206 
ipv4_is_fragment(const struct iphdr * ip)207 static INLINING bool ipv4_is_fragment(const struct iphdr *ip)
208 {
209 	uint16_t frag_off = ip->frag_off & bpf_htons(IP_OFFSET_MASK);
210 	return (ip->frag_off & bpf_htons(IP_MF)) != 0 || frag_off > 0;
211 }
212 
pkt_parse_ipv4(buf_t * pkt,struct iphdr * scratch)213 static __always_inline struct iphdr *pkt_parse_ipv4(buf_t *pkt, struct iphdr *scratch)
214 {
215 	struct iphdr *ipv4 = buf_assign(pkt, sizeof(*ipv4), scratch);
216 	if (ipv4 == NULL) {
217 		return NULL;
218 	}
219 
220 	if (ipv4->ihl < 5) {
221 		return NULL;
222 	}
223 
224 	if (!pkt_skip_ipv4_options(pkt, ipv4)) {
225 		return NULL;
226 	}
227 
228 	return ipv4;
229 }
230 
231 /* Parse the L4 ports from a packet, assuming a layout like TCP or UDP. */
pkt_parse_icmp_l4_ports(buf_t * pkt,flow_ports_t * ports)232 static INLINING bool pkt_parse_icmp_l4_ports(buf_t *pkt, flow_ports_t *ports)
233 {
234 	if (!buf_copy(pkt, ports, sizeof(*ports))) {
235 		return false;
236 	}
237 
238 	/* Ports in the L4 headers are reversed, since we are parsing an ICMP
239 	 * payload which is going towards the eyeball.
240 	 */
241 	uint16_t dst = ports->src;
242 	ports->src = ports->dst;
243 	ports->dst = dst;
244 	return true;
245 }
246 
pkt_checksum_fold(uint32_t csum)247 static INLINING uint16_t pkt_checksum_fold(uint32_t csum)
248 {
249 	/* The highest reasonable value for an IPv4 header
250 	 * checksum requires two folds, so we just do that always.
251 	 */
252 	csum = (csum & 0xffff) + (csum >> 16);
253 	csum = (csum & 0xffff) + (csum >> 16);
254 	return (uint16_t)~csum;
255 }
256 
pkt_ipv4_checksum(struct iphdr * iph)257 static INLINING void pkt_ipv4_checksum(struct iphdr *iph)
258 {
259 	iph->check = 0;
260 
261 	/* An IP header without options is 20 bytes. Two of those
262 	 * are the checksum, which we always set to zero. Hence,
263 	 * the maximum accumulated value is 18 / 2 * 0xffff = 0x8fff7,
264 	 * which fits in 32 bit.
265 	 */
266 	_Static_assert(sizeof(struct iphdr) == 20, "iphdr must be 20 bytes");
267 	uint32_t acc = 0;
268 	uint16_t *ipw = (uint16_t *)iph;
269 
270 #pragma clang loop unroll(full)
271 	for (size_t i = 0; i < sizeof(struct iphdr) / 2; i++) {
272 		acc += ipw[i];
273 	}
274 
275 	iph->check = pkt_checksum_fold(acc);
276 }
277 
278 static INLINING
pkt_skip_ipv6_extension_headers(buf_t * pkt,const struct ipv6hdr * ipv6,uint8_t * upper_proto,bool * is_fragment)279 bool pkt_skip_ipv6_extension_headers(buf_t *pkt,
280 				     const struct ipv6hdr *ipv6,
281 				     uint8_t *upper_proto,
282 				     bool *is_fragment)
283 {
284 	/* We understand five extension headers.
285 	 * https://tools.ietf.org/html/rfc8200#section-4.1 states that all
286 	 * headers should occur once, except Destination Options, which may
287 	 * occur twice. Hence we give up after 6 headers.
288 	 */
289 	struct {
290 		uint8_t next;
291 		uint8_t len;
292 	} exthdr = {
293 		.next = ipv6->nexthdr,
294 	};
295 	*is_fragment = false;
296 
297 #pragma clang loop unroll(full)
298 	for (int i = 0; i < 6; i++) {
299 		switch (exthdr.next) {
300 		case IPPROTO_FRAGMENT:
301 			*is_fragment = true;
302 			/* NB: We don't check that hdrlen == 0 as per spec. */
303 			/* fallthrough; */
304 
305 		case IPPROTO_HOPOPTS:
306 		case IPPROTO_ROUTING:
307 		case IPPROTO_DSTOPTS:
308 		case IPPROTO_MH:
309 			if (!buf_copy(pkt, &exthdr, sizeof(exthdr))) {
310 				return false;
311 			}
312 
313 			/* hdrlen is in 8-octet units, and excludes the first 8 octets. */
314 			if (!buf_skip(pkt,
315 				      (exthdr.len + 1) * 8 - sizeof(exthdr))) {
316 				return false;
317 			}
318 
319 			/* Decode next header */
320 			break;
321 
322 		default:
323 			/* The next header is not one of the known extension
324 			 * headers, treat it as the upper layer header.
325 			 *
326 			 * This handles IPPROTO_NONE.
327 			 *
328 			 * Encapsulating Security Payload (50) and Authentication
329 			 * Header (51) also end up here (and will trigger an
330 			 * unknown proto error later). They have a custom header
331 			 * format and seem too esoteric to care about.
332 			 */
333 			*upper_proto = exthdr.next;
334 			return true;
335 		}
336 	}
337 
338 	/* We never found an upper layer header. */
339 	return false;
340 }
341 
342 /* This function has to be inlined, because the verifier otherwise rejects it
343  * due to returning a pointer to the stack. This is technically correct, since
344  * scratch is allocated on the stack. However, this usage should be safe since
345  * it's the callers stack after all.
346  */
347 static __always_inline struct ipv6hdr *
pkt_parse_ipv6(buf_t * pkt,struct ipv6hdr * scratch,uint8_t * proto,bool * is_fragment)348 pkt_parse_ipv6(buf_t *pkt, struct ipv6hdr *scratch, uint8_t *proto,
349 	       bool *is_fragment)
350 {
351 	struct ipv6hdr *ipv6 = buf_assign(pkt, sizeof(*ipv6), scratch);
352 	if (ipv6 == NULL) {
353 		return NULL;
354 	}
355 
356 	if (!pkt_skip_ipv6_extension_headers(pkt, ipv6, proto, is_fragment)) {
357 		return NULL;
358 	}
359 
360 	return ipv6;
361 }
362 
363 /* Global metrics, per CPU
364  */
365 struct {
366 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
367 	__uint(max_entries, 1);
368 	__type(key, unsigned int);
369 	__type(value, metrics_t);
370 } metrics_map SEC(".maps");
371 
get_global_metrics(void)372 static INLINING metrics_t *get_global_metrics(void)
373 {
374 	uint64_t key = 0;
375 	return bpf_map_lookup_elem(&metrics_map, &key);
376 }
377 
accept_locally(struct __sk_buff * skb,encap_headers_t * encap)378 static INLINING ret_t accept_locally(struct __sk_buff *skb, encap_headers_t *encap)
379 {
380 	const int payload_off =
381 		sizeof(*encap) +
382 		sizeof(struct in_addr) * encap->unigue.hop_count;
383 	int32_t encap_overhead = payload_off - sizeof(struct ethhdr);
384 
385 	// Changing the ethertype if the encapsulated packet is ipv6
386 	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
387 		encap->eth.h_proto = bpf_htons(ETH_P_IPV6);
388 	}
389 
390 	if (bpf_skb_adjust_room(skb, -encap_overhead, BPF_ADJ_ROOM_MAC,
391 				BPF_F_ADJ_ROOM_FIXED_GSO |
392 				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
393 	    bpf_csum_level(skb, BPF_CSUM_LEVEL_DEC))
394 		return TC_ACT_SHOT;
395 
396 	return bpf_redirect(skb->ifindex, BPF_F_INGRESS);
397 }
398 
forward_with_gre(struct __sk_buff * skb,encap_headers_t * encap,struct in_addr * next_hop,metrics_t * metrics)399 static INLINING ret_t forward_with_gre(struct __sk_buff *skb, encap_headers_t *encap,
400 				       struct in_addr *next_hop, metrics_t *metrics)
401 {
402 	metrics->forwarded_packets_total_gre++;
403 
404 	const int payload_off =
405 		sizeof(*encap) +
406 		sizeof(struct in_addr) * encap->unigue.hop_count;
407 	int32_t encap_overhead =
408 		payload_off - sizeof(struct ethhdr) - sizeof(struct iphdr);
409 	int32_t delta = sizeof(struct gre_base_hdr) - encap_overhead;
410 	uint16_t proto = ETH_P_IP;
411 	uint32_t mtu_len = 0;
412 
413 	/* Loop protection: the inner packet's TTL is decremented as a safeguard
414 	 * against any forwarding loop. As the only interesting field is the TTL
415 	 * hop limit for IPv6, it is easier to use bpf_skb_load_bytes/bpf_skb_store_bytes
416 	 * as they handle the split packets if needed (no need for the data to be
417 	 * in the linear section).
418 	 */
419 	if (encap->gue.proto_ctype == IPPROTO_IPV6) {
420 		proto = ETH_P_IPV6;
421 		uint8_t ttl;
422 		int rc;
423 
424 		rc = bpf_skb_load_bytes(
425 			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
426 			&ttl, 1);
427 		if (rc != 0) {
428 			metrics->errors_total_malformed_encapsulation++;
429 			return TC_ACT_SHOT;
430 		}
431 
432 		if (ttl == 0) {
433 			metrics->errors_total_redirect_loop++;
434 			return TC_ACT_SHOT;
435 		}
436 
437 		ttl--;
438 		rc = bpf_skb_store_bytes(
439 			skb, payload_off + offsetof(struct ipv6hdr, hop_limit),
440 			&ttl, 1, 0);
441 		if (rc != 0) {
442 			metrics->errors_total_malformed_encapsulation++;
443 			return TC_ACT_SHOT;
444 		}
445 	} else {
446 		uint8_t ttl;
447 		int rc;
448 
449 		rc = bpf_skb_load_bytes(
450 			skb, payload_off + offsetof(struct iphdr, ttl), &ttl,
451 			1);
452 		if (rc != 0) {
453 			metrics->errors_total_malformed_encapsulation++;
454 			return TC_ACT_SHOT;
455 		}
456 
457 		if (ttl == 0) {
458 			metrics->errors_total_redirect_loop++;
459 			return TC_ACT_SHOT;
460 		}
461 
462 		/* IPv4 also has a checksum to patch. While the TTL is only one byte,
463 		 * this function only works for 2 and 4 bytes arguments (the result is
464 		 * the same).
465 		 */
466 		rc = bpf_l3_csum_replace(
467 			skb, payload_off + offsetof(struct iphdr, check), ttl,
468 			ttl - 1, 2);
469 		if (rc != 0) {
470 			metrics->errors_total_malformed_encapsulation++;
471 			return TC_ACT_SHOT;
472 		}
473 
474 		ttl--;
475 		rc = bpf_skb_store_bytes(
476 			skb, payload_off + offsetof(struct iphdr, ttl), &ttl, 1,
477 			0);
478 		if (rc != 0) {
479 			metrics->errors_total_malformed_encapsulation++;
480 			return TC_ACT_SHOT;
481 		}
482 	}
483 
484 	if (bpf_check_mtu(skb, skb->ifindex, &mtu_len, delta, 0)) {
485 		metrics->errors_total_encap_mtu_violate++;
486 		return TC_ACT_SHOT;
487 	}
488 
489 	if (bpf_skb_adjust_room(skb, delta, BPF_ADJ_ROOM_NET,
490 				BPF_F_ADJ_ROOM_FIXED_GSO |
491 				BPF_F_ADJ_ROOM_NO_CSUM_RESET) ||
492 	    bpf_csum_level(skb, BPF_CSUM_LEVEL_INC)) {
493 		metrics->errors_total_encap_adjust_failed++;
494 		return TC_ACT_SHOT;
495 	}
496 
497 	if (bpf_skb_pull_data(skb, sizeof(encap_gre_t))) {
498 		metrics->errors_total_encap_buffer_too_small++;
499 		return TC_ACT_SHOT;
500 	}
501 
502 	buf_t pkt = {
503 		.skb = skb,
504 		.head = (uint8_t *)(long)skb->data,
505 		.tail = (uint8_t *)(long)skb->data_end,
506 	};
507 
508 	encap_gre_t *encap_gre = buf_assign(&pkt, sizeof(encap_gre_t), NULL);
509 	if (encap_gre == NULL) {
510 		metrics->errors_total_encap_buffer_too_small++;
511 		return TC_ACT_SHOT;
512 	}
513 
514 	encap_gre->ip.protocol = IPPROTO_GRE;
515 	encap_gre->ip.daddr = next_hop->s_addr;
516 	encap_gre->ip.saddr = ENCAPSULATION_IP;
517 	encap_gre->ip.tot_len =
518 		bpf_htons(bpf_ntohs(encap_gre->ip.tot_len) + delta);
519 	encap_gre->gre.flags = 0;
520 	encap_gre->gre.protocol = bpf_htons(proto);
521 	pkt_ipv4_checksum((void *)&encap_gre->ip);
522 
523 	return bpf_redirect(skb->ifindex, 0);
524 }
525 
forward_to_next_hop(struct __sk_buff * skb,encap_headers_t * encap,struct in_addr * next_hop,metrics_t * metrics)526 static INLINING ret_t forward_to_next_hop(struct __sk_buff *skb, encap_headers_t *encap,
527 					  struct in_addr *next_hop, metrics_t *metrics)
528 {
529 	/* swap L2 addresses */
530 	/* This assumes that packets are received from a router.
531 	 * So just swapping the MAC addresses here will make the packet go back to
532 	 * the router, which will send it to the appropriate machine.
533 	 */
534 	unsigned char temp[ETH_ALEN];
535 	memcpy(temp, encap->eth.h_dest, sizeof(temp));
536 	memcpy(encap->eth.h_dest, encap->eth.h_source,
537 	       sizeof(encap->eth.h_dest));
538 	memcpy(encap->eth.h_source, temp, sizeof(encap->eth.h_source));
539 
540 	if (encap->unigue.next_hop == encap->unigue.hop_count - 1 &&
541 	    encap->unigue.last_hop_gre) {
542 		return forward_with_gre(skb, encap, next_hop, metrics);
543 	}
544 
545 	metrics->forwarded_packets_total_gue++;
546 	uint32_t old_saddr = encap->ip.saddr;
547 	encap->ip.saddr = encap->ip.daddr;
548 	encap->ip.daddr = next_hop->s_addr;
549 	if (encap->unigue.next_hop < encap->unigue.hop_count) {
550 		encap->unigue.next_hop++;
551 	}
552 
553 	/* Remove ip->saddr, add next_hop->s_addr */
554 	const uint64_t off = offsetof(typeof(*encap), ip.check);
555 	int ret = bpf_l3_csum_replace(skb, off, old_saddr, next_hop->s_addr, 4);
556 	if (ret < 0) {
557 		return TC_ACT_SHOT;
558 	}
559 
560 	return bpf_redirect(skb->ifindex, 0);
561 }
562 
skip_next_hops(buf_t * pkt,int n)563 static INLINING ret_t skip_next_hops(buf_t *pkt, int n)
564 {
565 	switch (n) {
566 	case 1:
567 		if (!buf_skip(pkt, sizeof(struct in_addr)))
568 			return TC_ACT_SHOT;
569 	case 0:
570 		return CONTINUE_PROCESSING;
571 
572 	default:
573 		return TC_ACT_SHOT;
574 	}
575 }
576 
577 /* Get the next hop from the GLB header.
578  *
579  * Sets next_hop->s_addr to 0 if there are no more hops left.
580  * pkt is positioned just after the variable length GLB header
581  * iff the call is successful.
582  */
get_next_hop(buf_t * pkt,encap_headers_t * encap,struct in_addr * next_hop)583 static INLINING ret_t get_next_hop(buf_t *pkt, encap_headers_t *encap,
584 				   struct in_addr *next_hop)
585 {
586 	if (encap->unigue.next_hop > encap->unigue.hop_count) {
587 		return TC_ACT_SHOT;
588 	}
589 
590 	/* Skip "used" next hops. */
591 	MAYBE_RETURN(skip_next_hops(pkt, encap->unigue.next_hop));
592 
593 	if (encap->unigue.next_hop == encap->unigue.hop_count) {
594 		/* No more next hops, we are at the end of the GLB header. */
595 		next_hop->s_addr = 0;
596 		return CONTINUE_PROCESSING;
597 	}
598 
599 	if (!buf_copy(pkt, next_hop, sizeof(*next_hop))) {
600 		return TC_ACT_SHOT;
601 	}
602 
603 	/* Skip the remainig next hops (may be zero). */
604 	return skip_next_hops(pkt, encap->unigue.hop_count -
605 					   encap->unigue.next_hop - 1);
606 }
607 
608 /* Fill a bpf_sock_tuple to be used with the socket lookup functions.
609  * This is a kludge that let's us work around verifier limitations:
610  *
611  *    fill_tuple(&t, foo, sizeof(struct iphdr), 123, 321)
612  *
613  * clang will substitue a costant for sizeof, which allows the verifier
614  * to track it's value. Based on this, it can figure out the constant
615  * return value, and calling code works while still being "generic" to
616  * IPv4 and IPv6.
617  */
fill_tuple(struct bpf_sock_tuple * tuple,void * iph,uint64_t iphlen,uint16_t sport,uint16_t dport)618 static INLINING uint64_t fill_tuple(struct bpf_sock_tuple *tuple, void *iph,
619 				    uint64_t iphlen, uint16_t sport, uint16_t dport)
620 {
621 	switch (iphlen) {
622 	case sizeof(struct iphdr): {
623 		struct iphdr *ipv4 = (struct iphdr *)iph;
624 		tuple->ipv4.daddr = ipv4->daddr;
625 		tuple->ipv4.saddr = ipv4->saddr;
626 		tuple->ipv4.sport = sport;
627 		tuple->ipv4.dport = dport;
628 		return sizeof(tuple->ipv4);
629 	}
630 
631 	case sizeof(struct ipv6hdr): {
632 		struct ipv6hdr *ipv6 = (struct ipv6hdr *)iph;
633 		memcpy(&tuple->ipv6.daddr, &ipv6->daddr,
634 		       sizeof(tuple->ipv6.daddr));
635 		memcpy(&tuple->ipv6.saddr, &ipv6->saddr,
636 		       sizeof(tuple->ipv6.saddr));
637 		tuple->ipv6.sport = sport;
638 		tuple->ipv6.dport = dport;
639 		return sizeof(tuple->ipv6);
640 	}
641 
642 	default:
643 		return 0;
644 	}
645 }
646 
classify_tcp(struct __sk_buff * skb,struct bpf_sock_tuple * tuple,uint64_t tuplen,void * iph,struct tcphdr * tcp)647 static INLINING verdict_t classify_tcp(struct __sk_buff *skb,
648 				       struct bpf_sock_tuple *tuple, uint64_t tuplen,
649 				       void *iph, struct tcphdr *tcp)
650 {
651 	struct bpf_sock *sk =
652 		bpf_skc_lookup_tcp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
653 	if (sk == NULL) {
654 		return UNKNOWN;
655 	}
656 
657 	if (sk->state != BPF_TCP_LISTEN) {
658 		bpf_sk_release(sk);
659 		return ESTABLISHED;
660 	}
661 
662 	if (iph != NULL && tcp != NULL) {
663 		/* Kludge: we've run out of arguments, but need the length of the ip header. */
664 		uint64_t iphlen = sizeof(struct iphdr);
665 		if (tuplen == sizeof(tuple->ipv6)) {
666 			iphlen = sizeof(struct ipv6hdr);
667 		}
668 
669 		if (bpf_tcp_check_syncookie(sk, iph, iphlen, tcp,
670 					    sizeof(*tcp)) == 0) {
671 			bpf_sk_release(sk);
672 			return SYN_COOKIE;
673 		}
674 	}
675 
676 	bpf_sk_release(sk);
677 	return UNKNOWN;
678 }
679 
classify_udp(struct __sk_buff * skb,struct bpf_sock_tuple * tuple,uint64_t tuplen)680 static INLINING verdict_t classify_udp(struct __sk_buff *skb,
681 				       struct bpf_sock_tuple *tuple, uint64_t tuplen)
682 {
683 	struct bpf_sock *sk =
684 		bpf_sk_lookup_udp(skb, tuple, tuplen, BPF_F_CURRENT_NETNS, 0);
685 	if (sk == NULL) {
686 		return UNKNOWN;
687 	}
688 
689 	if (sk->state == BPF_TCP_ESTABLISHED) {
690 		bpf_sk_release(sk);
691 		return ESTABLISHED;
692 	}
693 
694 	bpf_sk_release(sk);
695 	return UNKNOWN;
696 }
697 
classify_icmp(struct __sk_buff * skb,uint8_t proto,struct bpf_sock_tuple * tuple,uint64_t tuplen,metrics_t * metrics)698 static INLINING verdict_t classify_icmp(struct __sk_buff *skb, uint8_t proto,
699 					struct bpf_sock_tuple *tuple, uint64_t tuplen,
700 					metrics_t *metrics)
701 {
702 	switch (proto) {
703 	case IPPROTO_TCP:
704 		return classify_tcp(skb, tuple, tuplen, NULL, NULL);
705 
706 	case IPPROTO_UDP:
707 		return classify_udp(skb, tuple, tuplen);
708 
709 	default:
710 		metrics->errors_total_malformed_icmp++;
711 		return INVALID;
712 	}
713 }
714 
process_icmpv4(buf_t * pkt,metrics_t * metrics)715 static INLINING verdict_t process_icmpv4(buf_t *pkt, metrics_t *metrics)
716 {
717 	struct icmphdr icmp;
718 	if (!buf_copy(pkt, &icmp, sizeof(icmp))) {
719 		metrics->errors_total_malformed_icmp++;
720 		return INVALID;
721 	}
722 
723 	/* We should never receive encapsulated echo replies. */
724 	if (icmp.type == ICMP_ECHOREPLY) {
725 		metrics->errors_total_icmp_echo_replies++;
726 		return INVALID;
727 	}
728 
729 	if (icmp.type == ICMP_ECHO) {
730 		return ECHO_REQUEST;
731 	}
732 
733 	if (icmp.type != ICMP_DEST_UNREACH || icmp.code != ICMP_FRAG_NEEDED) {
734 		metrics->errors_total_unwanted_icmp++;
735 		return INVALID;
736 	}
737 
738 	struct iphdr _ip4;
739 	const struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
740 	if (ipv4 == NULL) {
741 		metrics->errors_total_malformed_icmp_pkt_too_big++;
742 		return INVALID;
743 	}
744 
745 	/* The source address in the outer IP header is from the entity that
746 	 * originated the ICMP message. Use the original IP header to restore
747 	 * the correct flow tuple.
748 	 */
749 	struct bpf_sock_tuple tuple;
750 	tuple.ipv4.saddr = ipv4->daddr;
751 	tuple.ipv4.daddr = ipv4->saddr;
752 
753 	if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv4.sport)) {
754 		metrics->errors_total_malformed_icmp_pkt_too_big++;
755 		return INVALID;
756 	}
757 
758 	return classify_icmp(pkt->skb, ipv4->protocol, &tuple,
759 			     sizeof(tuple.ipv4), metrics);
760 }
761 
process_icmpv6(buf_t * pkt,metrics_t * metrics)762 static INLINING verdict_t process_icmpv6(buf_t *pkt, metrics_t *metrics)
763 {
764 	struct icmp6hdr icmp6;
765 	if (!buf_copy(pkt, &icmp6, sizeof(icmp6))) {
766 		metrics->errors_total_malformed_icmp++;
767 		return INVALID;
768 	}
769 
770 	/* We should never receive encapsulated echo replies. */
771 	if (icmp6.icmp6_type == ICMPV6_ECHO_REPLY) {
772 		metrics->errors_total_icmp_echo_replies++;
773 		return INVALID;
774 	}
775 
776 	if (icmp6.icmp6_type == ICMPV6_ECHO_REQUEST) {
777 		return ECHO_REQUEST;
778 	}
779 
780 	if (icmp6.icmp6_type != ICMPV6_PKT_TOOBIG) {
781 		metrics->errors_total_unwanted_icmp++;
782 		return INVALID;
783 	}
784 
785 	bool is_fragment;
786 	uint8_t l4_proto;
787 	struct ipv6hdr _ipv6;
788 	const struct ipv6hdr *ipv6 =
789 		pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
790 	if (ipv6 == NULL) {
791 		metrics->errors_total_malformed_icmp_pkt_too_big++;
792 		return INVALID;
793 	}
794 
795 	if (is_fragment) {
796 		metrics->errors_total_fragmented_ip++;
797 		return INVALID;
798 	}
799 
800 	/* Swap source and dest addresses. */
801 	struct bpf_sock_tuple tuple;
802 	memcpy(&tuple.ipv6.saddr, &ipv6->daddr, sizeof(tuple.ipv6.saddr));
803 	memcpy(&tuple.ipv6.daddr, &ipv6->saddr, sizeof(tuple.ipv6.daddr));
804 
805 	if (!pkt_parse_icmp_l4_ports(pkt, (flow_ports_t *)&tuple.ipv6.sport)) {
806 		metrics->errors_total_malformed_icmp_pkt_too_big++;
807 		return INVALID;
808 	}
809 
810 	return classify_icmp(pkt->skb, l4_proto, &tuple, sizeof(tuple.ipv6),
811 			     metrics);
812 }
813 
process_tcp(buf_t * pkt,void * iph,uint64_t iphlen,metrics_t * metrics)814 static INLINING verdict_t process_tcp(buf_t *pkt, void *iph, uint64_t iphlen,
815 				      metrics_t *metrics)
816 {
817 	metrics->l4_protocol_packets_total_tcp++;
818 
819 	struct tcphdr _tcp;
820 	struct tcphdr *tcp = buf_assign(pkt, sizeof(_tcp), &_tcp);
821 	if (tcp == NULL) {
822 		metrics->errors_total_malformed_tcp++;
823 		return INVALID;
824 	}
825 
826 	if (tcp->syn) {
827 		return SYN;
828 	}
829 
830 	struct bpf_sock_tuple tuple;
831 	uint64_t tuplen =
832 		fill_tuple(&tuple, iph, iphlen, tcp->source, tcp->dest);
833 	return classify_tcp(pkt->skb, &tuple, tuplen, iph, tcp);
834 }
835 
process_udp(buf_t * pkt,void * iph,uint64_t iphlen,metrics_t * metrics)836 static INLINING verdict_t process_udp(buf_t *pkt, void *iph, uint64_t iphlen,
837 				      metrics_t *metrics)
838 {
839 	metrics->l4_protocol_packets_total_udp++;
840 
841 	struct udphdr _udp;
842 	struct udphdr *udph = buf_assign(pkt, sizeof(_udp), &_udp);
843 	if (udph == NULL) {
844 		metrics->errors_total_malformed_udp++;
845 		return INVALID;
846 	}
847 
848 	struct bpf_sock_tuple tuple;
849 	uint64_t tuplen =
850 		fill_tuple(&tuple, iph, iphlen, udph->source, udph->dest);
851 	return classify_udp(pkt->skb, &tuple, tuplen);
852 }
853 
process_ipv4(buf_t * pkt,metrics_t * metrics)854 static INLINING verdict_t process_ipv4(buf_t *pkt, metrics_t *metrics)
855 {
856 	metrics->l3_protocol_packets_total_ipv4++;
857 
858 	struct iphdr _ip4;
859 	struct iphdr *ipv4 = pkt_parse_ipv4(pkt, &_ip4);
860 	if (ipv4 == NULL) {
861 		metrics->errors_total_malformed_ip++;
862 		return INVALID;
863 	}
864 
865 	if (ipv4->version != 4) {
866 		metrics->errors_total_malformed_ip++;
867 		return INVALID;
868 	}
869 
870 	if (ipv4_is_fragment(ipv4)) {
871 		metrics->errors_total_fragmented_ip++;
872 		return INVALID;
873 	}
874 
875 	switch (ipv4->protocol) {
876 	case IPPROTO_ICMP:
877 		return process_icmpv4(pkt, metrics);
878 
879 	case IPPROTO_TCP:
880 		return process_tcp(pkt, ipv4, sizeof(*ipv4), metrics);
881 
882 	case IPPROTO_UDP:
883 		return process_udp(pkt, ipv4, sizeof(*ipv4), metrics);
884 
885 	default:
886 		metrics->errors_total_unknown_l4_proto++;
887 		return INVALID;
888 	}
889 }
890 
process_ipv6(buf_t * pkt,metrics_t * metrics)891 static INLINING verdict_t process_ipv6(buf_t *pkt, metrics_t *metrics)
892 {
893 	metrics->l3_protocol_packets_total_ipv6++;
894 
895 	uint8_t l4_proto;
896 	bool is_fragment;
897 	struct ipv6hdr _ipv6;
898 	struct ipv6hdr *ipv6 =
899 		pkt_parse_ipv6(pkt, &_ipv6, &l4_proto, &is_fragment);
900 	if (ipv6 == NULL) {
901 		metrics->errors_total_malformed_ip++;
902 		return INVALID;
903 	}
904 
905 	if (ipv6->version != 6) {
906 		metrics->errors_total_malformed_ip++;
907 		return INVALID;
908 	}
909 
910 	if (is_fragment) {
911 		metrics->errors_total_fragmented_ip++;
912 		return INVALID;
913 	}
914 
915 	switch (l4_proto) {
916 	case IPPROTO_ICMPV6:
917 		return process_icmpv6(pkt, metrics);
918 
919 	case IPPROTO_TCP:
920 		return process_tcp(pkt, ipv6, sizeof(*ipv6), metrics);
921 
922 	case IPPROTO_UDP:
923 		return process_udp(pkt, ipv6, sizeof(*ipv6), metrics);
924 
925 	default:
926 		metrics->errors_total_unknown_l4_proto++;
927 		return INVALID;
928 	}
929 }
930 
931 SEC("tc")
cls_redirect(struct __sk_buff * skb)932 int cls_redirect(struct __sk_buff *skb)
933 {
934 	metrics_t *metrics = get_global_metrics();
935 	if (metrics == NULL) {
936 		return TC_ACT_SHOT;
937 	}
938 
939 	metrics->processed_packets_total++;
940 
941 	/* Pass bogus packets as long as we're not sure they're
942 	 * destined for us.
943 	 */
944 	if (skb->protocol != bpf_htons(ETH_P_IP)) {
945 		return TC_ACT_OK;
946 	}
947 
948 	encap_headers_t *encap;
949 
950 	/* Make sure that all encapsulation headers are available in
951 	 * the linear portion of the skb. This makes it easy to manipulate them.
952 	 */
953 	if (bpf_skb_pull_data(skb, sizeof(*encap))) {
954 		return TC_ACT_OK;
955 	}
956 
957 	buf_t pkt = {
958 		.skb = skb,
959 		.head = (uint8_t *)(long)skb->data,
960 		.tail = (uint8_t *)(long)skb->data_end,
961 	};
962 
963 	encap = buf_assign(&pkt, sizeof(*encap), NULL);
964 	if (encap == NULL) {
965 		return TC_ACT_OK;
966 	}
967 
968 	if (encap->ip.ihl != 5) {
969 		/* We never have any options. */
970 		return TC_ACT_OK;
971 	}
972 
973 	if (encap->ip.daddr != ENCAPSULATION_IP ||
974 	    encap->ip.protocol != IPPROTO_UDP) {
975 		return TC_ACT_OK;
976 	}
977 
978 	/* TODO Check UDP length? */
979 	if (encap->udp.dest != ENCAPSULATION_PORT) {
980 		return TC_ACT_OK;
981 	}
982 
983 	/* We now know that the packet is destined to us, we can
984 	 * drop bogus ones.
985 	 */
986 	if (ipv4_is_fragment((void *)&encap->ip)) {
987 		metrics->errors_total_fragmented_ip++;
988 		return TC_ACT_SHOT;
989 	}
990 
991 	if (encap->gue.variant != 0) {
992 		metrics->errors_total_malformed_encapsulation++;
993 		return TC_ACT_SHOT;
994 	}
995 
996 	if (encap->gue.control != 0) {
997 		metrics->errors_total_malformed_encapsulation++;
998 		return TC_ACT_SHOT;
999 	}
1000 
1001 	if (encap->gue.flags != 0) {
1002 		metrics->errors_total_malformed_encapsulation++;
1003 		return TC_ACT_SHOT;
1004 	}
1005 
1006 	if (encap->gue.hlen !=
1007 	    sizeof(encap->unigue) / 4 + encap->unigue.hop_count) {
1008 		metrics->errors_total_malformed_encapsulation++;
1009 		return TC_ACT_SHOT;
1010 	}
1011 
1012 	if (encap->unigue.version != 0) {
1013 		metrics->errors_total_malformed_encapsulation++;
1014 		return TC_ACT_SHOT;
1015 	}
1016 
1017 	if (encap->unigue.reserved != 0) {
1018 		return TC_ACT_SHOT;
1019 	}
1020 
1021 	struct in_addr next_hop;
1022 	MAYBE_RETURN(get_next_hop(&pkt, encap, &next_hop));
1023 
1024 	if (next_hop.s_addr == 0) {
1025 		metrics->accepted_packets_total_last_hop++;
1026 		return accept_locally(skb, encap);
1027 	}
1028 
1029 	verdict_t verdict;
1030 	switch (encap->gue.proto_ctype) {
1031 	case IPPROTO_IPIP:
1032 		verdict = process_ipv4(&pkt, metrics);
1033 		break;
1034 
1035 	case IPPROTO_IPV6:
1036 		verdict = process_ipv6(&pkt, metrics);
1037 		break;
1038 
1039 	default:
1040 		metrics->errors_total_unknown_l3_proto++;
1041 		return TC_ACT_SHOT;
1042 	}
1043 
1044 	switch (verdict) {
1045 	case INVALID:
1046 		/* metrics have already been bumped */
1047 		return TC_ACT_SHOT;
1048 
1049 	case UNKNOWN:
1050 		return forward_to_next_hop(skb, encap, &next_hop, metrics);
1051 
1052 	case ECHO_REQUEST:
1053 		metrics->accepted_packets_total_icmp_echo_request++;
1054 		break;
1055 
1056 	case SYN:
1057 		if (encap->unigue.forward_syn) {
1058 			return forward_to_next_hop(skb, encap, &next_hop,
1059 						   metrics);
1060 		}
1061 
1062 		metrics->accepted_packets_total_syn++;
1063 		break;
1064 
1065 	case SYN_COOKIE:
1066 		metrics->accepted_packets_total_syn_cookies++;
1067 		break;
1068 
1069 	case ESTABLISHED:
1070 		metrics->accepted_packets_total_established++;
1071 		break;
1072 	}
1073 
1074 	return accept_locally(skb, encap);
1075 }
1076