1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Implementation of the Transmission Control Protocol(TCP).
8 *
9 * IPv4 specific functions
10 *
11 * code split from:
12 * linux/ipv4/tcp.c
13 * linux/ipv4/tcp_input.c
14 * linux/ipv4/tcp_output.c
15 *
16 * See tcp.c for author information
17 */
18
19 /*
20 * Changes:
21 * David S. Miller : New socket lookup architecture.
22 * This code is dedicated to John Dyson.
23 * David S. Miller : Change semantics of established hash,
24 * half is devoted to TIME_WAIT sockets
25 * and the rest go in the other half.
26 * Andi Kleen : Add support for syncookies and fixed
27 * some bugs: ip options weren't passed to
28 * the TCP layer, missed a check for an
29 * ACK bit.
30 * Andi Kleen : Implemented fast path mtu discovery.
31 * Fixed many serious bugs in the
32 * request_sock handling and moved
33 * most of it into the af independent code.
34 * Added tail drop and some other bugfixes.
35 * Added new listen semantics.
36 * Mike McLagan : Routing by source
37 * Juan Jose Ciarlante: ip_dynaddr bits
38 * Andi Kleen: various fixes.
39 * Vitaly E. Lavrov : Transparent proxy revived after year
40 * coma.
41 * Andi Kleen : Fix new listen.
42 * Andi Kleen : Fix accept error reporting.
43 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
44 * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind
45 * a single port at the same time.
46 */
47
48 #define pr_fmt(fmt) "TCP: " fmt
49
50 #include <linux/bottom_half.h>
51 #include <linux/types.h>
52 #include <linux/fcntl.h>
53 #include <linux/module.h>
54 #include <linux/random.h>
55 #include <linux/cache.h>
56 #include <linux/jhash.h>
57 #include <linux/init.h>
58 #include <linux/times.h>
59 #include <linux/slab.h>
60
61 #include <net/net_namespace.h>
62 #include <net/icmp.h>
63 #include <net/inet_hashtables.h>
64 #include <net/tcp.h>
65 #include <net/transp_v6.h>
66 #include <net/ipv6.h>
67 #include <net/inet_common.h>
68 #include <net/timewait_sock.h>
69 #include <net/xfrm.h>
70 #include <net/secure_seq.h>
71 #include <net/busy_poll.h>
72
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/inetdevice.h>
79 #include <linux/btf_ids.h>
80
81 #include <crypto/hash.h>
82 #include <linux/scatterlist.h>
83
84 #include <trace/events/tcp.h>
85
86 #ifdef CONFIG_TCP_MD5SIG
87 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
88 __be32 daddr, __be32 saddr, const struct tcphdr *th);
89 #endif
90
91 struct inet_hashinfo tcp_hashinfo;
92 EXPORT_SYMBOL(tcp_hashinfo);
93
94 static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
95
96 static u32 tcp_v4_init_seq(const struct sk_buff *skb)
97 {
98 return secure_tcp_seq(ip_hdr(skb)->daddr,
99 ip_hdr(skb)->saddr,
100 tcp_hdr(skb)->dest,
101 tcp_hdr(skb)->source);
102 }
103
104 static u32 tcp_v4_init_ts_off(const struct net *net, const struct sk_buff *skb)
105 {
106 return secure_tcp_ts_off(net, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
107 }
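
/* Both helpers above follow the RFC 6528 approach: the initial sequence
 * number is a keyed hash of the address/port 4-tuple mixed with a clock,
 * and the timestamp offset is a keyed hash of the address pair, so the
 * values are unpredictable off-path yet stable for a given flow.
 */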
108
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 int reuse = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tw_reuse);
112 const struct inet_timewait_sock *tw = inet_twsk(sktw);
113 const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
114 struct tcp_sock *tp = tcp_sk(sk);
115
116 if (reuse == 2) {
117 /* Still does not detect *everything* that goes through
118 * lo, since we require a loopback src or dst address
119 * or direct binding to 'lo' interface.
120 */
121 bool loopback = false;
122 if (tw->tw_bound_dev_if == LOOPBACK_IFINDEX)
123 loopback = true;
124 #if IS_ENABLED(CONFIG_IPV6)
125 if (tw->tw_family == AF_INET6) {
126 if (ipv6_addr_loopback(&tw->tw_v6_daddr) ||
127 ipv6_addr_v4mapped_loopback(&tw->tw_v6_daddr) ||
128 ipv6_addr_loopback(&tw->tw_v6_rcv_saddr) ||
129 ipv6_addr_v4mapped_loopback(&tw->tw_v6_rcv_saddr))
130 loopback = true;
131 } else
132 #endif
133 {
134 if (ipv4_is_loopback(tw->tw_daddr) ||
135 ipv4_is_loopback(tw->tw_rcv_saddr))
136 loopback = true;
137 }
138 if (!loopback)
139 reuse = 0;
140 }
141
142 /* With PAWS, it is safe from the viewpoint
143 of data integrity. Even without PAWS it is safe provided sequence
144 spaces do not overlap i.e. at data rates <= 80Mbit/sec.
145
146 Actually, the idea is close to VJ's one, only timestamp cache is
147 held not per host, but per port pair and TW bucket is used as state
148 holder.
149
150 If TW bucket has been already destroyed we fall back to VJ's scheme
151 and use initial timestamp retrieved from peer table.
152 */
153 if (tcptw->tw_ts_recent_stamp &&
154 (!twp || (reuse && time_after32(ktime_get_seconds(),
155 tcptw->tw_ts_recent_stamp)))) {
156 /* In case of repair and re-using TIME-WAIT sockets we still
157 * want to be sure that it is safe as above but honor the
158 * sequence numbers and time stamps set as part of the repair
159 * process.
160 *
161 * Without this check re-using a TIME-WAIT socket with TCP
162 * repair would accumulate a -1 on the repair assigned
163 * sequence number. The first time it is reused the sequence
164 * is -1, the second time -2, etc. This fixes that issue
165 * without appearing to create any others.
166 */
167 if (likely(!tp->repair)) {
168 u32 seq = tcptw->tw_snd_nxt + 65535 + 2;
169
170 if (!seq)
171 seq = 1;
172 WRITE_ONCE(tp->write_seq, seq);
173 tp->rx_opt.ts_recent = tcptw->tw_ts_recent;
174 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
175 }
176 sock_hold(sktw);
177 return 1;
178 }
179
180 return 0;
181 }
182 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
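
/* Usage note (sketch): the path above is steered by the
 * net.ipv4.tcp_tw_reuse sysctl. 0 disables reuse of TIME-WAIT sockets for
 * new outgoing connections, 1 allows it when the timestamp checks above
 * say it is safe, and 2 (the "reuse == 2" branch) restricts reuse to
 * loopback traffic. A minimal userspace sketch to select loopback-only
 * reuse, assuming procfs is mounted at /proc:
 *
 *	FILE *f = fopen("/proc/sys/net/ipv4/tcp_tw_reuse", "w");
 *
 *	if (f) {
 *		fputs("2", f);
 *		fclose(f);
 *	}
 */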
183
184 static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
185 int addr_len)
186 {
187 /* This check is replicated from tcp_v4_connect() and intended to
188 * prevent the BPF program called below from accessing bytes that are
189 * out of the bounds specified by the user in addr_len.
190 */
191 if (addr_len < sizeof(struct sockaddr_in))
192 return -EINVAL;
193
194 sock_owned_by_me(sk);
195
196 return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
197 }
198
199 /* This will initiate an outgoing connection. */
200 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
201 {
202 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
203 struct inet_sock *inet = inet_sk(sk);
204 struct tcp_sock *tp = tcp_sk(sk);
205 __be16 orig_sport, orig_dport;
206 __be32 daddr, nexthop;
207 struct flowi4 *fl4;
208 struct rtable *rt;
209 int err;
210 struct ip_options_rcu *inet_opt;
211 struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row;
212
213 if (addr_len < sizeof(struct sockaddr_in))
214 return -EINVAL;
215
216 if (usin->sin_family != AF_INET)
217 return -EAFNOSUPPORT;
218
219 nexthop = daddr = usin->sin_addr.s_addr;
220 inet_opt = rcu_dereference_protected(inet->inet_opt,
221 lockdep_sock_is_held(sk));
222 if (inet_opt && inet_opt->opt.srr) {
223 if (!daddr)
224 return -EINVAL;
225 nexthop = inet_opt->opt.faddr;
226 }
227
228 orig_sport = inet->inet_sport;
229 orig_dport = usin->sin_port;
230 fl4 = &inet->cork.fl.u.ip4;
231 rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
232 sk->sk_bound_dev_if, IPPROTO_TCP, orig_sport,
233 orig_dport, sk);
234 if (IS_ERR(rt)) {
235 err = PTR_ERR(rt);
236 if (err == -ENETUNREACH)
237 IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
238 return err;
239 }
240
241 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
242 ip_rt_put(rt);
243 return -ENETUNREACH;
244 }
245
246 if (!inet_opt || !inet_opt->opt.srr)
247 daddr = fl4->daddr;
248
249 if (!inet->inet_saddr)
250 inet->inet_saddr = fl4->saddr;
251 sk_rcv_saddr_set(sk, inet->inet_saddr);
252
253 if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
254 /* Reset inherited state */
255 tp->rx_opt.ts_recent = 0;
256 tp->rx_opt.ts_recent_stamp = 0;
257 if (likely(!tp->repair))
258 WRITE_ONCE(tp->write_seq, 0);
259 }
260
261 inet->inet_dport = usin->sin_port;
262 sk_daddr_set(sk, daddr);
263
264 inet_csk(sk)->icsk_ext_hdr_len = 0;
265 if (inet_opt)
266 inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
267
268 tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
269
270 /* Socket identity is still unknown (sport may be zero).
271 * However we set state to SYN-SENT and, without releasing the socket
272 * lock, select a source port, enter ourselves into the hash tables and
273 * complete initialization after this.
274 */
275 tcp_set_state(sk, TCP_SYN_SENT);
276 err = inet_hash_connect(tcp_death_row, sk);
277 if (err)
278 goto failure;
279
280 sk_set_txhash(sk);
281
282 rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
283 inet->inet_sport, inet->inet_dport, sk);
284 if (IS_ERR(rt)) {
285 err = PTR_ERR(rt);
286 rt = NULL;
287 goto failure;
288 }
289 /* OK, now commit destination to socket. */
290 sk->sk_gso_type = SKB_GSO_TCPV4;
291 sk_setup_caps(sk, &rt->dst);
292 rt = NULL;
293
294 if (likely(!tp->repair)) {
295 if (!tp->write_seq)
296 WRITE_ONCE(tp->write_seq,
297 secure_tcp_seq(inet->inet_saddr,
298 inet->inet_daddr,
299 inet->inet_sport,
300 usin->sin_port));
301 tp->tsoffset = secure_tcp_ts_off(sock_net(sk),
302 inet->inet_saddr,
303 inet->inet_daddr);
304 }
305
306 inet->inet_id = prandom_u32();
307
308 if (tcp_fastopen_defer_connect(sk, &err))
309 return err;
310 if (err)
311 goto failure;
312
313 err = tcp_connect(sk);
314
315 if (err)
316 goto failure;
317
318 return 0;
319
320 failure:
321 /*
322 * This unhashes the socket and releases the local port,
323 * if necessary.
324 */
325 tcp_set_state(sk, TCP_CLOSE);
326 ip_rt_put(rt);
327 sk->sk_route_caps = 0;
328 inet->inet_dport = 0;
329 return err;
330 }
331 EXPORT_SYMBOL(tcp_v4_connect);
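
/* Example (sketch): this function is what ultimately runs when userspace
 * issues connect(2) on an AF_INET stream socket. A minimal caller, with
 * error handling elided and 192.0.2.1:80 as a purely illustrative
 * destination, looks like:
 *
 *	#include <arpa/inet.h>
 *	#include <sys/socket.h>
 *
 *	struct sockaddr_in dst = { .sin_family = AF_INET,
 *				   .sin_port = htons(80) };
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * A negative return value from tcp_v4_connect() (e.g. -ENETUNREACH when
 * no route exists) is what that connect(2) call reports back as errno.
 */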
332
333 /*
334 * This routine reacts to ICMP_FRAG_NEEDED mtu indications as defined in RFC1191.
335 * It can be called through tcp_release_cb() if socket was owned by user
336 * at the time tcp_v4_err() was called to handle ICMP message.
337 */
338 void tcp_v4_mtu_reduced(struct sock *sk)
339 {
340 struct inet_sock *inet = inet_sk(sk);
341 struct dst_entry *dst;
342 u32 mtu;
343
344 if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
345 return;
346 mtu = READ_ONCE(tcp_sk(sk)->mtu_info);
347 dst = inet_csk_update_pmtu(sk, mtu);
348 if (!dst)
349 return;
350
351 /* Something is about to go wrong... Remember the soft error
352 * in case this connection will not be able to recover.
353 */
354 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
355 sk->sk_err_soft = EMSGSIZE;
356
357 mtu = dst_mtu(dst);
358
359 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
360 ip_sk_accept_pmtu(sk) &&
361 inet_csk(sk)->icsk_pmtu_cookie > mtu) {
362 tcp_sync_mss(sk, mtu);
363
364 /* Resend the TCP packet because it's
365 * clear that the old packet has been
366 * dropped. This is the new "fast" path mtu
367 * discovery.
368 */
369 tcp_simple_retransmit(sk);
370 } /* else let the usual retransmit timer handle it */
371 }
372 EXPORT_SYMBOL(tcp_v4_mtu_reduced);
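
/* Usage note (sketch): whether the fast-path MTU reduction above helps
 * also depends on the socket's PMTU discovery mode. A userspace peer can
 * request strict path MTU discovery and read back the discovered value
 * with the standard IP_MTU_DISCOVER/IP_MTU socket options:
 *
 *	#include <netinet/in.h>
 *	#include <sys/socket.h>
 *
 *	int val = IP_PMTUDISC_DO;	// always set DF, never fragment
 *	int mtu;
 *	socklen_t len = sizeof(mtu);
 *
 *	setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
 *	// once connected:
 *	getsockopt(fd, IPPROTO_IP, IP_MTU, &mtu, &len);
 *
 * "fd" is assumed to be a connected TCP socket; IP_MTU is only meaningful
 * once a destination (and hence a cached route) is attached to it.
 */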
373
374 static void do_redirect(struct sk_buff *skb, struct sock *sk)
375 {
376 struct dst_entry *dst = __sk_dst_check(sk, 0);
377
378 if (dst)
379 dst->ops->redirect(dst, sk, skb);
380 }
381
382
383 /* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
384 void tcp_req_err(struct sock *sk, u32 seq, bool abort)
385 {
386 struct request_sock *req = inet_reqsk(sk);
387 struct net *net = sock_net(sk);
388
389 /* ICMPs are not backlogged, hence we cannot get
390 * an established socket here.
391 */
392 if (seq != tcp_rsk(req)->snt_isn) {
393 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
394 } else if (abort) {
395 /*
396 * Still in SYN_RECV, just remove it silently.
397 * There is no good way to pass the error to the newly
398 * created socket, and POSIX does not want network
399 * errors returned from accept().
400 */
401 inet_csk_reqsk_queue_drop(req->rsk_listener, req);
402 tcp_listendrop(req->rsk_listener);
403 }
404 reqsk_put(req);
405 }
406 EXPORT_SYMBOL(tcp_req_err);
407
408 /* TCP-LD (RFC 6069) logic */
409 void tcp_ld_RTO_revert(struct sock *sk, u32 seq)
410 {
411 struct inet_connection_sock *icsk = inet_csk(sk);
412 struct tcp_sock *tp = tcp_sk(sk);
413 struct sk_buff *skb;
414 s32 remaining;
415 u32 delta_us;
416
417 if (sock_owned_by_user(sk))
418 return;
419
420 if (seq != tp->snd_una || !icsk->icsk_retransmits ||
421 !icsk->icsk_backoff)
422 return;
423
424 skb = tcp_rtx_queue_head(sk);
425 if (WARN_ON_ONCE(!skb))
426 return;
427
428 icsk->icsk_backoff--;
429 icsk->icsk_rto = tp->srtt_us ? __tcp_set_rto(tp) : TCP_TIMEOUT_INIT;
430 icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
431
432 tcp_mstamp_refresh(tp);
433 delta_us = (u32)(tp->tcp_mstamp - tcp_skb_timestamp_us(skb));
434 remaining = icsk->icsk_rto - usecs_to_jiffies(delta_us);
435
436 if (remaining > 0) {
437 inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
438 remaining, TCP_RTO_MAX);
439 } else {
440 /* RTO revert clocked out retransmission.
441 * Will retransmit now.
442 */
443 tcp_retransmit_timer(sk);
444 }
445 }
446 EXPORT_SYMBOL(tcp_ld_RTO_revert);
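
/* Background for the above (RFC 6069, TCP-LD): an ICMP unreachable for
 * the earliest outstanding segment proves that the retransmission left
 * this host and travelled at least as far as the reporting router, so
 * the timeout was likely not caused by congestion. One step of the
 * exponential backoff is therefore undone and the retransmit timer is
 * re-armed with the shorter timeout, speeding up recovery once
 * connectivity returns.
 */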
447
448 /*
449 * This routine is called by the ICMP module when it gets some
450 * sort of error condition. If err < 0 then the socket should
451 * be closed and the error returned to the user. If err > 0
452 * it's just the icmp type << 8 | icmp code. After adjustment
453 * header points to the first 8 bytes of the tcp header. We need
454 * to find the appropriate port.
455 *
456 * The locking strategy used here is very "optimistic". When
457 * someone else accesses the socket the ICMP is just dropped
458 * and for some paths there is no check at all.
459 * A more general error queue to queue errors for later handling
460 * is probably better.
461 *
462 */
463
464 int tcp_v4_err(struct sk_buff *skb, u32 info)
465 {
466 const struct iphdr *iph = (const struct iphdr *)skb->data;
467 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
468 struct tcp_sock *tp;
469 struct inet_sock *inet;
470 const int type = icmp_hdr(skb)->type;
471 const int code = icmp_hdr(skb)->code;
472 struct sock *sk;
473 struct request_sock *fastopen;
474 u32 seq, snd_una;
475 int err;
476 struct net *net = dev_net(skb->dev);
477
478 sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr,
479 th->dest, iph->saddr, ntohs(th->source),
480 inet_iif(skb), 0);
481 if (!sk) {
482 __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
483 return -ENOENT;
484 }
485 if (sk->sk_state == TCP_TIME_WAIT) {
486 inet_twsk_put(inet_twsk(sk));
487 return 0;
488 }
489 seq = ntohl(th->seq);
490 if (sk->sk_state == TCP_NEW_SYN_RECV) {
491 tcp_req_err(sk, seq, type == ICMP_PARAMETERPROB ||
492 type == ICMP_TIME_EXCEEDED ||
493 (type == ICMP_DEST_UNREACH &&
494 (code == ICMP_NET_UNREACH ||
495 code == ICMP_HOST_UNREACH)));
496 return 0;
497 }
498
499 bh_lock_sock(sk);
500 /* If too many ICMPs get dropped on busy
501 * servers this needs to be solved differently.
502 * We do take care of PMTU discovery (RFC1191) special case :
503 * we can receive locally generated ICMP messages while socket is held.
504 */
505 if (sock_owned_by_user(sk)) {
506 if (!(type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED))
507 __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS);
508 }
509 if (sk->sk_state == TCP_CLOSE)
510 goto out;
511
512 if (static_branch_unlikely(&ip4_min_ttl)) {
513 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
514 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
515 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
516 goto out;
517 }
518 }
519
520 tp = tcp_sk(sk);
521 /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */
522 fastopen = rcu_dereference(tp->fastopen_rsk);
523 snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una;
524 if (sk->sk_state != TCP_LISTEN &&
525 !between(seq, snd_una, tp->snd_nxt)) {
526 __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS);
527 goto out;
528 }
529
530 switch (type) {
531 case ICMP_REDIRECT:
532 if (!sock_owned_by_user(sk))
533 do_redirect(skb, sk);
534 goto out;
535 case ICMP_SOURCE_QUENCH:
536 /* Just silently ignore these. */
537 goto out;
538 case ICMP_PARAMETERPROB:
539 err = EPROTO;
540 break;
541 case ICMP_DEST_UNREACH:
542 if (code > NR_ICMP_UNREACH)
543 goto out;
544
545 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
546 /* We are not interested in TCP_LISTEN and open_requests
547 * (SYN-ACKs sent out by Linux are always < 576 bytes, so
548 * they should go through unfragmented).
549 */
550 if (sk->sk_state == TCP_LISTEN)
551 goto out;
552
553 WRITE_ONCE(tp->mtu_info, info);
554 if (!sock_owned_by_user(sk)) {
555 tcp_v4_mtu_reduced(sk);
556 } else {
557 if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags))
558 sock_hold(sk);
559 }
560 goto out;
561 }
562
563 err = icmp_err_convert[code].errno;
564 /* check if this ICMP message allows revert of backoff.
565 * (see RFC 6069)
566 */
567 if (!fastopen &&
568 (code == ICMP_NET_UNREACH || code == ICMP_HOST_UNREACH))
569 tcp_ld_RTO_revert(sk, seq);
570 break;
571 case ICMP_TIME_EXCEEDED:
572 err = EHOSTUNREACH;
573 break;
574 default:
575 goto out;
576 }
577
578 switch (sk->sk_state) {
579 case TCP_SYN_SENT:
580 case TCP_SYN_RECV:
581 /* Only in fast or simultaneous open. If a fast open socket is
582 * already accepted it is treated as a connected one below.
583 */
584 if (fastopen && !fastopen->sk)
585 break;
586
587 ip_icmp_error(sk, skb, err, th->dest, info, (u8 *)th);
588
589 if (!sock_owned_by_user(sk)) {
590 sk->sk_err = err;
591
592 sk_error_report(sk);
593
594 tcp_done(sk);
595 } else {
596 sk->sk_err_soft = err;
597 }
598 goto out;
599 }
600
601 /* If we've already connected we will keep trying
602 * until we time out, or the user gives up.
603 *
604 * rfc1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
605 * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
606 * but it is obsoleted by pmtu discovery).
607 *
608 * Note that in the modern internet, where routing is unreliable
609 * and broken firewalls sit in every dark corner sending random
610 * errors ordered by their masters, even these two messages finally lose
611 * their original sense (even Linux sends invalid PORT_UNREACHs).
612 *
613 * Now we are in compliance with RFCs.
614 * --ANK (980905)
615 */
616
617 inet = inet_sk(sk);
618 if (!sock_owned_by_user(sk) && inet->recverr) {
619 sk->sk_err = err;
620 sk_error_report(sk);
621 } else { /* Only an error on timeout */
622 sk->sk_err_soft = err;
623 }
624
625 out:
626 bh_unlock_sock(sk);
627 sock_put(sk);
628 return 0;
629 }
630
631 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr)
632 {
633 struct tcphdr *th = tcp_hdr(skb);
634
635 th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
636 skb->csum_start = skb_transport_header(skb) - skb->head;
637 skb->csum_offset = offsetof(struct tcphdr, check);
638 }
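
/* Note: only the pseudo-header sum is stored in th->check above;
 * skb->csum_start and skb->csum_offset tell the NIC (or the software
 * fallback) where to fold in the one's-complement sum of the TCP header
 * and payload, so the final checksum is completed at transmit time
 * (the CHECKSUM_PARTIAL scheme).
 */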
639
640 /* This routine computes an IPv4 TCP checksum. */
641 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
642 {
643 const struct inet_sock *inet = inet_sk(sk);
644
645 __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
646 }
647 EXPORT_SYMBOL(tcp_v4_send_check);
648
649 /*
650 * This routine will send an RST to the other tcp.
651 *
652 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
653 * for the reset?
654 * Answer: if a packet caused the RST, it is not for a socket
655 * existing in our system; if it is matched to a socket,
656 * it is just a duplicate segment or a bug in the other side's TCP.
657 * So we build the reply based only on the parameters that
658 * arrived with the segment.
659 * Exception: precedence violation. We do not implement it in any case.
660 */
661
662 #ifdef CONFIG_TCP_MD5SIG
663 #define OPTION_BYTES TCPOLEN_MD5SIG_ALIGNED
664 #else
665 #define OPTION_BYTES sizeof(__be32)
666 #endif
667
668 static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
669 {
670 const struct tcphdr *th = tcp_hdr(skb);
671 struct {
672 struct tcphdr th;
673 __be32 opt[OPTION_BYTES / sizeof(__be32)];
674 } rep;
675 struct ip_reply_arg arg;
676 #ifdef CONFIG_TCP_MD5SIG
677 struct tcp_md5sig_key *key = NULL;
678 const __u8 *hash_location = NULL;
679 unsigned char newhash[16];
680 int genhash;
681 struct sock *sk1 = NULL;
682 #endif
683 u64 transmit_time = 0;
684 struct sock *ctl_sk;
685 struct net *net;
686
687 /* Never send a reset in response to a reset. */
688 if (th->rst)
689 return;
690
691 /* If sk not NULL, it means we did a successful lookup and incoming
692 * route had to be correct. prequeue might have dropped our dst.
693 */
694 if (!sk && skb_rtable(skb)->rt_type != RTN_LOCAL)
695 return;
696
697 /* Swap the send and the receive. */
698 memset(&rep, 0, sizeof(rep));
699 rep.th.dest = th->source;
700 rep.th.source = th->dest;
701 rep.th.doff = sizeof(struct tcphdr) / 4;
702 rep.th.rst = 1;
703
704 if (th->ack) {
705 rep.th.seq = th->ack_seq;
706 } else {
707 rep.th.ack = 1;
708 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
709 skb->len - (th->doff << 2));
710 }
711
712 memset(&arg, 0, sizeof(arg));
713 arg.iov[0].iov_base = (unsigned char *)&rep;
714 arg.iov[0].iov_len = sizeof(rep.th);
715
716 net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
717 #ifdef CONFIG_TCP_MD5SIG
718 rcu_read_lock();
719 hash_location = tcp_parse_md5sig_option(th);
720 if (sk && sk_fullsock(sk)) {
721 const union tcp_md5_addr *addr;
722 int l3index;
723
724 /* sdif set, means packet ingressed via a device
725 * in an L3 domain and inet_iif is set to it.
726 */
727 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
728 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
729 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
730 } else if (hash_location) {
731 const union tcp_md5_addr *addr;
732 int sdif = tcp_v4_sdif(skb);
733 int dif = inet_iif(skb);
734 int l3index;
735
736 /*
737 * The active side is lost. Try to find the listening socket through
738 * the source port, and then find the md5 key through the listening socket.
739 * We do not lose security here:
740 * the incoming packet is checked against the md5 hash of the found key;
741 * no RST is generated if the md5 hash doesn't match.
742 */
743 sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
744 ip_hdr(skb)->saddr,
745 th->source, ip_hdr(skb)->daddr,
746 ntohs(th->source), dif, sdif);
747 /* don't send rst if it can't find key */
748 if (!sk1)
749 goto out;
750
751 /* sdif set, means packet ingressed via a device
752 * in an L3 domain and dif is set to it.
753 */
754 l3index = sdif ? dif : 0;
755 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
756 key = tcp_md5_do_lookup(sk1, l3index, addr, AF_INET);
757 if (!key)
758 goto out;
759
760
761 genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
762 if (genhash || memcmp(hash_location, newhash, 16) != 0)
763 goto out;
764
765 }
766
767 if (key) {
768 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
769 (TCPOPT_NOP << 16) |
770 (TCPOPT_MD5SIG << 8) |
771 TCPOLEN_MD5SIG);
772 /* Update length and the length the header thinks exists */
773 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
774 rep.th.doff = arg.iov[0].iov_len / 4;
775
776 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
777 key, ip_hdr(skb)->saddr,
778 ip_hdr(skb)->daddr, &rep.th);
779 }
780 #endif
781 /* Can't co-exist with TCPMD5, hence check rep.opt[0] */
782 if (rep.opt[0] == 0) {
783 __be32 mrst = mptcp_reset_option(skb);
784
785 if (mrst) {
786 rep.opt[0] = mrst;
787 arg.iov[0].iov_len += sizeof(mrst);
788 rep.th.doff = arg.iov[0].iov_len / 4;
789 }
790 }
791
792 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
793 ip_hdr(skb)->saddr, /* XXX */
794 arg.iov[0].iov_len, IPPROTO_TCP, 0);
795 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
796 arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
797
798 /* When the socket is gone, all binding information is lost.
799 * Routing might fail in this case. No choice here: if we choose to force
800 * the input interface, we will misroute in case of an asymmetric route.
801 */
802 if (sk) {
803 arg.bound_dev_if = sk->sk_bound_dev_if;
804 if (sk_fullsock(sk))
805 trace_tcp_send_reset(sk, skb);
806 }
807
808 BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
809 offsetof(struct inet_timewait_sock, tw_bound_dev_if));
810
811 arg.tos = ip_hdr(skb)->tos;
812 arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
813 local_bh_disable();
814 ctl_sk = this_cpu_read(ipv4_tcp_sk);
815 sock_net_set(ctl_sk, net);
816 if (sk) {
817 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
818 inet_twsk(sk)->tw_mark : sk->sk_mark;
819 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
820 inet_twsk(sk)->tw_priority : sk->sk_priority;
821 transmit_time = tcp_transmit_time(sk);
822 }
823 ip_send_unicast_reply(ctl_sk,
824 skb, &TCP_SKB_CB(skb)->header.h4.opt,
825 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
826 &arg, arg.iov[0].iov_len,
827 transmit_time);
828
829 ctl_sk->sk_mark = 0;
830 sock_net_set(ctl_sk, &init_net);
831 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
832 __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
833 local_bh_enable();
834
835 #ifdef CONFIG_TCP_MD5SIG
836 out:
837 rcu_read_unlock();
838 #endif
839 }
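
/* The seq/ack choice above follows the RFC 793 (now RFC 9293) reset
 * generation rules: if the offending segment carried an ACK, the RST
 * reuses that ACK value as its own sequence number; otherwise the RST
 * ACKs everything the segment covered (SEG.SEQ + SEG.LEN, with SYN and
 * FIN each counting as one).
 */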
840
841 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
842 outside of socket context, is certainly ugly. What can I do?
843 */
844
845 static void tcp_v4_send_ack(const struct sock *sk,
846 struct sk_buff *skb, u32 seq, u32 ack,
847 u32 win, u32 tsval, u32 tsecr, int oif,
848 struct tcp_md5sig_key *key,
849 int reply_flags, u8 tos)
850 {
851 const struct tcphdr *th = tcp_hdr(skb);
852 struct {
853 struct tcphdr th;
854 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
855 #ifdef CONFIG_TCP_MD5SIG
856 + (TCPOLEN_MD5SIG_ALIGNED >> 2)
857 #endif
858 ];
859 } rep;
860 struct net *net = sock_net(sk);
861 struct ip_reply_arg arg;
862 struct sock *ctl_sk;
863 u64 transmit_time;
864
865 memset(&rep.th, 0, sizeof(struct tcphdr));
866 memset(&arg, 0, sizeof(arg));
867
868 arg.iov[0].iov_base = (unsigned char *)&rep;
869 arg.iov[0].iov_len = sizeof(rep.th);
870 if (tsecr) {
871 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
872 (TCPOPT_TIMESTAMP << 8) |
873 TCPOLEN_TIMESTAMP);
874 rep.opt[1] = htonl(tsval);
875 rep.opt[2] = htonl(tsecr);
876 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
877 }
878
879 /* Swap the send and the receive. */
880 rep.th.dest = th->source;
881 rep.th.source = th->dest;
882 rep.th.doff = arg.iov[0].iov_len / 4;
883 rep.th.seq = htonl(seq);
884 rep.th.ack_seq = htonl(ack);
885 rep.th.ack = 1;
886 rep.th.window = htons(win);
887
888 #ifdef CONFIG_TCP_MD5SIG
889 if (key) {
890 int offset = (tsecr) ? 3 : 0;
891
892 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
893 (TCPOPT_NOP << 16) |
894 (TCPOPT_MD5SIG << 8) |
895 TCPOLEN_MD5SIG);
896 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
897 rep.th.doff = arg.iov[0].iov_len/4;
898
899 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
900 key, ip_hdr(skb)->saddr,
901 ip_hdr(skb)->daddr, &rep.th);
902 }
903 #endif
904 arg.flags = reply_flags;
905 arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
906 ip_hdr(skb)->saddr, /* XXX */
907 arg.iov[0].iov_len, IPPROTO_TCP, 0);
908 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
909 if (oif)
910 arg.bound_dev_if = oif;
911 arg.tos = tos;
912 arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
913 local_bh_disable();
914 ctl_sk = this_cpu_read(ipv4_tcp_sk);
915 sock_net_set(ctl_sk, net);
916 ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
917 inet_twsk(sk)->tw_mark : sk->sk_mark;
918 ctl_sk->sk_priority = (sk->sk_state == TCP_TIME_WAIT) ?
919 inet_twsk(sk)->tw_priority : sk->sk_priority;
920 transmit_time = tcp_transmit_time(sk);
921 ip_send_unicast_reply(ctl_sk,
922 skb, &TCP_SKB_CB(skb)->header.h4.opt,
923 ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
924 &arg, arg.iov[0].iov_len,
925 transmit_time);
926
927 ctl_sk->sk_mark = 0;
928 sock_net_set(ctl_sk, &init_net);
929 __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
930 local_bh_enable();
931 }
932
933 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
934 {
935 struct inet_timewait_sock *tw = inet_twsk(sk);
936 struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
937
938 tcp_v4_send_ack(sk, skb,
939 tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
940 tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
941 tcp_time_stamp_raw() + tcptw->tw_ts_offset,
942 tcptw->tw_ts_recent,
943 tw->tw_bound_dev_if,
944 tcp_twsk_md5_key(tcptw),
945 tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
946 tw->tw_tos
947 );
948
949 inet_twsk_put(tw);
950 }
951
952 static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
953 struct request_sock *req)
954 {
955 const union tcp_md5_addr *addr;
956 int l3index;
957
958 /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
959 * sk->sk_state == TCP_SYN_RECV -> for Fast Open.
960 */
961 u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
962 tcp_sk(sk)->snd_nxt;
963
964 /* RFC 7323 2.3
965 * The window field (SEG.WND) of every outgoing segment, with the
966 * exception of <SYN> segments, MUST be right-shifted by
967 * Rcv.Wind.Shift bits:
968 */
969 addr = (union tcp_md5_addr *)&ip_hdr(skb)->saddr;
970 l3index = tcp_v4_sdif(skb) ? inet_iif(skb) : 0;
971 tcp_v4_send_ack(sk, skb, seq,
972 tcp_rsk(req)->rcv_nxt,
973 req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
974 tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
975 req->ts_recent,
976 0,
977 tcp_md5_do_lookup(sk, l3index, addr, AF_INET),
978 inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
979 ip_hdr(skb)->tos);
980 }
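
/* Worked example for the RFC 7323 shift above (illustrative numbers):
 * with req->rsk_rcv_wnd == 262144 and rcv_wscale == 7, the 16-bit
 * window field carries 262144 >> 7 == 2048, and the peer multiplies it
 * back up by the same factor.
 */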
981
982 /*
983 * Send a SYN-ACK after having received a SYN.
984 * This still operates on a request_sock only, not on a big
985 * socket.
986 */
987 static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
988 struct flowi *fl,
989 struct request_sock *req,
990 struct tcp_fastopen_cookie *foc,
991 enum tcp_synack_type synack_type,
992 struct sk_buff *syn_skb)
993 {
994 const struct inet_request_sock *ireq = inet_rsk(req);
995 struct flowi4 fl4;
996 int err = -1;
997 struct sk_buff *skb;
998 u8 tos;
999
1000 /* First, grab a route. */
1001 if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
1002 return -1;
1003
1004 skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
1005
1006 if (skb) {
1007 __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
1008
1009 tos = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ?
1010 (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) |
1011 (inet_sk(sk)->tos & INET_ECN_MASK) :
1012 inet_sk(sk)->tos;
1013
1014 if (!INET_ECN_is_capable(tos) &&
1015 tcp_bpf_ca_needs_ecn((struct sock *)req))
1016 tos |= INET_ECN_ECT_0;
1017
1018 rcu_read_lock();
1019 err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
1020 ireq->ir_rmt_addr,
1021 rcu_dereference(ireq->ireq_opt),
1022 tos);
1023 rcu_read_unlock();
1024 err = net_xmit_eval(err);
1025 }
1026
1027 return err;
1028 }
1029
1030 /*
1031 * IPv4 request_sock destructor.
1032 */
1033 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1034 {
1035 kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
1036 }
1037
1038 #ifdef CONFIG_TCP_MD5SIG
1039 /*
1040 * RFC2385 MD5 checksumming requires a mapping of
1041 * IP address->MD5 Key.
1042 * We need to maintain these in the sk structure.
1043 */
1044
1045 DEFINE_STATIC_KEY_FALSE(tcp_md5_needed);
1046 EXPORT_SYMBOL(tcp_md5_needed);
1047
1048 static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new)
1049 {
1050 if (!old)
1051 return true;
1052
1053 /* l3index always overrides non-l3index */
1054 if (old->l3index && new->l3index == 0)
1055 return false;
1056 if (old->l3index == 0 && new->l3index)
1057 return true;
1058
1059 return old->prefixlen < new->prefixlen;
1060 }
1061
1062 /* Find the Key structure for an address. */
1063 struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index,
1064 const union tcp_md5_addr *addr,
1065 int family)
1066 {
1067 const struct tcp_sock *tp = tcp_sk(sk);
1068 struct tcp_md5sig_key *key;
1069 const struct tcp_md5sig_info *md5sig;
1070 __be32 mask;
1071 struct tcp_md5sig_key *best_match = NULL;
1072 bool match;
1073
1074 /* caller either holds rcu_read_lock() or socket lock */
1075 md5sig = rcu_dereference_check(tp->md5sig_info,
1076 lockdep_sock_is_held(sk));
1077 if (!md5sig)
1078 return NULL;
1079
1080 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1081 lockdep_sock_is_held(sk)) {
1082 if (key->family != family)
1083 continue;
1084 if (key->flags & TCP_MD5SIG_FLAG_IFINDEX && key->l3index != l3index)
1085 continue;
1086 if (family == AF_INET) {
1087 mask = inet_make_mask(key->prefixlen);
1088 match = (key->addr.a4.s_addr & mask) ==
1089 (addr->a4.s_addr & mask);
1090 #if IS_ENABLED(CONFIG_IPV6)
1091 } else if (family == AF_INET6) {
1092 match = ipv6_prefix_equal(&key->addr.a6, &addr->a6,
1093 key->prefixlen);
1094 #endif
1095 } else {
1096 match = false;
1097 }
1098
1099 if (match && better_md5_match(best_match, key))
1100 best_match = key;
1101 }
1102 return best_match;
1103 }
1104 EXPORT_SYMBOL(__tcp_md5_do_lookup);
1105
1106 static struct tcp_md5sig_key *tcp_md5_do_lookup_exact(const struct sock *sk,
1107 const union tcp_md5_addr *addr,
1108 int family, u8 prefixlen,
1109 int l3index, u8 flags)
1110 {
1111 const struct tcp_sock *tp = tcp_sk(sk);
1112 struct tcp_md5sig_key *key;
1113 unsigned int size = sizeof(struct in_addr);
1114 const struct tcp_md5sig_info *md5sig;
1115
1116 /* caller either holds rcu_read_lock() or socket lock */
1117 md5sig = rcu_dereference_check(tp->md5sig_info,
1118 lockdep_sock_is_held(sk));
1119 if (!md5sig)
1120 return NULL;
1121 #if IS_ENABLED(CONFIG_IPV6)
1122 if (family == AF_INET6)
1123 size = sizeof(struct in6_addr);
1124 #endif
1125 hlist_for_each_entry_rcu(key, &md5sig->head, node,
1126 lockdep_sock_is_held(sk)) {
1127 if (key->family != family)
1128 continue;
1129 if ((key->flags & TCP_MD5SIG_FLAG_IFINDEX) != (flags & TCP_MD5SIG_FLAG_IFINDEX))
1130 continue;
1131 if (key->l3index != l3index)
1132 continue;
1133 if (!memcmp(&key->addr, addr, size) &&
1134 key->prefixlen == prefixlen)
1135 return key;
1136 }
1137 return NULL;
1138 }
1139
1140 struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk,
1141 const struct sock *addr_sk)
1142 {
1143 const union tcp_md5_addr *addr;
1144 int l3index;
1145
1146 l3index = l3mdev_master_ifindex_by_index(sock_net(sk),
1147 addr_sk->sk_bound_dev_if);
1148 addr = (const union tcp_md5_addr *)&addr_sk->sk_daddr;
1149 return tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1150 }
1151 EXPORT_SYMBOL(tcp_v4_md5_lookup);
1152
1153 /* This can be called on a newly created socket, from other files */
1154 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
1155 int family, u8 prefixlen, int l3index, u8 flags,
1156 const u8 *newkey, u8 newkeylen, gfp_t gfp)
1157 {
1158 /* Add Key to the list */
1159 struct tcp_md5sig_key *key;
1160 struct tcp_sock *tp = tcp_sk(sk);
1161 struct tcp_md5sig_info *md5sig;
1162
1163 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1164 if (key) {
1165 /* Pre-existing entry - just update that one.
1166 * Note that the key might be used concurrently.
1167 * data_race() is telling kcsan that we do not care of
1168 * key mismatches, since changing MD5 key on live flows
1169 * can lead to packet drops.
1170 */
1171 data_race(memcpy(key->key, newkey, newkeylen));
1172
1173 /* Pairs with READ_ONCE() in tcp_md5_hash_key().
1174 * Also note that a reader could catch new key->keylen value
1175 * but old key->key[], this is the reason we use __GFP_ZERO
1176 * at sock_kmalloc() time below these lines.
1177 */
1178 WRITE_ONCE(key->keylen, newkeylen);
1179
1180 return 0;
1181 }
1182
1183 md5sig = rcu_dereference_protected(tp->md5sig_info,
1184 lockdep_sock_is_held(sk));
1185 if (!md5sig) {
1186 md5sig = kmalloc(sizeof(*md5sig), gfp);
1187 if (!md5sig)
1188 return -ENOMEM;
1189
1190 sk_gso_disable(sk);
1191 INIT_HLIST_HEAD(&md5sig->head);
1192 rcu_assign_pointer(tp->md5sig_info, md5sig);
1193 }
1194
1195 key = sock_kmalloc(sk, sizeof(*key), gfp | __GFP_ZERO);
1196 if (!key)
1197 return -ENOMEM;
1198 if (!tcp_alloc_md5sig_pool()) {
1199 sock_kfree_s(sk, key, sizeof(*key));
1200 return -ENOMEM;
1201 }
1202
1203 memcpy(key->key, newkey, newkeylen);
1204 key->keylen = newkeylen;
1205 key->family = family;
1206 key->prefixlen = prefixlen;
1207 key->l3index = l3index;
1208 key->flags = flags;
1209 memcpy(&key->addr, addr,
1210 (IS_ENABLED(CONFIG_IPV6) && family == AF_INET6) ? sizeof(struct in6_addr) :
1211 sizeof(struct in_addr));
1212 hlist_add_head_rcu(&key->node, &md5sig->head);
1213 return 0;
1214 }
1215 EXPORT_SYMBOL(tcp_md5_do_add);
1216
1217 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family,
1218 u8 prefixlen, int l3index, u8 flags)
1219 {
1220 struct tcp_md5sig_key *key;
1221
1222 key = tcp_md5_do_lookup_exact(sk, addr, family, prefixlen, l3index, flags);
1223 if (!key)
1224 return -ENOENT;
1225 hlist_del_rcu(&key->node);
1226 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1227 kfree_rcu(key, rcu);
1228 return 0;
1229 }
1230 EXPORT_SYMBOL(tcp_md5_do_del);
1231
1232 static void tcp_clear_md5_list(struct sock *sk)
1233 {
1234 struct tcp_sock *tp = tcp_sk(sk);
1235 struct tcp_md5sig_key *key;
1236 struct hlist_node *n;
1237 struct tcp_md5sig_info *md5sig;
1238
1239 md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1240
1241 hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
1242 hlist_del_rcu(&key->node);
1243 atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1244 kfree_rcu(key, rcu);
1245 }
1246 }
1247
1248 static int tcp_v4_parse_md5_keys(struct sock *sk, int optname,
1249 sockptr_t optval, int optlen)
1250 {
1251 struct tcp_md5sig cmd;
1252 struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1253 const union tcp_md5_addr *addr;
1254 u8 prefixlen = 32;
1255 int l3index = 0;
1256 u8 flags;
1257
1258 if (optlen < sizeof(cmd))
1259 return -EINVAL;
1260
1261 if (copy_from_sockptr(&cmd, optval, sizeof(cmd)))
1262 return -EFAULT;
1263
1264 if (sin->sin_family != AF_INET)
1265 return -EINVAL;
1266
1267 flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX;
1268
1269 if (optname == TCP_MD5SIG_EXT &&
1270 cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) {
1271 prefixlen = cmd.tcpm_prefixlen;
1272 if (prefixlen > 32)
1273 return -EINVAL;
1274 }
1275
1276 if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex &&
1277 cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) {
1278 struct net_device *dev;
1279
1280 rcu_read_lock();
1281 dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex);
1282 if (dev && netif_is_l3_master(dev))
1283 l3index = dev->ifindex;
1284
1285 rcu_read_unlock();
1286
1287 /* ok to reference set/not set outside of rcu;
1288 * right now device MUST be an L3 master
1289 */
1290 if (!dev || !l3index)
1291 return -EINVAL;
1292 }
1293
1294 addr = (union tcp_md5_addr *)&sin->sin_addr.s_addr;
1295
1296 if (!cmd.tcpm_keylen)
1297 return tcp_md5_do_del(sk, addr, AF_INET, prefixlen, l3index, flags);
1298
1299 if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1300 return -EINVAL;
1301
1302 return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags,
1303 cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL);
1304 }
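
/* Usage sketch: this parser services the TCP_MD5SIG and TCP_MD5SIG_EXT
 * socket options. A userspace program installs an RFC 2385 key for a
 * given peer roughly as follows (the peer address and key bytes are
 * purely illustrative):
 *
 *	#include <arpa/inet.h>
 *	#include <linux/tcp.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *
 *	struct tcp_md5sig md5 = {};
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.1", &peer->sin_addr);
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing tcpm_keylen == 0 instead deletes the key, which is the
 * tcp_md5_do_del() branch above.
 */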
1305
1306 static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
1307 __be32 daddr, __be32 saddr,
1308 const struct tcphdr *th, int nbytes)
1309 {
1310 struct tcp4_pseudohdr *bp;
1311 struct scatterlist sg;
1312 struct tcphdr *_th;
1313
1314 bp = hp->scratch;
1315 bp->saddr = saddr;
1316 bp->daddr = daddr;
1317 bp->pad = 0;
1318 bp->protocol = IPPROTO_TCP;
1319 bp->len = cpu_to_be16(nbytes);
1320
1321 _th = (struct tcphdr *)(bp + 1);
1322 memcpy(_th, th, sizeof(*th));
1323 _th->check = 0;
1324
1325 sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
1326 ahash_request_set_crypt(hp->md5_req, &sg, NULL,
1327 sizeof(*bp) + sizeof(*th));
1328 return crypto_ahash_update(hp->md5_req);
1329 }
1330
1331 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1332 __be32 daddr, __be32 saddr, const struct tcphdr *th)
1333 {
1334 struct tcp_md5sig_pool *hp;
1335 struct ahash_request *req;
1336
1337 hp = tcp_get_md5sig_pool();
1338 if (!hp)
1339 goto clear_hash_noput;
1340 req = hp->md5_req;
1341
1342 if (crypto_ahash_init(req))
1343 goto clear_hash;
1344 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
1345 goto clear_hash;
1346 if (tcp_md5_hash_key(hp, key))
1347 goto clear_hash;
1348 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1349 if (crypto_ahash_final(req))
1350 goto clear_hash;
1351
1352 tcp_put_md5sig_pool();
1353 return 0;
1354
1355 clear_hash:
1356 tcp_put_md5sig_pool();
1357 clear_hash_noput:
1358 memset(md5_hash, 0, 16);
1359 return 1;
1360 }
1361
1362 int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
1363 const struct sock *sk,
1364 const struct sk_buff *skb)
1365 {
1366 struct tcp_md5sig_pool *hp;
1367 struct ahash_request *req;
1368 const struct tcphdr *th = tcp_hdr(skb);
1369 __be32 saddr, daddr;
1370
1371 if (sk) { /* valid for establish/request sockets */
1372 saddr = sk->sk_rcv_saddr;
1373 daddr = sk->sk_daddr;
1374 } else {
1375 const struct iphdr *iph = ip_hdr(skb);
1376 saddr = iph->saddr;
1377 daddr = iph->daddr;
1378 }
1379
1380 hp = tcp_get_md5sig_pool();
1381 if (!hp)
1382 goto clear_hash_noput;
1383 req = hp->md5_req;
1384
1385 if (crypto_ahash_init(req))
1386 goto clear_hash;
1387
1388 if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
1389 goto clear_hash;
1390 if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1391 goto clear_hash;
1392 if (tcp_md5_hash_key(hp, key))
1393 goto clear_hash;
1394 ahash_request_set_crypt(req, NULL, md5_hash, 0);
1395 if (crypto_ahash_final(req))
1396 goto clear_hash;
1397
1398 tcp_put_md5sig_pool();
1399 return 0;
1400
1401 clear_hash:
1402 tcp_put_md5sig_pool();
1403 clear_hash_noput:
1404 memset(md5_hash, 0, 16);
1405 return 1;
1406 }
1407 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1408
1409 #endif
1410
1411 static void tcp_v4_init_req(struct request_sock *req,
1412 const struct sock *sk_listener,
1413 struct sk_buff *skb)
1414 {
1415 struct inet_request_sock *ireq = inet_rsk(req);
1416 struct net *net = sock_net(sk_listener);
1417
1418 sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
1419 sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
1420 RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(net, skb));
1421 }
1422
1423 static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
1424 struct sk_buff *skb,
1425 struct flowi *fl,
1426 struct request_sock *req)
1427 {
1428 tcp_v4_init_req(req, sk, skb);
1429
1430 if (security_inet_conn_request(sk, skb, req))
1431 return NULL;
1432
1433 return inet_csk_route_req(sk, &fl->u.ip4, req);
1434 }
1435
1436 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1437 .family = PF_INET,
1438 .obj_size = sizeof(struct tcp_request_sock),
1439 .rtx_syn_ack = tcp_rtx_synack,
1440 .send_ack = tcp_v4_reqsk_send_ack,
1441 .destructor = tcp_v4_reqsk_destructor,
1442 .send_reset = tcp_v4_send_reset,
1443 .syn_ack_timeout = tcp_syn_ack_timeout,
1444 };
1445
1446 const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1447 .mss_clamp = TCP_MSS_DEFAULT,
1448 #ifdef CONFIG_TCP_MD5SIG
1449 .req_md5_lookup = tcp_v4_md5_lookup,
1450 .calc_md5_hash = tcp_v4_md5_hash_skb,
1451 #endif
1452 #ifdef CONFIG_SYN_COOKIES
1453 .cookie_init_seq = cookie_v4_init_sequence,
1454 #endif
1455 .route_req = tcp_v4_route_req,
1456 .init_seq = tcp_v4_init_seq,
1457 .init_ts_off = tcp_v4_init_ts_off,
1458 .send_synack = tcp_v4_send_synack,
1459 };
1460
1461 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1462 {
1463 /* Never answer to SYNs sent to broadcast or multicast */
1464 if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1465 goto drop;
1466
1467 return tcp_conn_request(&tcp_request_sock_ops,
1468 &tcp_request_sock_ipv4_ops, sk, skb);
1469
1470 drop:
1471 tcp_listendrop(sk);
1472 return 0;
1473 }
1474 EXPORT_SYMBOL(tcp_v4_conn_request);
1475
1476
1477 /*
1478 * The three way handshake has completed - we got a valid synack -
1479 * now create the new socket.
1480 */
1481 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
1482 struct request_sock *req,
1483 struct dst_entry *dst,
1484 struct request_sock *req_unhash,
1485 bool *own_req)
1486 {
1487 struct inet_request_sock *ireq;
1488 bool found_dup_sk = false;
1489 struct inet_sock *newinet;
1490 struct tcp_sock *newtp;
1491 struct sock *newsk;
1492 #ifdef CONFIG_TCP_MD5SIG
1493 const union tcp_md5_addr *addr;
1494 struct tcp_md5sig_key *key;
1495 int l3index;
1496 #endif
1497 struct ip_options_rcu *inet_opt;
1498
1499 if (sk_acceptq_is_full(sk))
1500 goto exit_overflow;
1501
1502 newsk = tcp_create_openreq_child(sk, req, skb);
1503 if (!newsk)
1504 goto exit_nonewsk;
1505
1506 newsk->sk_gso_type = SKB_GSO_TCPV4;
1507 inet_sk_rx_dst_set(newsk, skb);
1508
1509 newtp = tcp_sk(newsk);
1510 newinet = inet_sk(newsk);
1511 ireq = inet_rsk(req);
1512 sk_daddr_set(newsk, ireq->ir_rmt_addr);
1513 sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
1514 newsk->sk_bound_dev_if = ireq->ir_iif;
1515 newinet->inet_saddr = ireq->ir_loc_addr;
1516 inet_opt = rcu_dereference(ireq->ireq_opt);
1517 RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
1518 newinet->mc_index = inet_iif(skb);
1519 newinet->mc_ttl = ip_hdr(skb)->ttl;
1520 newinet->rcv_tos = ip_hdr(skb)->tos;
1521 inet_csk(newsk)->icsk_ext_hdr_len = 0;
1522 if (inet_opt)
1523 inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1524 newinet->inet_id = prandom_u32();
1525
1526 /* Set ToS of the new socket based upon the value of incoming SYN.
1527 * ECT bits are set later in tcp_init_transfer().
1528 */
1529 if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos))
1530 newinet->tos = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK;
1531
1532 if (!dst) {
1533 dst = inet_csk_route_child_sock(sk, newsk, req);
1534 if (!dst)
1535 goto put_and_exit;
1536 } else {
1537 /* syncookie case : see end of cookie_v4_check() */
1538 }
1539 sk_setup_caps(newsk, dst);
1540
1541 tcp_ca_openreq_child(newsk, dst);
1542
1543 tcp_sync_mss(newsk, dst_mtu(dst));
1544 newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
1545
1546 tcp_initialize_rcv_mss(newsk);
1547
1548 #ifdef CONFIG_TCP_MD5SIG
1549 l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif);
1550 /* Copy over the MD5 key from the original socket */
1551 addr = (union tcp_md5_addr *)&newinet->inet_daddr;
1552 key = tcp_md5_do_lookup(sk, l3index, addr, AF_INET);
1553 if (key) {
1554 /*
1555 * We're using one, so create a matching key
1556 * on the newsk structure. If we fail to get
1557 * memory, then we end up not copying the key
1558 * across. Shucks.
1559 */
1560 tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags,
1561 key->key, key->keylen, GFP_ATOMIC);
1562 sk_gso_disable(newsk);
1563 }
1564 #endif
1565
1566 if (__inet_inherit_port(sk, newsk) < 0)
1567 goto put_and_exit;
1568 *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash),
1569 &found_dup_sk);
1570 if (likely(*own_req)) {
1571 tcp_move_syn(newtp, req);
1572 ireq->ireq_opt = NULL;
1573 } else {
1574 newinet->inet_opt = NULL;
1575
1576 if (!req_unhash && found_dup_sk) {
1577 /* This code path should only be executed in the
1578 * syncookie case
1579 */
1580 bh_unlock_sock(newsk);
1581 sock_put(newsk);
1582 newsk = NULL;
1583 }
1584 }
1585 return newsk;
1586
1587 exit_overflow:
1588 NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1589 exit_nonewsk:
1590 dst_release(dst);
1591 exit:
1592 tcp_listendrop(sk);
1593 return NULL;
1594 put_and_exit:
1595 newinet->inet_opt = NULL;
1596 inet_csk_prepare_forced_close(newsk);
1597 tcp_done(newsk);
1598 goto exit;
1599 }
1600 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1601
1602 static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
1603 {
1604 #ifdef CONFIG_SYN_COOKIES
1605 const struct tcphdr *th = tcp_hdr(skb);
1606
1607 if (!th->syn)
1608 sk = cookie_v4_check(sk, skb);
1609 #endif
1610 return sk;
1611 }
1612
1613 u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
1614 struct tcphdr *th, u32 *cookie)
1615 {
1616 u16 mss = 0;
1617 #ifdef CONFIG_SYN_COOKIES
1618 mss = tcp_get_syncookie_mss(&tcp_request_sock_ops,
1619 &tcp_request_sock_ipv4_ops, sk, th);
1620 if (mss) {
1621 *cookie = __cookie_v4_init_sequence(iph, th, &mss);
1622 tcp_synq_overflow(sk);
1623 }
1624 #endif
1625 return mss;
1626 }
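
/* Syncookie refresher: when the listen queue overflows (or the sysctl
 * forces cookies), no request_sock is stored at all. The SYN-ACK's
 * initial sequence number instead encodes a keyed hash of the 4-tuple,
 * a coarse timestamp and an index into a small MSS table, and
 * cookie_v4_check() reconstructs the request from the returning ACK
 * alone.
 */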
1627
1628 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
1629 u32));
1630 /* The socket must have its spinlock held when we get
1631 * here, unless it is a TCP_LISTEN socket.
1632 *
1633 * We have a potential double-lock case here, so even when
1634 * doing backlog processing we use the BH locking scheme.
1635 * This is because we cannot sleep with the original spinlock
1636 * held.
1637 */
1638 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1639 {
1640 enum skb_drop_reason reason;
1641 struct sock *rsk;
1642
1643 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1644 struct dst_entry *dst;
1645
1646 dst = rcu_dereference_protected(sk->sk_rx_dst,
1647 lockdep_sock_is_held(sk));
1648
1649 sock_rps_save_rxhash(sk, skb);
1650 sk_mark_napi_id(sk, skb);
1651 if (dst) {
1652 if (sk->sk_rx_dst_ifindex != skb->skb_iif ||
1653 !INDIRECT_CALL_1(dst->ops->check, ipv4_dst_check,
1654 dst, 0)) {
1655 RCU_INIT_POINTER(sk->sk_rx_dst, NULL);
1656 dst_release(dst);
1657 }
1658 }
1659 tcp_rcv_established(sk, skb);
1660 return 0;
1661 }
1662
1663 reason = SKB_DROP_REASON_NOT_SPECIFIED;
1664 if (tcp_checksum_complete(skb))
1665 goto csum_err;
1666
1667 if (sk->sk_state == TCP_LISTEN) {
1668 struct sock *nsk = tcp_v4_cookie_check(sk, skb);
1669
1670 if (!nsk)
1671 goto discard;
1672 if (nsk != sk) {
1673 if (tcp_child_process(sk, nsk, skb)) {
1674 rsk = nsk;
1675 goto reset;
1676 }
1677 return 0;
1678 }
1679 } else
1680 sock_rps_save_rxhash(sk, skb);
1681
1682 if (tcp_rcv_state_process(sk, skb)) {
1683 rsk = sk;
1684 goto reset;
1685 }
1686 return 0;
1687
1688 reset:
1689 tcp_v4_send_reset(rsk, skb);
1690 discard:
1691 kfree_skb_reason(skb, reason);
1692 /* Be careful here. If this function gets more complicated and
1693 * gcc suffers from register pressure on the x86, sk (in %ebx)
1694 * might be destroyed here. This current version compiles correctly,
1695 * but you have been warned.
1696 */
1697 return 0;
1698
1699 csum_err:
1700 reason = SKB_DROP_REASON_TCP_CSUM;
1701 trace_tcp_bad_csum(skb);
1702 TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1703 TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1704 goto discard;
1705 }
1706 EXPORT_SYMBOL(tcp_v4_do_rcv);
1707
1708 int tcp_v4_early_demux(struct sk_buff *skb)
1709 {
1710 const struct iphdr *iph;
1711 const struct tcphdr *th;
1712 struct sock *sk;
1713
1714 if (skb->pkt_type != PACKET_HOST)
1715 return 0;
1716
1717 if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr)))
1718 return 0;
1719
1720 iph = ip_hdr(skb);
1721 th = tcp_hdr(skb);
1722
1723 if (th->doff < sizeof(struct tcphdr) / 4)
1724 return 0;
1725
1726 sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo,
1727 iph->saddr, th->source,
1728 iph->daddr, ntohs(th->dest),
1729 skb->skb_iif, inet_sdif(skb));
1730 if (sk) {
1731 skb->sk = sk;
1732 skb->destructor = sock_edemux;
1733 if (sk_fullsock(sk)) {
1734 struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
1735
1736 if (dst)
1737 dst = dst_check(dst, 0);
1738 if (dst &&
1739 sk->sk_rx_dst_ifindex == skb->skb_iif)
1740 skb_dst_set_noref(skb, dst);
1741 }
1742 }
1743 return 0;
1744 }
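
/* Early demux trades one established-hash lookup at ip_rcv() time for
 * the ability to reuse the destination cache entry stashed on the
 * socket (sk_rx_dst), so established flows skip a full route lookup for
 * every incoming segment.
 */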
1745
1746 bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
1747 enum skb_drop_reason *reason)
1748 {
1749 u32 limit, tail_gso_size, tail_gso_segs;
1750 struct skb_shared_info *shinfo;
1751 const struct tcphdr *th;
1752 struct tcphdr *thtail;
1753 struct sk_buff *tail;
1754 unsigned int hdrlen;
1755 bool fragstolen;
1756 u32 gso_segs;
1757 u32 gso_size;
1758 int delta;
1759
1760 /* In case all data was pulled from skb frags (in __pskb_pull_tail()),
1761 * we can fix skb->truesize to its real value to avoid future drops.
1762 * This is valid because skb is not yet charged to the socket.
1763 * It has been noticed pure SACK packets were sometimes dropped
1764 * (if cooked by drivers without copybreak feature).
1765 */
1766 skb_condense(skb);
1767
1768 skb_dst_drop(skb);
1769
1770 if (unlikely(tcp_checksum_complete(skb))) {
1771 bh_unlock_sock(sk);
1772 trace_tcp_bad_csum(skb);
1773 *reason = SKB_DROP_REASON_TCP_CSUM;
1774 __TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
1775 __TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
1776 return true;
1777 }
1778
1779 /* Attempt coalescing to last skb in backlog, even if we are
1780 * above the limits.
1781 * This is okay because skb capacity is limited to MAX_SKB_FRAGS.
1782 */
1783 th = (const struct tcphdr *)skb->data;
1784 hdrlen = th->doff * 4;
1785
1786 tail = sk->sk_backlog.tail;
1787 if (!tail)
1788 goto no_coalesce;
1789 thtail = (struct tcphdr *)tail->data;
1790
1791 if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
1792 TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
1793 ((TCP_SKB_CB(tail)->tcp_flags |
1794 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_SYN | TCPHDR_RST | TCPHDR_URG)) ||
1795 !((TCP_SKB_CB(tail)->tcp_flags &
1796 TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_ACK) ||
1797 ((TCP_SKB_CB(tail)->tcp_flags ^
1798 TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
1799 #ifdef CONFIG_TLS_DEVICE
1800 tail->decrypted != skb->decrypted ||
1801 #endif
1802 thtail->doff != th->doff ||
1803 memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
1804 goto no_coalesce;
1805
1806 __skb_pull(skb, hdrlen);
1807
1808 shinfo = skb_shinfo(skb);
1809 gso_size = shinfo->gso_size ?: skb->len;
1810 gso_segs = shinfo->gso_segs ?: 1;
1811
1812 shinfo = skb_shinfo(tail);
1813 tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen);
1814 tail_gso_segs = shinfo->gso_segs ?: 1;
1815
1816 if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
1817 TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
1818
1819 if (likely(!before(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))) {
1820 TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
1821 thtail->window = th->window;
1822 }
1823
1824 /* We have to update both TCP_SKB_CB(tail)->tcp_flags and
1825 * thtail->fin, so that the fast path in tcp_rcv_established()
1826 * is not entered if we append a packet with a FIN.
1827 * SYN, RST, URG are not present.
1828 * ACK is set on both packets.
1829 * PSH: we do not really care about it in the TCP stack,
1830 * at least for 'GRO' packets.
1831 */
1832 thtail->fin |= th->fin;
1833 TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
1834
1835 if (TCP_SKB_CB(skb)->has_rxtstamp) {
1836 TCP_SKB_CB(tail)->has_rxtstamp = true;
1837 tail->tstamp = skb->tstamp;
1838 skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
1839 }
1840
1841 /* Not as strict as GRO. We only need to carry the max mss value. */
1842 shinfo->gso_size = max(gso_size, tail_gso_size);
1843 shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF);
1844
1845 sk->sk_backlog.len += delta;
1846 __NET_INC_STATS(sock_net(sk),
1847 LINUX_MIB_TCPBACKLOGCOALESCE);
1848 kfree_skb_partial(skb, fragstolen);
1849 return false;
1850 }
1851 __skb_push(skb, hdrlen);
1852
1853 no_coalesce:
1854 /* Only the socket owner can try to collapse/prune rx queues
1855 * to reduce memory overhead, so add a little headroom here.
1856 * Only a few socket backlogs are likely to be non-empty concurrently.
1857 */
1858 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024;
1859
1860 if (unlikely(sk_add_backlog(sk, skb, limit))) {
1861 bh_unlock_sock(sk);
1862 *reason = SKB_DROP_REASON_SOCKET_BACKLOG;
1863 __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
1864 return true;
1865 }
1866 return false;
1867 }
1868 EXPORT_SYMBOL(tcp_add_backlog);
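/* Rough illustration of the limit computed at no_coalesce above (the byte
 * values are assumed typical defaults, not taken from this file): with
 * sk_rcvbuf = 131072 and sk_sndbuf = 16384,
 *
 *	limit = 131072 + 16384 + 64 * 1024 = 212992 bytes
 *
 * may sit in the backlog while the socket is owned by the user before
 * sk_add_backlog() fails and the skb is dropped with
 * SKB_DROP_REASON_SOCKET_BACKLOG.
 */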
1869
1870 int tcp_filter(struct sock *sk, struct sk_buff *skb)
1871 {
1872 struct tcphdr *th = (struct tcphdr *)skb->data;
1873
1874 return sk_filter_trim_cap(sk, skb, th->doff * 4);
1875 }
1876 EXPORT_SYMBOL(tcp_filter);
1877
1878 static void tcp_v4_restore_cb(struct sk_buff *skb)
1879 {
1880 memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
1881 sizeof(struct inet_skb_parm));
1882 }
1883
1884 static void tcp_v4_fill_cb(struct sk_buff *skb, const struct iphdr *iph,
1885 const struct tcphdr *th)
1886 {
1887 /* This is tricky: we move IPCB to its correct location inside TCP_SKB_CB();
1888 * barrier() makes sure the compiler won't play fool^Waliasing games.
1889 */
1890 memmove(&TCP_SKB_CB(skb)->header.h4, IPCB(skb),
1891 sizeof(struct inet_skb_parm));
1892 barrier();
1893
1894 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1895 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1896 skb->len - th->doff * 4);
1897 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1898 TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th);
1899 TCP_SKB_CB(skb)->tcp_tw_isn = 0;
1900 TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1901 TCP_SKB_CB(skb)->sacked = 0;
1902 TCP_SKB_CB(skb)->has_rxtstamp =
1903 skb->tstamp || skb_hwtstamps(skb)->hwtstamp;
1904 }
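/* Worked example for the end_seq computation above (all numbers invented):
 * a segment with seq = 1000 carrying 500 bytes of payload and the FIN flag
 * set (syn = 0, fin = 1, doff = 5, so skb->len = 520 counting the 20-byte
 * TCP header) yields
 *
 *	end_seq = 1000 + 0 + 1 + 520 - 5 * 4 = 1501
 *
 * i.e. the FIN consumes one sequence number after the payload, which is
 * what the rest of the receive path expects.
 */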
1905
1906 /*
1907 * From tcp_input.c
1908 */
1909
1910 int tcp_v4_rcv(struct sk_buff *skb)
1911 {
1912 struct net *net = dev_net(skb->dev);
1913 enum skb_drop_reason drop_reason;
1914 int sdif = inet_sdif(skb);
1915 int dif = inet_iif(skb);
1916 const struct iphdr *iph;
1917 const struct tcphdr *th;
1918 bool refcounted;
1919 struct sock *sk;
1920 int ret;
1921
1922 drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
1923 if (skb->pkt_type != PACKET_HOST)
1924 goto discard_it;
1925
1926 /* Count it even if it's bad */
1927 __TCP_INC_STATS(net, TCP_MIB_INSEGS);
1928
1929 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1930 goto discard_it;
1931
1932 th = (const struct tcphdr *)skb->data;
1933
1934 if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) {
1935 drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
1936 goto bad_packet;
1937 }
1938 if (!pskb_may_pull(skb, th->doff * 4))
1939 goto discard_it;
1940
1941 /* An explanation is required here, I think.
1942 * Packet length and doff are validated by header prediction,
1943 * provided the case of th->doff == 0 is eliminated.
1944 * So, we defer the checks. */
1945
1946 if (skb_checksum_init(skb, IPPROTO_TCP, inet_compute_pseudo))
1947 goto csum_error;
1948
1949 th = (const struct tcphdr *)skb->data;
1950 iph = ip_hdr(skb);
1951 lookup:
1952 sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
1953 th->dest, sdif, &refcounted);
1954 if (!sk)
1955 goto no_tcp_socket;
1956
1957 process:
1958 if (sk->sk_state == TCP_TIME_WAIT)
1959 goto do_time_wait;
1960
1961 if (sk->sk_state == TCP_NEW_SYN_RECV) {
1962 struct request_sock *req = inet_reqsk(sk);
1963 bool req_stolen = false;
1964 struct sock *nsk;
1965
1966 sk = req->rsk_listener;
1967 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1968 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
1969 else
1970 drop_reason = tcp_inbound_md5_hash(sk, skb,
1971 &iph->saddr, &iph->daddr,
1972 AF_INET, dif, sdif);
1973 if (unlikely(drop_reason)) {
1974 sk_drops_add(sk, skb);
1975 reqsk_put(req);
1976 goto discard_it;
1977 }
1978 if (tcp_checksum_complete(skb)) {
1979 reqsk_put(req);
1980 goto csum_error;
1981 }
1982 if (unlikely(sk->sk_state != TCP_LISTEN)) {
1983 nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb);
1984 if (!nsk) {
1985 inet_csk_reqsk_queue_drop_and_put(sk, req);
1986 goto lookup;
1987 }
1988 sk = nsk;
1989 /* reuseport_migrate_sock() has already held one sk_refcnt
1990 * before returning.
1991 */
1992 } else {
1993 /* We own a reference on the listener, increase it again
1994 * as we might lose it too soon.
1995 */
1996 sock_hold(sk);
1997 }
1998 refcounted = true;
1999 nsk = NULL;
2000 if (!tcp_filter(sk, skb)) {
2001 th = (const struct tcphdr *)skb->data;
2002 iph = ip_hdr(skb);
2003 tcp_v4_fill_cb(skb, iph, th);
2004 nsk = tcp_check_req(sk, skb, req, false, &req_stolen);
2005 } else {
2006 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2007 }
2008 if (!nsk) {
2009 reqsk_put(req);
2010 if (req_stolen) {
2011 /* Another cpu got exclusive access to req
2012 * and created a full blown socket.
2013 * Try to feed this packet to this socket
2014 * instead of discarding it.
2015 */
2016 tcp_v4_restore_cb(skb);
2017 sock_put(sk);
2018 goto lookup;
2019 }
2020 goto discard_and_relse;
2021 }
2022 nf_reset_ct(skb);
2023 if (nsk == sk) {
2024 reqsk_put(req);
2025 tcp_v4_restore_cb(skb);
2026 } else if (tcp_child_process(sk, nsk, skb)) {
2027 tcp_v4_send_reset(nsk, skb);
2028 goto discard_and_relse;
2029 } else {
2030 sock_put(sk);
2031 return 0;
2032 }
2033 }
2034
2035 if (static_branch_unlikely(&ip4_min_ttl)) {
2036 /* min_ttl can be changed concurrently from do_ip_setsockopt() */
2037 if (unlikely(iph->ttl < READ_ONCE(inet_sk(sk)->min_ttl))) {
2038 __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP);
2039 goto discard_and_relse;
2040 }
2041 }
2042
2043 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
2044 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2045 goto discard_and_relse;
2046 }
2047
2048 drop_reason = tcp_inbound_md5_hash(sk, skb, &iph->saddr,
2049 &iph->daddr, AF_INET, dif, sdif);
2050 if (drop_reason)
2051 goto discard_and_relse;
2052
2053 nf_reset_ct(skb);
2054
2055 if (tcp_filter(sk, skb)) {
2056 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
2057 goto discard_and_relse;
2058 }
2059 th = (const struct tcphdr *)skb->data;
2060 iph = ip_hdr(skb);
2061 tcp_v4_fill_cb(skb, iph, th);
2062
2063 skb->dev = NULL;
2064
2065 if (sk->sk_state == TCP_LISTEN) {
2066 ret = tcp_v4_do_rcv(sk, skb);
2067 goto put_and_return;
2068 }
2069
2070 sk_incoming_cpu_update(sk);
2071
2072 bh_lock_sock_nested(sk);
2073 tcp_segs_in(tcp_sk(sk), skb);
2074 ret = 0;
2075 if (!sock_owned_by_user(sk)) {
2076 ret = tcp_v4_do_rcv(sk, skb);
2077 } else {
2078 if (tcp_add_backlog(sk, skb, &drop_reason))
2079 goto discard_and_relse;
2080 }
2081 bh_unlock_sock(sk);
2082
2083 put_and_return:
2084 if (refcounted)
2085 sock_put(sk);
2086
2087 return ret;
2088
2089 no_tcp_socket:
2090 drop_reason = SKB_DROP_REASON_NO_SOCKET;
2091 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
2092 goto discard_it;
2093
2094 tcp_v4_fill_cb(skb, iph, th);
2095
2096 if (tcp_checksum_complete(skb)) {
2097 csum_error:
2098 drop_reason = SKB_DROP_REASON_TCP_CSUM;
2099 trace_tcp_bad_csum(skb);
2100 __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS);
2101 bad_packet:
2102 __TCP_INC_STATS(net, TCP_MIB_INERRS);
2103 } else {
2104 tcp_v4_send_reset(NULL, skb);
2105 }
2106
2107 discard_it:
2108 SKB_DR_OR(drop_reason, NOT_SPECIFIED);
2109 /* Discard frame. */
2110 kfree_skb_reason(skb, drop_reason);
2111 return 0;
2112
2113 discard_and_relse:
2114 sk_drops_add(sk, skb);
2115 if (refcounted)
2116 sock_put(sk);
2117 goto discard_it;
2118
2119 do_time_wait:
2120 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
2121 drop_reason = SKB_DROP_REASON_XFRM_POLICY;
2122 inet_twsk_put(inet_twsk(sk));
2123 goto discard_it;
2124 }
2125
2126 tcp_v4_fill_cb(skb, iph, th);
2127
2128 if (tcp_checksum_complete(skb)) {
2129 inet_twsk_put(inet_twsk(sk));
2130 goto csum_error;
2131 }
2132 switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
2133 case TCP_TW_SYN: {
2134 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
2135 &tcp_hashinfo, skb,
2136 __tcp_hdrlen(th),
2137 iph->saddr, th->source,
2138 iph->daddr, th->dest,
2139 inet_iif(skb),
2140 sdif);
2141 if (sk2) {
2142 inet_twsk_deschedule_put(inet_twsk(sk));
2143 sk = sk2;
2144 tcp_v4_restore_cb(skb);
2145 refcounted = false;
2146 goto process;
2147 }
2148 }
2149 /* to ACK */
2150 fallthrough;
2151 case TCP_TW_ACK:
2152 tcp_v4_timewait_ack(sk, skb);
2153 break;
2154 case TCP_TW_RST:
2155 tcp_v4_send_reset(sk, skb);
2156 inet_twsk_deschedule_put(inet_twsk(sk));
2157 goto discard_it;
2158 case TCP_TW_SUCCESS:;
2159 }
2160 goto discard_it;
2161 }
2162
2163 static struct timewait_sock_ops tcp_timewait_sock_ops = {
2164 .twsk_obj_size = sizeof(struct tcp_timewait_sock),
2165 .twsk_unique = tcp_twsk_unique,
2166 .twsk_destructor= tcp_twsk_destructor,
2167 };
2168
2169 void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
2170 {
2171 struct dst_entry *dst = skb_dst(skb);
2172
2173 if (dst && dst_hold_safe(dst)) {
2174 rcu_assign_pointer(sk->sk_rx_dst, dst);
2175 sk->sk_rx_dst_ifindex = skb->skb_iif;
2176 }
2177 }
2178 EXPORT_SYMBOL(inet_sk_rx_dst_set);
2179
2180 const struct inet_connection_sock_af_ops ipv4_specific = {
2181 .queue_xmit = ip_queue_xmit,
2182 .send_check = tcp_v4_send_check,
2183 .rebuild_header = inet_sk_rebuild_header,
2184 .sk_rx_dst_set = inet_sk_rx_dst_set,
2185 .conn_request = tcp_v4_conn_request,
2186 .syn_recv_sock = tcp_v4_syn_recv_sock,
2187 .net_header_len = sizeof(struct iphdr),
2188 .setsockopt = ip_setsockopt,
2189 .getsockopt = ip_getsockopt,
2190 .addr2sockaddr = inet_csk_addr2sockaddr,
2191 .sockaddr_len = sizeof(struct sockaddr_in),
2192 .mtu_reduced = tcp_v4_mtu_reduced,
2193 };
2194 EXPORT_SYMBOL(ipv4_specific);
2195
2196 #ifdef CONFIG_TCP_MD5SIG
2197 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
2198 .md5_lookup = tcp_v4_md5_lookup,
2199 .calc_md5_hash = tcp_v4_md5_hash_skb,
2200 .md5_parse = tcp_v4_parse_md5_keys,
2201 };
2202 #endif
2203
2204 /* NOTE: A lot of things are set to zero explicitly by the call to
2205 * sk_alloc(), so they need not be done here.
2206 */
2207 static int tcp_v4_init_sock(struct sock *sk)
2208 {
2209 struct inet_connection_sock *icsk = inet_csk(sk);
2210
2211 tcp_init_sock(sk);
2212
2213 icsk->icsk_af_ops = &ipv4_specific;
2214
2215 #ifdef CONFIG_TCP_MD5SIG
2216 tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
2217 #endif
2218
2219 return 0;
2220 }
2221
2222 void tcp_v4_destroy_sock(struct sock *sk)
2223 {
2224 struct tcp_sock *tp = tcp_sk(sk);
2225
2226 trace_tcp_destroy_sock(sk);
2227
2228 tcp_clear_xmit_timers(sk);
2229
2230 tcp_cleanup_congestion_control(sk);
2231
2232 tcp_cleanup_ulp(sk);
2233
2234 /* Clean up the write buffer. */
2235 tcp_write_queue_purge(sk);
2236
2237 /* Check if we want to disable active TFO */
2238 tcp_fastopen_active_disable_ofo_check(sk);
2239
2240 /* Cleans up our, hopefully empty, out_of_order_queue. */
2241 skb_rbtree_purge(&tp->out_of_order_queue);
2242
2243 #ifdef CONFIG_TCP_MD5SIG
2244 /* Clean up the MD5 key list, if any */
2245 if (tp->md5sig_info) {
2246 tcp_clear_md5_list(sk);
2247 kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu);
2248 tp->md5sig_info = NULL;
2249 }
2250 #endif
2251
2252 /* Clean up a referenced TCP bind bucket. */
2253 if (inet_csk(sk)->icsk_bind_hash)
2254 inet_put_port(sk);
2255
2256 BUG_ON(rcu_access_pointer(tp->fastopen_rsk));
2257
2258 /* If socket is aborted during connect operation */
2259 tcp_free_fastopen_req(tp);
2260 tcp_fastopen_destroy_cipher(sk);
2261 tcp_saved_syn_free(tp);
2262
2263 sk_sockets_allocated_dec(sk);
2264 }
2265 EXPORT_SYMBOL(tcp_v4_destroy_sock);
2266
2267 #ifdef CONFIG_PROC_FS
2268 /* Proc filesystem TCP sock list dumping. */
2269
2270 static unsigned short seq_file_family(const struct seq_file *seq);
2271
2272 static bool seq_sk_match(struct seq_file *seq, const struct sock *sk)
2273 {
2274 unsigned short family = seq_file_family(seq);
2275
2276 /* AF_UNSPEC is used as a match all */
2277 return ((family == AF_UNSPEC || family == sk->sk_family) &&
2278 net_eq(sock_net(sk), seq_file_net(seq)));
2279 }
2280
2281 /* Find a non-empty bucket (starting from st->bucket)
2282 * and return the first sk from it.
2283 */
2284 static void *listening_get_first(struct seq_file *seq)
2285 {
2286 struct tcp_iter_state *st = seq->private;
2287
2288 st->offset = 0;
2289 for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) {
2290 struct inet_listen_hashbucket *ilb2;
2291 struct hlist_nulls_node *node;
2292 struct sock *sk;
2293
2294 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2295 if (hlist_nulls_empty(&ilb2->nulls_head))
2296 continue;
2297
2298 spin_lock(&ilb2->lock);
2299 sk_nulls_for_each(sk, node, &ilb2->nulls_head) {
2300 if (seq_sk_match(seq, sk))
2301 return sk;
2302 }
2303 spin_unlock(&ilb2->lock);
2304 }
2305
2306 return NULL;
2307 }
2308
2309 /* Find the next sk of "cur" within the same bucket (i.e. st->bucket).
2310 * If "cur" is the last one in the st->bucket,
2311 * call listening_get_first() to return the first sk of the next
2312 * non-empty bucket.
2313 */
2314 static void *listening_get_next(struct seq_file *seq, void *cur)
2315 {
2316 struct tcp_iter_state *st = seq->private;
2317 struct inet_listen_hashbucket *ilb2;
2318 struct hlist_nulls_node *node;
2319 struct sock *sk = cur;
2320
2321 ++st->num;
2322 ++st->offset;
2323
2324 sk = sk_nulls_next(sk);
2325 sk_nulls_for_each_from(sk, node) {
2326 if (seq_sk_match(seq, sk))
2327 return sk;
2328 }
2329
2330 ilb2 = &tcp_hashinfo.lhash2[st->bucket];
2331 spin_unlock(&ilb2->lock);
2332 ++st->bucket;
2333 return listening_get_first(seq);
2334 }
2335
2336 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2337 {
2338 struct tcp_iter_state *st = seq->private;
2339 void *rc;
2340
2341 st->bucket = 0;
2342 st->offset = 0;
2343 rc = listening_get_first(seq);
2344
2345 while (rc && *pos) {
2346 rc = listening_get_next(seq, rc);
2347 --*pos;
2348 }
2349 return rc;
2350 }
2351
2352 static inline bool empty_bucket(const struct tcp_iter_state *st)
2353 {
2354 return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain);
2355 }
2356
2357 /*
2358 * Get first established socket starting from bucket given in st->bucket.
2359 * If st->bucket is zero, the very first socket in the hash is returned.
2360 */
2361 static void *established_get_first(struct seq_file *seq)
2362 {
2363 struct tcp_iter_state *st = seq->private;
2364
2365 st->offset = 0;
2366 for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2367 struct sock *sk;
2368 struct hlist_nulls_node *node;
2369 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2370
2371 /* Lockless fast path for the common case of empty buckets */
2372 if (empty_bucket(st))
2373 continue;
2374
2375 spin_lock_bh(lock);
2376 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2377 if (seq_sk_match(seq, sk))
2378 return sk;
2379 }
2380 spin_unlock_bh(lock);
2381 }
2382
2383 return NULL;
2384 }
2385
2386 static void *established_get_next(struct seq_file *seq, void *cur)
2387 {
2388 struct sock *sk = cur;
2389 struct hlist_nulls_node *node;
2390 struct tcp_iter_state *st = seq->private;
2391
2392 ++st->num;
2393 ++st->offset;
2394
2395 sk = sk_nulls_next(sk);
2396
2397 sk_nulls_for_each_from(sk, node) {
2398 if (seq_sk_match(seq, sk))
2399 return sk;
2400 }
2401
2402 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2403 ++st->bucket;
2404 return established_get_first(seq);
2405 }
2406
2407 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2408 {
2409 struct tcp_iter_state *st = seq->private;
2410 void *rc;
2411
2412 st->bucket = 0;
2413 rc = established_get_first(seq);
2414
2415 while (rc && pos) {
2416 rc = established_get_next(seq, rc);
2417 --pos;
2418 }
2419 return rc;
2420 }
2421
2422 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2423 {
2424 void *rc;
2425 struct tcp_iter_state *st = seq->private;
2426
2427 st->state = TCP_SEQ_STATE_LISTENING;
2428 rc = listening_get_idx(seq, &pos);
2429
2430 if (!rc) {
2431 st->state = TCP_SEQ_STATE_ESTABLISHED;
2432 rc = established_get_idx(seq, pos);
2433 }
2434
2435 return rc;
2436 }
2437
2438 static void *tcp_seek_last_pos(struct seq_file *seq)
2439 {
2440 struct tcp_iter_state *st = seq->private;
2441 int bucket = st->bucket;
2442 int offset = st->offset;
2443 int orig_num = st->num;
2444 void *rc = NULL;
2445
2446 switch (st->state) {
2447 case TCP_SEQ_STATE_LISTENING:
2448 if (st->bucket > tcp_hashinfo.lhash2_mask)
2449 break;
2450 st->state = TCP_SEQ_STATE_LISTENING;
2451 rc = listening_get_first(seq);
2452 while (offset-- && rc && bucket == st->bucket)
2453 rc = listening_get_next(seq, rc);
2454 if (rc)
2455 break;
2456 st->bucket = 0;
2457 st->state = TCP_SEQ_STATE_ESTABLISHED;
2458 fallthrough;
2459 case TCP_SEQ_STATE_ESTABLISHED:
2460 if (st->bucket > tcp_hashinfo.ehash_mask)
2461 break;
2462 rc = established_get_first(seq);
2463 while (offset-- && rc && bucket == st->bucket)
2464 rc = established_get_next(seq, rc);
2465 }
2466
2467 st->num = orig_num;
2468
2469 return rc;
2470 }
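/* Resume example for the function above (illustrative): if the previous
 * read stopped after the 3rd matching socket of listening bucket 17, then
 * st->bucket == 17 and st->offset == 3 on entry.  The listening branch
 * re-walks bucket 17 and skips three sockets, so the next show() continues
 * roughly where the previous read() left off, even if sockets were added
 * to or removed from the bucket in between.
 */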
2471
2472 void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2473 {
2474 struct tcp_iter_state *st = seq->private;
2475 void *rc;
2476
2477 if (*pos && *pos == st->last_pos) {
2478 rc = tcp_seek_last_pos(seq);
2479 if (rc)
2480 goto out;
2481 }
2482
2483 st->state = TCP_SEQ_STATE_LISTENING;
2484 st->num = 0;
2485 st->bucket = 0;
2486 st->offset = 0;
2487 rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2488
2489 out:
2490 st->last_pos = *pos;
2491 return rc;
2492 }
2493 EXPORT_SYMBOL(tcp_seq_start);
2494
2495 void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2496 {
2497 struct tcp_iter_state *st = seq->private;
2498 void *rc = NULL;
2499
2500 if (v == SEQ_START_TOKEN) {
2501 rc = tcp_get_idx(seq, 0);
2502 goto out;
2503 }
2504
2505 switch (st->state) {
2506 case TCP_SEQ_STATE_LISTENING:
2507 rc = listening_get_next(seq, v);
2508 if (!rc) {
2509 st->state = TCP_SEQ_STATE_ESTABLISHED;
2510 st->bucket = 0;
2511 st->offset = 0;
2512 rc = established_get_first(seq);
2513 }
2514 break;
2515 case TCP_SEQ_STATE_ESTABLISHED:
2516 rc = established_get_next(seq, v);
2517 break;
2518 }
2519 out:
2520 ++*pos;
2521 st->last_pos = *pos;
2522 return rc;
2523 }
2524 EXPORT_SYMBOL(tcp_seq_next);
2525
2526 void tcp_seq_stop(struct seq_file *seq, void *v)
2527 {
2528 struct tcp_iter_state *st = seq->private;
2529
2530 switch (st->state) {
2531 case TCP_SEQ_STATE_LISTENING:
2532 if (v != SEQ_START_TOKEN)
2533 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2534 break;
2535 case TCP_SEQ_STATE_ESTABLISHED:
2536 if (v)
2537 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2538 break;
2539 }
2540 }
2541 EXPORT_SYMBOL(tcp_seq_stop);
2542
2543 static void get_openreq4(const struct request_sock *req,
2544 struct seq_file *f, int i)
2545 {
2546 const struct inet_request_sock *ireq = inet_rsk(req);
2547 long delta = req->rsk_timer.expires - jiffies;
2548
2549 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2550 " %02X %08X:%08X %02X:%08lX %08X %5u %8d %u %d %pK",
2551 i,
2552 ireq->ir_loc_addr,
2553 ireq->ir_num,
2554 ireq->ir_rmt_addr,
2555 ntohs(ireq->ir_rmt_port),
2556 TCP_SYN_RECV,
2557 0, 0, /* could print option size, but that is af dependent. */
2558 1, /* timers active (only the expire timer) */
2559 jiffies_delta_to_clock_t(delta),
2560 req->num_timeout,
2561 from_kuid_munged(seq_user_ns(f),
2562 sock_i_uid(req->rsk_listener)),
2563 0, /* non standard timer */
2564 0, /* open_requests have no inode */
2565 0,
2566 req);
2567 }
2568
2569 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
2570 {
2571 int timer_active;
2572 unsigned long timer_expires;
2573 const struct tcp_sock *tp = tcp_sk(sk);
2574 const struct inet_connection_sock *icsk = inet_csk(sk);
2575 const struct inet_sock *inet = inet_sk(sk);
2576 const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
2577 __be32 dest = inet->inet_daddr;
2578 __be32 src = inet->inet_rcv_saddr;
2579 __u16 destp = ntohs(inet->inet_dport);
2580 __u16 srcp = ntohs(inet->inet_sport);
2581 int rx_queue;
2582 int state;
2583
2584 if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
2585 icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT ||
2586 icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) {
2587 timer_active = 1;
2588 timer_expires = icsk->icsk_timeout;
2589 } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2590 timer_active = 4;
2591 timer_expires = icsk->icsk_timeout;
2592 } else if (timer_pending(&sk->sk_timer)) {
2593 timer_active = 2;
2594 timer_expires = sk->sk_timer.expires;
2595 } else {
2596 timer_active = 0;
2597 timer_expires = jiffies;
2598 }
2599
2600 state = inet_sk_state_load(sk);
2601 if (state == TCP_LISTEN)
2602 rx_queue = READ_ONCE(sk->sk_ack_backlog);
2603 else
2604 /* Because we don't lock the socket,
2605 * we might find a transient negative value.
2606 */
2607 rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) -
2608 READ_ONCE(tp->copied_seq), 0);
2609
2610 seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2611 "%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
2612 i, src, srcp, dest, destp, state,
2613 READ_ONCE(tp->write_seq) - tp->snd_una,
2614 rx_queue,
2615 timer_active,
2616 jiffies_delta_to_clock_t(timer_expires - jiffies),
2617 icsk->icsk_retransmits,
2618 from_kuid_munged(seq_user_ns(f), sock_i_uid(sk)),
2619 icsk->icsk_probes_out,
2620 sock_i_ino(sk),
2621 refcount_read(&sk->sk_refcnt), sk,
2622 jiffies_to_clock_t(icsk->icsk_rto),
2623 jiffies_to_clock_t(icsk->icsk_ack.ato),
2624 (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk),
2625 tcp_snd_cwnd(tp),
2626 state == TCP_LISTEN ?
2627 fastopenq->max_qlen :
2628 (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
2629 }
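/* The format above is what appears in /proc/net/tcp.  A sample line, with
 * made-up values as they would be shown on a little-endian machine, maps
 * to the fields printed here roughly as follows:
 *
 *   sl  local_address rem_address   st tx_queue rx_queue ...
 *    0: 0100007F:0016 0100007F:C350 01 00000000:00000000 ...
 *
 * 0100007F:0016 is 127.0.0.1 port 22 (address bytes as stored, port in
 * hex), "01" is TCP_ESTABLISHED, and tx_queue/rx_queue come from
 * write_seq - snd_una and rcv_nxt - copied_seq as computed above.
 */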
2630
2631 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2632 struct seq_file *f, int i)
2633 {
2634 long delta = tw->tw_timer.expires - jiffies;
2635 __be32 dest, src;
2636 __u16 destp, srcp;
2637
2638 dest = tw->tw_daddr;
2639 src = tw->tw_rcv_saddr;
2640 destp = ntohs(tw->tw_dport);
2641 srcp = ntohs(tw->tw_sport);
2642
2643 seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2644 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK",
2645 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2646 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0,
2647 refcount_read(&tw->tw_refcnt), tw);
2648 }
2649
2650 #define TMPSZ 150
2651
2652 static int tcp4_seq_show(struct seq_file *seq, void *v)
2653 {
2654 struct tcp_iter_state *st;
2655 struct sock *sk = v;
2656
2657 seq_setwidth(seq, TMPSZ - 1);
2658 if (v == SEQ_START_TOKEN) {
2659 seq_puts(seq, " sl local_address rem_address st tx_queue "
2660 "rx_queue tr tm->when retrnsmt uid timeout "
2661 "inode");
2662 goto out;
2663 }
2664 st = seq->private;
2665
2666 if (sk->sk_state == TCP_TIME_WAIT)
2667 get_timewait4_sock(v, seq, st->num);
2668 else if (sk->sk_state == TCP_NEW_SYN_RECV)
2669 get_openreq4(v, seq, st->num);
2670 else
2671 get_tcp4_sock(v, seq, st->num);
2672 out:
2673 seq_pad(seq, '\n');
2674 return 0;
2675 }
2676
2677 #ifdef CONFIG_BPF_SYSCALL
2678 struct bpf_tcp_iter_state {
2679 struct tcp_iter_state state;
2680 unsigned int cur_sk;
2681 unsigned int end_sk;
2682 unsigned int max_sk;
2683 struct sock **batch;
2684 bool st_bucket_done;
2685 };
2686
2687 struct bpf_iter__tcp {
2688 __bpf_md_ptr(struct bpf_iter_meta *, meta);
2689 __bpf_md_ptr(struct sock_common *, sk_common);
2690 uid_t uid __aligned(8);
2691 };
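/* A minimal sketch of a BPF program consuming the context above (field
 * names follow struct bpf_iter__tcp; the section name matches the "tcp"
 * target registered at the bottom of this file, everything else is an
 * assumption for illustration only):
 *
 *	SEC("iter/tcp")
 *	int dump_tcp(struct bpf_iter__tcp *ctx)
 *	{
 *		struct sock_common *skc = ctx->sk_common;
 *
 *		if (!skc)
 *			return 0;
 *		BPF_SEQ_PRINTF(ctx->meta->seq, "%pI4 %u\n",
 *			       &skc->skc_rcv_saddr, skc->skc_num);
 *		return 0;
 *	}
 *
 * Reading the pinned iterator then produces one line per socket selected
 * by bpf_iter_tcp_batch() below.
 */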
2692
2693 static int tcp_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
2694 struct sock_common *sk_common, uid_t uid)
2695 {
2696 struct bpf_iter__tcp ctx;
2697
2698 meta->seq_num--; /* skip SEQ_START_TOKEN */
2699 ctx.meta = meta;
2700 ctx.sk_common = sk_common;
2701 ctx.uid = uid;
2702 return bpf_iter_run_prog(prog, &ctx);
2703 }
2704
2705 static void bpf_iter_tcp_put_batch(struct bpf_tcp_iter_state *iter)
2706 {
2707 while (iter->cur_sk < iter->end_sk)
2708 sock_put(iter->batch[iter->cur_sk++]);
2709 }
2710
2711 static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter,
2712 unsigned int new_batch_sz)
2713 {
2714 struct sock **new_batch;
2715
2716 new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
2717 GFP_USER | __GFP_NOWARN);
2718 if (!new_batch)
2719 return -ENOMEM;
2720
2721 bpf_iter_tcp_put_batch(iter);
2722 kvfree(iter->batch);
2723 iter->batch = new_batch;
2724 iter->max_sk = new_batch_sz;
2725
2726 return 0;
2727 }
2728
2729 static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq,
2730 struct sock *start_sk)
2731 {
2732 struct bpf_tcp_iter_state *iter = seq->private;
2733 struct tcp_iter_state *st = &iter->state;
2734 struct hlist_nulls_node *node;
2735 unsigned int expected = 1;
2736 struct sock *sk;
2737
2738 sock_hold(start_sk);
2739 iter->batch[iter->end_sk++] = start_sk;
2740
2741 sk = sk_nulls_next(start_sk);
2742 sk_nulls_for_each_from(sk, node) {
2743 if (seq_sk_match(seq, sk)) {
2744 if (iter->end_sk < iter->max_sk) {
2745 sock_hold(sk);
2746 iter->batch[iter->end_sk++] = sk;
2747 }
2748 expected++;
2749 }
2750 }
2751 spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock);
2752
2753 return expected;
2754 }
2755
2756 static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq,
2757 struct sock *start_sk)
2758 {
2759 struct bpf_tcp_iter_state *iter = seq->private;
2760 struct tcp_iter_state *st = &iter->state;
2761 struct hlist_nulls_node *node;
2762 unsigned int expected = 1;
2763 struct sock *sk;
2764
2765 sock_hold(start_sk);
2766 iter->batch[iter->end_sk++] = start_sk;
2767
2768 sk = sk_nulls_next(start_sk);
2769 sk_nulls_for_each_from(sk, node) {
2770 if (seq_sk_match(seq, sk)) {
2771 if (iter->end_sk < iter->max_sk) {
2772 sock_hold(sk);
2773 iter->batch[iter->end_sk++] = sk;
2774 }
2775 expected++;
2776 }
2777 }
2778 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2779
2780 return expected;
2781 }
2782
2783 static struct sock *bpf_iter_tcp_batch(struct seq_file *seq)
2784 {
2785 struct bpf_tcp_iter_state *iter = seq->private;
2786 struct tcp_iter_state *st = &iter->state;
2787 unsigned int expected;
2788 bool resized = false;
2789 struct sock *sk;
2790
2791 /* The st->bucket is done. Directly advance to the next
2792 * bucket instead of having tcp_seek_last_pos() skip sockets
2793 * one by one in the current bucket only to find out that it
2794 * has to advance to the next bucket anyway.
2795 */
2796 if (iter->st_bucket_done) {
2797 st->offset = 0;
2798 st->bucket++;
2799 if (st->state == TCP_SEQ_STATE_LISTENING &&
2800 st->bucket > tcp_hashinfo.lhash2_mask) {
2801 st->state = TCP_SEQ_STATE_ESTABLISHED;
2802 st->bucket = 0;
2803 }
2804 }
2805
2806 again:
2807 /* Get a new batch */
2808 iter->cur_sk = 0;
2809 iter->end_sk = 0;
2810 iter->st_bucket_done = false;
2811
2812 sk = tcp_seek_last_pos(seq);
2813 if (!sk)
2814 return NULL; /* Done */
2815
2816 if (st->state == TCP_SEQ_STATE_LISTENING)
2817 expected = bpf_iter_tcp_listening_batch(seq, sk);
2818 else
2819 expected = bpf_iter_tcp_established_batch(seq, sk);
2820
2821 if (iter->end_sk == expected) {
2822 iter->st_bucket_done = true;
2823 return sk;
2824 }
2825
2826 if (!resized && !bpf_iter_tcp_realloc_batch(iter, expected * 3 / 2)) {
2827 resized = true;
2828 goto again;
2829 }
2830
2831 return sk;
2832 }
2833
2834 static void *bpf_iter_tcp_seq_start(struct seq_file *seq, loff_t *pos)
2835 {
2836 /* bpf iter does not support lseek, so it always
2837 * continues from where it was stop()-ped.
2838 */
2839 if (*pos)
2840 return bpf_iter_tcp_batch(seq);
2841
2842 return SEQ_START_TOKEN;
2843 }
2844
2845 static void *bpf_iter_tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2846 {
2847 struct bpf_tcp_iter_state *iter = seq->private;
2848 struct tcp_iter_state *st = &iter->state;
2849 struct sock *sk;
2850
2851 /* Whenever seq_next() is called, the sk at iter->cur_sk has
2852 * been through seq_show(), so advance to the next sk in
2853 * the batch.
2854 */
2855 if (iter->cur_sk < iter->end_sk) {
2856 /* Keeping st->num consistent in tcp_iter_state.
2857 * bpf_iter_tcp does not use st->num.
2858 * meta.seq_num is used instead.
2859 */
2860 st->num++;
2861 /* Move st->offset to the next sk in the bucket such that
2862 * the future start() will resume at st->offset in
2863 * st->bucket. See tcp_seek_last_pos().
2864 */
2865 st->offset++;
2866 sock_put(iter->batch[iter->cur_sk++]);
2867 }
2868
2869 if (iter->cur_sk < iter->end_sk)
2870 sk = iter->batch[iter->cur_sk];
2871 else
2872 sk = bpf_iter_tcp_batch(seq);
2873
2874 ++*pos;
2875 /* Keeping st->last_pos consistent in tcp_iter_state.
2876 * bpf iter does not do lseek, so st->last_pos always equals *pos.
2877 */
2878 st->last_pos = *pos;
2879 return sk;
2880 }
2881
2882 static int bpf_iter_tcp_seq_show(struct seq_file *seq, void *v)
2883 {
2884 struct bpf_iter_meta meta;
2885 struct bpf_prog *prog;
2886 struct sock *sk = v;
2887 bool slow;
2888 uid_t uid;
2889 int ret;
2890
2891 if (v == SEQ_START_TOKEN)
2892 return 0;
2893
2894 if (sk_fullsock(sk))
2895 slow = lock_sock_fast(sk);
2896
2897 if (unlikely(sk_unhashed(sk))) {
2898 ret = SEQ_SKIP;
2899 goto unlock;
2900 }
2901
2902 if (sk->sk_state == TCP_TIME_WAIT) {
2903 uid = 0;
2904 } else if (sk->sk_state == TCP_NEW_SYN_RECV) {
2905 const struct request_sock *req = v;
2906
2907 uid = from_kuid_munged(seq_user_ns(seq),
2908 sock_i_uid(req->rsk_listener));
2909 } else {
2910 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
2911 }
2912
2913 meta.seq = seq;
2914 prog = bpf_iter_get_info(&meta, false);
2915 ret = tcp_prog_seq_show(prog, &meta, v, uid);
2916
2917 unlock:
2918 if (sk_fullsock(sk))
2919 unlock_sock_fast(sk, slow);
2920 return ret;
2921
2922 }
2923
2924 static void bpf_iter_tcp_seq_stop(struct seq_file *seq, void *v)
2925 {
2926 struct bpf_tcp_iter_state *iter = seq->private;
2927 struct bpf_iter_meta meta;
2928 struct bpf_prog *prog;
2929
2930 if (!v) {
2931 meta.seq = seq;
2932 prog = bpf_iter_get_info(&meta, true);
2933 if (prog)
2934 (void)tcp_prog_seq_show(prog, &meta, v, 0);
2935 }
2936
2937 if (iter->cur_sk < iter->end_sk) {
2938 bpf_iter_tcp_put_batch(iter);
2939 iter->st_bucket_done = false;
2940 }
2941 }
2942
2943 static const struct seq_operations bpf_iter_tcp_seq_ops = {
2944 .show = bpf_iter_tcp_seq_show,
2945 .start = bpf_iter_tcp_seq_start,
2946 .next = bpf_iter_tcp_seq_next,
2947 .stop = bpf_iter_tcp_seq_stop,
2948 };
2949 #endif
2950 static unsigned short seq_file_family(const struct seq_file *seq)
2951 {
2952 const struct tcp_seq_afinfo *afinfo;
2953
2954 #ifdef CONFIG_BPF_SYSCALL
2955 /* Iterated from bpf_iter. Let the bpf prog filter instead. */
2956 if (seq->op == &bpf_iter_tcp_seq_ops)
2957 return AF_UNSPEC;
2958 #endif
2959
2960 /* Iterated from proc fs */
2961 afinfo = pde_data(file_inode(seq->file));
2962 return afinfo->family;
2963 }
2964
2965 static const struct seq_operations tcp4_seq_ops = {
2966 .show = tcp4_seq_show,
2967 .start = tcp_seq_start,
2968 .next = tcp_seq_next,
2969 .stop = tcp_seq_stop,
2970 };
2971
2972 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2973 .family = AF_INET,
2974 };
2975
2976 static int __net_init tcp4_proc_init_net(struct net *net)
2977 {
2978 if (!proc_create_net_data("tcp", 0444, net->proc_net, &tcp4_seq_ops,
2979 sizeof(struct tcp_iter_state), &tcp4_seq_afinfo))
2980 return -ENOMEM;
2981 return 0;
2982 }
2983
2984 static void __net_exit tcp4_proc_exit_net(struct net *net)
2985 {
2986 remove_proc_entry("tcp", net->proc_net);
2987 }
2988
2989 static struct pernet_operations tcp4_net_ops = {
2990 .init = tcp4_proc_init_net,
2991 .exit = tcp4_proc_exit_net,
2992 };
2993
2994 int __init tcp4_proc_init(void)
2995 {
2996 return register_pernet_subsys(&tcp4_net_ops);
2997 }
2998
2999 void tcp4_proc_exit(void)
3000 {
3001 unregister_pernet_subsys(&tcp4_net_ops);
3002 }
3003 #endif /* CONFIG_PROC_FS */
3004
3005 /* @wake is one when sk_stream_write_space() calls us.
3006 * This sends EPOLLOUT only if notsent_bytes is below half the limit.
3007 * This mimics the strategy used in sock_def_write_space().
3008 */
3009 bool tcp_stream_memory_free(const struct sock *sk, int wake)
3010 {
3011 const struct tcp_sock *tp = tcp_sk(sk);
3012 u32 notsent_bytes = READ_ONCE(tp->write_seq) -
3013 READ_ONCE(tp->snd_nxt);
3014
3015 return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
3016 }
3017 EXPORT_SYMBOL(tcp_stream_memory_free);
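/* Worked example (values assumed, not defaults): with TCP_NOTSENT_LOWAT
 * set to 131072 and 70000 not-yet-sent bytes, a plain poll (wake == 0)
 * reports the socket writable since 70000 < 131072.  When called from
 * sk_stream_write_space() (wake == 1) the same backlog is treated as
 * 70000 << 1 = 140000, which is not below the limit, so EPOLLOUT is only
 * signalled once the unsent backlog drops under half of the lowat value.
 */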
3018
3019 struct proto tcp_prot = {
3020 .name = "TCP",
3021 .owner = THIS_MODULE,
3022 .close = tcp_close,
3023 .pre_connect = tcp_v4_pre_connect,
3024 .connect = tcp_v4_connect,
3025 .disconnect = tcp_disconnect,
3026 .accept = inet_csk_accept,
3027 .ioctl = tcp_ioctl,
3028 .init = tcp_v4_init_sock,
3029 .destroy = tcp_v4_destroy_sock,
3030 .shutdown = tcp_shutdown,
3031 .setsockopt = tcp_setsockopt,
3032 .getsockopt = tcp_getsockopt,
3033 .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt,
3034 .keepalive = tcp_set_keepalive,
3035 .recvmsg = tcp_recvmsg,
3036 .sendmsg = tcp_sendmsg,
3037 .sendpage = tcp_sendpage,
3038 .backlog_rcv = tcp_v4_do_rcv,
3039 .release_cb = tcp_release_cb,
3040 .hash = inet_hash,
3041 .unhash = inet_unhash,
3042 .get_port = inet_csk_get_port,
3043 .put_port = inet_put_port,
3044 #ifdef CONFIG_BPF_SYSCALL
3045 .psock_update_sk_prot = tcp_bpf_update_proto,
3046 #endif
3047 .enter_memory_pressure = tcp_enter_memory_pressure,
3048 .leave_memory_pressure = tcp_leave_memory_pressure,
3049 .stream_memory_free = tcp_stream_memory_free,
3050 .sockets_allocated = &tcp_sockets_allocated,
3051 .orphan_count = &tcp_orphan_count,
3052 .memory_allocated = &tcp_memory_allocated,
3053 .memory_pressure = &tcp_memory_pressure,
3054 .sysctl_mem = sysctl_tcp_mem,
3055 .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
3056 .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
3057 .max_header = MAX_TCP_HEADER,
3058 .obj_size = sizeof(struct tcp_sock),
3059 .slab_flags = SLAB_TYPESAFE_BY_RCU,
3060 .twsk_prot = &tcp_timewait_sock_ops,
3061 .rsk_prot = &tcp_request_sock_ops,
3062 .h.hashinfo = &tcp_hashinfo,
3063 .no_autobind = true,
3064 .diag_destroy = tcp_abort,
3065 };
3066 EXPORT_SYMBOL(tcp_prot);
3067
3068 static void __net_exit tcp_sk_exit(struct net *net)
3069 {
3070 struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row;
3071
3072 if (net->ipv4.tcp_congestion_control)
3073 bpf_module_put(net->ipv4.tcp_congestion_control,
3074 net->ipv4.tcp_congestion_control->owner);
3075 if (refcount_dec_and_test(&tcp_death_row->tw_refcount))
3076 kfree(tcp_death_row);
3077 }
3078
3079 static int __net_init tcp_sk_init(struct net *net)
3080 {
3081 int cnt;
3082
3083 net->ipv4.sysctl_tcp_ecn = 2;
3084 net->ipv4.sysctl_tcp_ecn_fallback = 1;
3085
3086 net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
3087 net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
3088 net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
3089 net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
3090 net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
3091
3092 net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
3093 net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
3094 net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
3095
3096 net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
3097 net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
3098 net->ipv4.sysctl_tcp_syncookies = 1;
3099 net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
3100 net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
3101 net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
3102 net->ipv4.sysctl_tcp_orphan_retries = 0;
3103 net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
3104 net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
3105 net->ipv4.sysctl_tcp_tw_reuse = 2;
3106 net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1;
3107
3108 net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL);
3109 if (!net->ipv4.tcp_death_row)
3110 return -ENOMEM;
3111 refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1);
3112 cnt = tcp_hashinfo.ehash_mask + 1;
3113 net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2;
3114 net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo;
3115
3116 net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128);
3117 net->ipv4.sysctl_tcp_sack = 1;
3118 net->ipv4.sysctl_tcp_window_scaling = 1;
3119 net->ipv4.sysctl_tcp_timestamps = 1;
3120 net->ipv4.sysctl_tcp_early_retrans = 3;
3121 net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION;
3122 net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */
3123 net->ipv4.sysctl_tcp_retrans_collapse = 1;
3124 net->ipv4.sysctl_tcp_max_reordering = 300;
3125 net->ipv4.sysctl_tcp_dsack = 1;
3126 net->ipv4.sysctl_tcp_app_win = 31;
3127 net->ipv4.sysctl_tcp_adv_win_scale = 1;
3128 net->ipv4.sysctl_tcp_frto = 2;
3129 net->ipv4.sysctl_tcp_moderate_rcvbuf = 1;
3130 /* This limits the percentage of the congestion window which we
3131 * will allow a single TSO frame to consume. Building TSO frames
3132 * which are too large can cause TCP streams to be bursty.
3133 */
3134 net->ipv4.sysctl_tcp_tso_win_divisor = 3;
3135 /* Default TSQ limit of 16 TSO segments */
3136 net->ipv4.sysctl_tcp_limit_output_bytes = 16 * 65536;
3137 /* rfc5961 challenge ack rate limiting */
3138 net->ipv4.sysctl_tcp_challenge_ack_limit = 1000;
3139 net->ipv4.sysctl_tcp_min_tso_segs = 2;
3140 net->ipv4.sysctl_tcp_tso_rtt_log = 9; /* 2^9 = 512 usec */
3141 net->ipv4.sysctl_tcp_min_rtt_wlen = 300;
3142 net->ipv4.sysctl_tcp_autocorking = 1;
3143 net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
3144 net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
3145 net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
3146 if (net != &init_net) {
3147 memcpy(net->ipv4.sysctl_tcp_rmem,
3148 init_net.ipv4.sysctl_tcp_rmem,
3149 sizeof(init_net.ipv4.sysctl_tcp_rmem));
3150 memcpy(net->ipv4.sysctl_tcp_wmem,
3151 init_net.ipv4.sysctl_tcp_wmem,
3152 sizeof(init_net.ipv4.sysctl_tcp_wmem));
3153 }
3154 net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC;
3155 net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC;
3156 net->ipv4.sysctl_tcp_comp_sack_nr = 44;
3157 net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
3158 net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0;
3159 atomic_set(&net->ipv4.tfo_active_disable_times, 0);
3160
3161 /* Reno is always built in */
3162 if (!net_eq(net, &init_net) &&
3163 bpf_try_module_get(init_net.ipv4.tcp_congestion_control,
3164 init_net.ipv4.tcp_congestion_control->owner))
3165 net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
3166 else
3167 net->ipv4.tcp_congestion_control = &tcp_reno;
3168
3169 return 0;
3170 }
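/* Most of the per-netns defaults initialized above are runtime tunables
 * under /proc/sys/net/ipv4/ (the sysctl names below are believed to match
 * the fields set here and are listed only as an illustration):
 *
 *	net.ipv4.tcp_syncookies     <->  net->ipv4.sysctl_tcp_syncookies
 *	net.ipv4.tcp_fin_timeout    <->  net->ipv4.sysctl_tcp_fin_timeout
 *	net.ipv4.tcp_notsent_lowat  <->  net->ipv4.sysctl_tcp_notsent_lowat
 *
 * Writing them affects only the current network namespace, which is why
 * they live in struct net rather than in globals.
 */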
3171
3172 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
3173 {
3174 struct net *net;
3175
3176 inet_twsk_purge(&tcp_hashinfo, AF_INET);
3177
3178 list_for_each_entry(net, net_exit_list, exit_list)
3179 tcp_fastopen_ctx_destroy(net);
3180 }
3181
3182 static struct pernet_operations __net_initdata tcp_sk_ops = {
3183 .init = tcp_sk_init,
3184 .exit = tcp_sk_exit,
3185 .exit_batch = tcp_sk_exit_batch,
3186 };
3187
3188 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3189 DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta,
3190 struct sock_common *sk_common, uid_t uid)
3191
3192 #define INIT_BATCH_SZ 16
3193
3194 static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux)
3195 {
3196 struct bpf_tcp_iter_state *iter = priv_data;
3197 int err;
3198
3199 err = bpf_iter_init_seq_net(priv_data, aux);
3200 if (err)
3201 return err;
3202
3203 err = bpf_iter_tcp_realloc_batch(iter, INIT_BATCH_SZ);
3204 if (err) {
3205 bpf_iter_fini_seq_net(priv_data);
3206 return err;
3207 }
3208
3209 return 0;
3210 }
3211
3212 static void bpf_iter_fini_tcp(void *priv_data)
3213 {
3214 struct bpf_tcp_iter_state *iter = priv_data;
3215
3216 bpf_iter_fini_seq_net(priv_data);
3217 kvfree(iter->batch);
3218 }
3219
3220 static const struct bpf_iter_seq_info tcp_seq_info = {
3221 .seq_ops = &bpf_iter_tcp_seq_ops,
3222 .init_seq_private = bpf_iter_init_tcp,
3223 .fini_seq_private = bpf_iter_fini_tcp,
3224 .seq_priv_size = sizeof(struct bpf_tcp_iter_state),
3225 };
3226
3227 static const struct bpf_func_proto *
3228 bpf_iter_tcp_get_func_proto(enum bpf_func_id func_id,
3229 const struct bpf_prog *prog)
3230 {
3231 switch (func_id) {
3232 case BPF_FUNC_setsockopt:
3233 return &bpf_sk_setsockopt_proto;
3234 case BPF_FUNC_getsockopt:
3235 return &bpf_sk_getsockopt_proto;
3236 default:
3237 return NULL;
3238 }
3239 }
3240
3241 static struct bpf_iter_reg tcp_reg_info = {
3242 .target = "tcp",
3243 .ctx_arg_info_size = 1,
3244 .ctx_arg_info = {
3245 { offsetof(struct bpf_iter__tcp, sk_common),
3246 PTR_TO_BTF_ID_OR_NULL },
3247 },
3248 .get_func_proto = bpf_iter_tcp_get_func_proto,
3249 .seq_info = &tcp_seq_info,
3250 };
3251
3252 static void __init bpf_iter_register(void)
3253 {
3254 tcp_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON];
3255 if (bpf_iter_reg_target(&tcp_reg_info))
3256 pr_warn("Warning: could not register bpf iterator tcp\n");
3257 }
3258
3259 #endif
3260
3261 void __init tcp_v4_init(void)
3262 {
3263 int cpu, res;
3264
3265 for_each_possible_cpu(cpu) {
3266 struct sock *sk;
3267
3268 res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
3269 IPPROTO_TCP, &init_net);
3270 if (res)
3271 panic("Failed to create the TCP control socket.\n");
3272 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
3273
3274 /* Please enforce IP_DF and IPID==0 for RST and
3275 * ACK sent in SYN-RECV and TIME-WAIT state.
3276 */
3277 inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO;
3278
3279 per_cpu(ipv4_tcp_sk, cpu) = sk;
3280 }
3281 if (register_pernet_subsys(&tcp_sk_ops))
3282 panic("Failed to create the TCP control socket.\n");
3283
3284 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3285 bpf_iter_register();
3286 #endif
3287 }
3288