1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after a
45  *					year-long coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #define pr_fmt(fmt) "TCP: " fmt
54 
55 #include <linux/bottom_half.h>
56 #include <linux/types.h>
57 #include <linux/fcntl.h>
58 #include <linux/module.h>
59 #include <linux/random.h>
60 #include <linux/cache.h>
61 #include <linux/jhash.h>
62 #include <linux/init.h>
63 #include <linux/times.h>
64 #include <linux/slab.h>
65 
66 #include <net/net_namespace.h>
67 #include <net/icmp.h>
68 #include <net/inet_hashtables.h>
69 #include <net/tcp.h>
70 #include <net/transp_v6.h>
71 #include <net/ipv6.h>
72 #include <net/inet_common.h>
73 #include <net/timewait_sock.h>
74 #include <net/xfrm.h>
75 #include <net/netdma.h>
76 #include <net/secure_seq.h>
77 #include <net/tcp_memcontrol.h>
78 
79 #include <linux/inet.h>
80 #include <linux/ipv6.h>
81 #include <linux/stddef.h>
82 #include <linux/proc_fs.h>
83 #include <linux/seq_file.h>
84 
85 #include <linux/crypto.h>
86 #include <linux/scatterlist.h>
87 
88 int sysctl_tcp_tw_reuse __read_mostly;
89 int sysctl_tcp_low_latency __read_mostly;
90 EXPORT_SYMBOL(sysctl_tcp_low_latency);
91 
92 
93 #ifdef CONFIG_TCP_MD5SIG
94 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
95 			       __be32 daddr, __be32 saddr, const struct tcphdr *th);
96 #endif
97 
98 struct inet_hashinfo tcp_hashinfo;
99 EXPORT_SYMBOL(tcp_hashinfo);
100 
101 static inline __u32 tcp_v4_init_sequence(const struct sk_buff *skb)
102 {
103 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
104 					  ip_hdr(skb)->saddr,
105 					  tcp_hdr(skb)->dest,
106 					  tcp_hdr(skb)->source);
107 }
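/* A rough sketch of what the helper above computes (illustrative only;
 * see secure_tcp_sequence_number() for the real implementation): the
 * initial sequence number is a keyed hash over the connection 4-tuple
 * plus a component that advances with time, in the spirit of RFC 1948:
 *
 *	isn = hash(secret, saddr, daddr, sport, dport) + clock_component;
 *
 * Distinct connections thus get unrelated sequence spaces, while repeated
 * connections on the same 4-tuple still get advancing ISNs.
 */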
108 
109 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
110 {
111 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
112 	struct tcp_sock *tp = tcp_sk(sk);
113 
114 	/* With PAWS, it is safe from the viewpoint
115 	   of data integrity. Even without PAWS it is safe provided sequence
116 	   spaces do not overlap i.e. at data rates <= 80Mbit/sec.
117 
118 	   Actually, the idea is close to VJ's, only the timestamp cache is
119 	   held not per host but per port pair, and the TW bucket is used as
120 	   the state holder.
121 
122 	   If the TW bucket has already been destroyed, we fall back to VJ's
123 	   scheme and use the initial timestamp retrieved from the peer table.
124 	 */
125 	if (tcptw->tw_ts_recent_stamp &&
126 	    (twp == NULL || (sysctl_tcp_tw_reuse &&
127 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
128 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
129 		if (tp->write_seq == 0)
130 			tp->write_seq = 1;
131 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
132 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
133 		sock_hold(sktw);
134 		return 1;
135 	}
136 
137 	return 0;
138 }
139 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
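/* For reference (not part of this file): the reuse path above is gated by
 * the net.ipv4.tcp_tw_reuse sysctl backing sysctl_tcp_tw_reuse, e.g.:
 *
 *	# echo 1 > /proc/sys/net/ipv4/tcp_tw_reuse
 *
 * With it set, an outgoing connect() may reuse a port pair still sitting
 * in TIME-WAIT, provided the TIME-WAIT socket's last timestamp is more
 * than a second old (the get_seconds() check above).
 */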
140 
141 /* This will initiate an outgoing connection. */
142 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
143 {
144 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
145 	struct inet_sock *inet = inet_sk(sk);
146 	struct tcp_sock *tp = tcp_sk(sk);
147 	__be16 orig_sport, orig_dport;
148 	__be32 daddr, nexthop;
149 	struct flowi4 *fl4;
150 	struct rtable *rt;
151 	int err;
152 	struct ip_options_rcu *inet_opt;
153 
154 	if (addr_len < sizeof(struct sockaddr_in))
155 		return -EINVAL;
156 
157 	if (usin->sin_family != AF_INET)
158 		return -EAFNOSUPPORT;
159 
160 	nexthop = daddr = usin->sin_addr.s_addr;
161 	inet_opt = rcu_dereference_protected(inet->inet_opt,
162 					     sock_owned_by_user(sk));
163 	if (inet_opt && inet_opt->opt.srr) {
164 		if (!daddr)
165 			return -EINVAL;
166 		nexthop = inet_opt->opt.faddr;
167 	}
168 
169 	orig_sport = inet->inet_sport;
170 	orig_dport = usin->sin_port;
171 	fl4 = &inet->cork.fl.u.ip4;
172 	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
173 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
174 			      IPPROTO_TCP,
175 			      orig_sport, orig_dport, sk, true);
176 	if (IS_ERR(rt)) {
177 		err = PTR_ERR(rt);
178 		if (err == -ENETUNREACH)
179 			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
180 		return err;
181 	}
182 
183 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
184 		ip_rt_put(rt);
185 		return -ENETUNREACH;
186 	}
187 
188 	if (!inet_opt || !inet_opt->opt.srr)
189 		daddr = fl4->daddr;
190 
191 	if (!inet->inet_saddr)
192 		inet->inet_saddr = fl4->saddr;
193 	inet->inet_rcv_saddr = inet->inet_saddr;
194 
195 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
196 		/* Reset inherited state */
197 		tp->rx_opt.ts_recent	   = 0;
198 		tp->rx_opt.ts_recent_stamp = 0;
199 		tp->write_seq		   = 0;
200 	}
201 
202 	if (tcp_death_row.sysctl_tw_recycle &&
203 	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) {
204 		struct inet_peer *peer = rt_get_peer(rt, fl4->daddr);
205 		/*
206 		 * VJ's idea. We save the last timestamp seen from
207 		 * the destination in the peer table when entering
208 		 * TIME-WAIT state, and initialize rx_opt.ts_recent from it
209 		 * when trying a new connection.
210 		 */
211 		if (peer) {
212 			inet_peer_refcheck(peer);
213 			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
214 				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
215 				tp->rx_opt.ts_recent = peer->tcp_ts;
216 			}
217 		}
218 	}
219 
220 	inet->inet_dport = usin->sin_port;
221 	inet->inet_daddr = daddr;
222 
223 	inet_csk(sk)->icsk_ext_hdr_len = 0;
224 	if (inet_opt)
225 		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
226 
227 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
228 
229 	/* Socket identity is still unknown (sport may be zero).
230 	 * However, we set the state to SYN-SENT and, without releasing the
231 	 * socket lock, select a source port, enter ourselves into the hash
232 	 * tables and complete initialization after this.
233 	 */
234 	tcp_set_state(sk, TCP_SYN_SENT);
235 	err = inet_hash_connect(&tcp_death_row, sk);
236 	if (err)
237 		goto failure;
238 
239 	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
240 			       inet->inet_sport, inet->inet_dport, sk);
241 	if (IS_ERR(rt)) {
242 		err = PTR_ERR(rt);
243 		rt = NULL;
244 		goto failure;
245 	}
246 	/* OK, now commit destination to socket.  */
247 	sk->sk_gso_type = SKB_GSO_TCPV4;
248 	sk_setup_caps(sk, &rt->dst);
249 
250 	if (!tp->write_seq)
251 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
252 							   inet->inet_daddr,
253 							   inet->inet_sport,
254 							   usin->sin_port);
255 
256 	inet->inet_id = tp->write_seq ^ jiffies;
257 
258 	err = tcp_connect(sk);
259 	rt = NULL;
260 	if (err)
261 		goto failure;
262 
263 	return 0;
264 
265 failure:
266 	/*
267 	 * This unhashes the socket and releases the local port,
268 	 * if necessary.
269 	 */
270 	tcp_set_state(sk, TCP_CLOSE);
271 	ip_rt_put(rt);
272 	sk->sk_route_caps = 0;
273 	inet->inet_dport = 0;
274 	return err;
275 }
276 EXPORT_SYMBOL(tcp_v4_connect);
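/* Illustrative user-space counterpart (a sketch of typical usage, not part
 * of this file): tcp_v4_connect() is reached via inet_stream_connect()
 * when an application issues a plain connect() on an AF_INET stream
 * socket, e.g.:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * The code above then routes the destination, picks a source port in
 * inet_hash_connect(), chooses an ISN and sends the SYN via tcp_connect().
 */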
277 
278 /*
279  * This routine does path mtu discovery as defined in RFC1191.
280  */
281 static void do_pmtu_discovery(struct sock *sk, const struct iphdr *iph, u32 mtu)
282 {
283 	struct dst_entry *dst;
284 	struct inet_sock *inet = inet_sk(sk);
285 
286 	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
287 	 * sent out by Linux are always < 576 bytes, so they should go through
288 	 * unfragmented).
289 	 */
290 	if (sk->sk_state == TCP_LISTEN)
291 		return;
292 
293 	/* We don't check in the dst entry whether PMTU discovery is forbidden
294 	 * on this route. We just assume that no packet-too-big packets
295 	 * are sent back when PMTU discovery is not active.
296 	 * There is a small race when the user changes this flag in the
297 	 * route, but I think that's acceptable.
298 	 */
299 	if ((dst = __sk_dst_check(sk, 0)) == NULL)
300 		return;
301 
302 	dst->ops->update_pmtu(dst, mtu);
303 
304 	/* Something is about to go wrong... Remember the soft error
305 	 * in case this connection is not able to recover.
306 	 */
307 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
308 		sk->sk_err_soft = EMSGSIZE;
309 
310 	mtu = dst_mtu(dst);
311 
312 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
313 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
314 		tcp_sync_mss(sk, mtu);
315 
316 		/* Resend the TCP packet because it's
317 		 * clear that the old packet has been
318 		 * dropped. This is the new "fast" path mtu
319 		 * discovery.
320 		 */
321 		tcp_simple_retransmit(sk);
322 	} /* else let the usual retransmit timer handle it */
323 }
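/* Worked example (illustrative): if a router on the path reports an MTU of
 * 1400 via ICMP_FRAG_NEEDED, tcp_v4_err() below passes mtu = 1400 into this
 * function, tcp_sync_mss() shrinks icsk_pmtu_cookie, and the effective MSS
 * drops to roughly 1400 - 40 = 1360 bytes for a bare IPv4 + TCP header
 * (less when options such as timestamps are in use).
 */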
324 
325 /*
326  * This routine is called by the ICMP module when it gets some
327  * sort of error condition.  If err < 0 then the socket should
328  * be closed and the error returned to the user.  If err > 0
329  * it's just the icmp type << 8 | icmp code.  After adjustment
330  * header points to the first 8 bytes of the tcp header.  We need
331  * to find the appropriate port.
332  *
333  * The locking strategy used here is very "optimistic". When
334  * someone else accesses the socket the ICMP is just dropped
335  * and for some paths there is no check at all.
336  * A more general error queue to queue errors for later handling
337  * is probably better.
338  *
339  */
340 
341 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
342 {
343 	const struct iphdr *iph = (const struct iphdr *)icmp_skb->data;
344 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
345 	struct inet_connection_sock *icsk;
346 	struct tcp_sock *tp;
347 	struct inet_sock *inet;
348 	const int type = icmp_hdr(icmp_skb)->type;
349 	const int code = icmp_hdr(icmp_skb)->code;
350 	struct sock *sk;
351 	struct sk_buff *skb;
352 	__u32 seq;
353 	__u32 remaining;
354 	int err;
355 	struct net *net = dev_net(icmp_skb->dev);
356 
357 	if (icmp_skb->len < (iph->ihl << 2) + 8) {
358 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
359 		return;
360 	}
361 
362 	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
363 			iph->saddr, th->source, inet_iif(icmp_skb));
364 	if (!sk) {
365 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
366 		return;
367 	}
368 	if (sk->sk_state == TCP_TIME_WAIT) {
369 		inet_twsk_put(inet_twsk(sk));
370 		return;
371 	}
372 
373 	bh_lock_sock(sk);
374 	/* If too many ICMPs get dropped on busy
375 	 * servers this needs to be solved differently.
376 	 */
377 	if (sock_owned_by_user(sk))
378 		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
379 
380 	if (sk->sk_state == TCP_CLOSE)
381 		goto out;
382 
383 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
384 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
385 		goto out;
386 	}
387 
388 	icsk = inet_csk(sk);
389 	tp = tcp_sk(sk);
390 	seq = ntohl(th->seq);
391 	if (sk->sk_state != TCP_LISTEN &&
392 	    !between(seq, tp->snd_una, tp->snd_nxt)) {
393 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
394 		goto out;
395 	}
396 
397 	switch (type) {
398 	case ICMP_SOURCE_QUENCH:
399 		/* Just silently ignore these. */
400 		goto out;
401 	case ICMP_PARAMETERPROB:
402 		err = EPROTO;
403 		break;
404 	case ICMP_DEST_UNREACH:
405 		if (code > NR_ICMP_UNREACH)
406 			goto out;
407 
408 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
409 			if (!sock_owned_by_user(sk))
410 				do_pmtu_discovery(sk, iph, info);
411 			goto out;
412 		}
413 
414 		err = icmp_err_convert[code].errno;
415 		/* check whether icmp_skb allows reverting the backoff
416 		 * (see draft-zimmermann-tcp-lcd) */
417 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
418 			break;
419 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
420 		    !icsk->icsk_backoff)
421 			break;
422 
423 		if (sock_owned_by_user(sk))
424 			break;
425 
426 		icsk->icsk_backoff--;
427 		inet_csk(sk)->icsk_rto = (tp->srtt ? __tcp_set_rto(tp) :
428 			TCP_TIMEOUT_INIT) << icsk->icsk_backoff;
429 		tcp_bound_rto(sk);
430 
431 		skb = tcp_write_queue_head(sk);
432 		BUG_ON(!skb);
433 
434 		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
435 				tcp_time_stamp - TCP_SKB_CB(skb)->when);
436 
437 		if (remaining) {
438 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
439 						  remaining, TCP_RTO_MAX);
440 		} else {
441 			/* RTO revert clocked out retransmission.
442 			 * Will retransmit now */
443 			tcp_retransmit_timer(sk);
444 		}
445 
446 		break;
447 	case ICMP_TIME_EXCEEDED:
448 		err = EHOSTUNREACH;
449 		break;
450 	default:
451 		goto out;
452 	}
453 
454 	switch (sk->sk_state) {
455 		struct request_sock *req, **prev;
456 	case TCP_LISTEN:
457 		if (sock_owned_by_user(sk))
458 			goto out;
459 
460 		req = inet_csk_search_req(sk, &prev, th->dest,
461 					  iph->daddr, iph->saddr);
462 		if (!req)
463 			goto out;
464 
465 		/* ICMPs are not backlogged, hence we cannot get
466 		   an established socket here.
467 		 */
468 		WARN_ON(req->sk);
469 
470 		if (seq != tcp_rsk(req)->snt_isn) {
471 			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
472 			goto out;
473 		}
474 
475 		/*
476 		 * Still in SYN_RECV, just remove it silently.
477 		 * There is no good way to pass the error to the newly
478 		 * created socket, and POSIX does not want network
479 		 * errors returned from accept().
480 		 */
481 		inet_csk_reqsk_queue_drop(sk, req, prev);
482 		goto out;
483 
484 	case TCP_SYN_SENT:
485 	case TCP_SYN_RECV:  /* Cannot happen normally.
486 			       It can, for example, if SYNs crossed.
487 			     */
488 		if (!sock_owned_by_user(sk)) {
489 			sk->sk_err = err;
490 
491 			sk->sk_error_report(sk);
492 
493 			tcp_done(sk);
494 		} else {
495 			sk->sk_err_soft = err;
496 		}
497 		goto out;
498 	}
499 
500 	/* If we've already connected we will keep trying
501 	 * until we time out, or the user gives up.
502 	 *
503 	 * RFC 1122 4.2.3.9 allows us to treat as hard errors
504 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
505 	 * but that is obsoleted by PMTU discovery).
506 	 *
507 	 * Note that in the modern internet, where routing is unreliable
508 	 * and broken firewalls sit in every dark corner sending random
509 	 * errors ordered by their masters, even these two messages have
510 	 * finally lost their original sense (even Linux sends invalid PORT_UNREACHs).
511 	 *
512 	 * Now we are in compliance with RFCs.
513 	 *							--ANK (980905)
514 	 */
515 
516 	inet = inet_sk(sk);
517 	if (!sock_owned_by_user(sk) && inet->recverr) {
518 		sk->sk_err = err;
519 		sk->sk_error_report(sk);
520 	} else	{ /* Only an error on timeout */
521 		sk->sk_err_soft = err;
522 	}
523 
524 out:
525 	bh_unlock_sock(sk);
526 	sock_put(sk);
527 }
528 
529 static void __tcp_v4_send_check(struct sk_buff *skb,
530 				__be32 saddr, __be32 daddr)
531 {
532 	struct tcphdr *th = tcp_hdr(skb);
533 
534 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
535 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
536 		skb->csum_start = skb_transport_header(skb) - skb->head;
537 		skb->csum_offset = offsetof(struct tcphdr, check);
538 	} else {
539 		th->check = tcp_v4_check(skb->len, saddr, daddr,
540 					 csum_partial(th,
541 						      th->doff << 2,
542 						      skb->csum));
543 	}
544 }
545 
546 /* This routine computes an IPv4 TCP checksum. */
547 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
548 {
549 	const struct inet_sock *inet = inet_sk(sk);
550 
551 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
552 }
553 EXPORT_SYMBOL(tcp_v4_send_check);
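/* A descriptive sketch of the two branches in __tcp_v4_send_check() above
 * (not normative): with CHECKSUM_PARTIAL the header is seeded with only the
 * complemented pseudo-header sum, and csum_start/csum_offset tell the device
 * (or the software fallback) where to finish the job:
 *
 *	th->check = ~csum(pseudo-header);	// hardware folds in the rest
 *
 * otherwise the full checksum over pseudo-header, TCP header and payload is
 * completed here in software.
 */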
554 
555 int tcp_v4_gso_send_check(struct sk_buff *skb)
556 {
557 	const struct iphdr *iph;
558 	struct tcphdr *th;
559 
560 	if (!pskb_may_pull(skb, sizeof(*th)))
561 		return -EINVAL;
562 
563 	iph = ip_hdr(skb);
564 	th = tcp_hdr(skb);
565 
566 	th->check = 0;
567 	skb->ip_summed = CHECKSUM_PARTIAL;
568 	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
569 	return 0;
570 }
571 
572 /*
573  *	This routine will send an RST to the other tcp.
574  *
575  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL, etc.)
576  *		      for the reset?
577  *	Answer: if a packet caused the RST, it is not for a socket
578  *		existing in our system; if it is matched to a socket,
579  *		it is just a duplicate segment or a bug in the other side's
580  *		TCP. So we build the reply based only on the parameters
581  *		that arrived with the segment.
582  *	Exception: precedence violation. We do not implement it in any case.
583  */
584 
585 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
586 {
587 	const struct tcphdr *th = tcp_hdr(skb);
588 	struct {
589 		struct tcphdr th;
590 #ifdef CONFIG_TCP_MD5SIG
591 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
592 #endif
593 	} rep;
594 	struct ip_reply_arg arg;
595 #ifdef CONFIG_TCP_MD5SIG
596 	struct tcp_md5sig_key *key;
597 	const __u8 *hash_location = NULL;
598 	unsigned char newhash[16];
599 	int genhash;
600 	struct sock *sk1 = NULL;
601 #endif
602 	struct net *net;
603 
604 	/* Never send a reset in response to a reset. */
605 	if (th->rst)
606 		return;
607 
608 	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
609 		return;
610 
611 	/* Swap the send and the receive. */
612 	memset(&rep, 0, sizeof(rep));
613 	rep.th.dest   = th->source;
614 	rep.th.source = th->dest;
615 	rep.th.doff   = sizeof(struct tcphdr) / 4;
616 	rep.th.rst    = 1;
617 
618 	if (th->ack) {
619 		rep.th.seq = th->ack_seq;
620 	} else {
621 		rep.th.ack = 1;
622 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
623 				       skb->len - (th->doff << 2));
624 	}
625 
626 	memset(&arg, 0, sizeof(arg));
627 	arg.iov[0].iov_base = (unsigned char *)&rep;
628 	arg.iov[0].iov_len  = sizeof(rep.th);
629 
630 #ifdef CONFIG_TCP_MD5SIG
631 	hash_location = tcp_parse_md5sig_option(th);
632 	if (!sk && hash_location) {
633 		/*
634 		 * The active side is lost. Try to find the listening socket
635 		 * through the source port, and then find the md5 key through
636 		 * the listening socket. We do not lose security here:
637 		 * the incoming packet is checked against the md5 hash of the
638 		 * key we find; no RST is generated if the md5 hash doesn't match.
639 		 */
640 		sk1 = __inet_lookup_listener(dev_net(skb_dst(skb)->dev),
641 					     &tcp_hashinfo, ip_hdr(skb)->daddr,
642 					     ntohs(th->source), inet_iif(skb));
643 		/* don't send an RST if no key is found */
644 		if (!sk1)
645 			return;
646 		rcu_read_lock();
647 		key = tcp_md5_do_lookup(sk1, (union tcp_md5_addr *)
648 					&ip_hdr(skb)->saddr, AF_INET);
649 		if (!key)
650 			goto release_sk1;
651 
652 		genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, NULL, skb);
653 		if (genhash || memcmp(hash_location, newhash, 16) != 0)
654 			goto release_sk1;
655 	} else {
656 		key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
657 					     &ip_hdr(skb)->saddr,
658 					     AF_INET) : NULL;
659 	}
660 
661 	if (key) {
662 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
663 				   (TCPOPT_NOP << 16) |
664 				   (TCPOPT_MD5SIG << 8) |
665 				   TCPOLEN_MD5SIG);
666 		/* Update length and the length the header thinks exists */
667 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
668 		rep.th.doff = arg.iov[0].iov_len / 4;
669 
670 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
671 				     key, ip_hdr(skb)->saddr,
672 				     ip_hdr(skb)->daddr, &rep.th);
673 	}
674 #endif
675 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
676 				      ip_hdr(skb)->saddr, /* XXX */
677 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
678 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
679 	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
680 	/* When the socket is gone, all binding information is lost and
681 	 * routing might fail. No choice here: if we choose to force the
682 	 * input interface, we will misroute in the case of an asymmetric route.
683 	 */
684 	if (sk)
685 		arg.bound_dev_if = sk->sk_bound_dev_if;
686 
687 	net = dev_net(skb_dst(skb)->dev);
688 	arg.tos = ip_hdr(skb)->tos;
689 	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
690 		      &arg, arg.iov[0].iov_len);
691 
692 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
693 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
694 
695 #ifdef CONFIG_TCP_MD5SIG
696 release_sk1:
697 	if (sk1) {
698 		rcu_read_unlock();
699 		sock_put(sk1);
700 	}
701 #endif
702 }
703 
704 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
705    outside of socket context, is certainly ugly. What can I do?
706  */
707 
708 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
709 			    u32 win, u32 ts, int oif,
710 			    struct tcp_md5sig_key *key,
711 			    int reply_flags, u8 tos)
712 {
713 	const struct tcphdr *th = tcp_hdr(skb);
714 	struct {
715 		struct tcphdr th;
716 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
717 #ifdef CONFIG_TCP_MD5SIG
718 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
719 #endif
720 			];
721 	} rep;
722 	struct ip_reply_arg arg;
723 	struct net *net = dev_net(skb_dst(skb)->dev);
724 
725 	memset(&rep.th, 0, sizeof(struct tcphdr));
726 	memset(&arg, 0, sizeof(arg));
727 
728 	arg.iov[0].iov_base = (unsigned char *)&rep;
729 	arg.iov[0].iov_len  = sizeof(rep.th);
730 	if (ts) {
731 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
732 				   (TCPOPT_TIMESTAMP << 8) |
733 				   TCPOLEN_TIMESTAMP);
734 		rep.opt[1] = htonl(tcp_time_stamp);
735 		rep.opt[2] = htonl(ts);
736 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
737 	}
738 
739 	/* Swap the send and the receive. */
740 	rep.th.dest    = th->source;
741 	rep.th.source  = th->dest;
742 	rep.th.doff    = arg.iov[0].iov_len / 4;
743 	rep.th.seq     = htonl(seq);
744 	rep.th.ack_seq = htonl(ack);
745 	rep.th.ack     = 1;
746 	rep.th.window  = htons(win);
747 
748 #ifdef CONFIG_TCP_MD5SIG
749 	if (key) {
750 		int offset = (ts) ? 3 : 0;
751 
752 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
753 					  (TCPOPT_NOP << 16) |
754 					  (TCPOPT_MD5SIG << 8) |
755 					  TCPOLEN_MD5SIG);
756 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
757 		rep.th.doff = arg.iov[0].iov_len/4;
758 
759 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
760 				    key, ip_hdr(skb)->saddr,
761 				    ip_hdr(skb)->daddr, &rep.th);
762 	}
763 #endif
764 	arg.flags = reply_flags;
765 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
766 				      ip_hdr(skb)->saddr, /* XXX */
767 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
768 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
769 	if (oif)
770 		arg.bound_dev_if = oif;
771 	arg.tos = tos;
772 	ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr,
773 		      &arg, arg.iov[0].iov_len);
774 
775 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
776 }
777 
778 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
779 {
780 	struct inet_timewait_sock *tw = inet_twsk(sk);
781 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
782 
783 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
784 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
785 			tcptw->tw_ts_recent,
786 			tw->tw_bound_dev_if,
787 			tcp_twsk_md5_key(tcptw),
788 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0,
789 			tw->tw_tos
790 			);
791 
792 	inet_twsk_put(tw);
793 }
794 
795 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
796 				  struct request_sock *req)
797 {
798 	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
799 			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
800 			req->ts_recent,
801 			0,
802 			tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
803 					  AF_INET),
804 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
805 			ip_hdr(skb)->tos);
806 }
807 
808 /*
809  *	Send a SYN-ACK after having received a SYN.
810  *	This still operates on a request_sock only, not on a big
811  *	socket.
812  */
813 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
814 			      struct request_sock *req,
815 			      struct request_values *rvp)
816 {
817 	const struct inet_request_sock *ireq = inet_rsk(req);
818 	struct flowi4 fl4;
819 	int err = -1;
820 	struct sk_buff * skb;
821 
822 	/* First, grab a route. */
823 	if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
824 		return -1;
825 
826 	skb = tcp_make_synack(sk, dst, req, rvp);
827 
828 	if (skb) {
829 		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
830 
831 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
832 					    ireq->rmt_addr,
833 					    ireq->opt);
834 		err = net_xmit_eval(err);
835 	}
836 
837 	dst_release(dst);
838 	return err;
839 }
840 
841 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
842 			      struct request_values *rvp)
843 {
844 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
845 	return tcp_v4_send_synack(sk, NULL, req, rvp);
846 }
847 
848 /*
849  *	IPv4 request_sock destructor.
850  */
851 static void tcp_v4_reqsk_destructor(struct request_sock *req)
852 {
853 	kfree(inet_rsk(req)->opt);
854 }
855 
856 /*
857  * Return 1 if a syncookie should be sent
858  */
859 int tcp_syn_flood_action(struct sock *sk,
860 			 const struct sk_buff *skb,
861 			 const char *proto)
862 {
863 	const char *msg = "Dropping request";
864 	int want_cookie = 0;
865 	struct listen_sock *lopt;
866 
867 
868 
869 #ifdef CONFIG_SYN_COOKIES
870 	if (sysctl_tcp_syncookies) {
871 		msg = "Sending cookies";
872 		want_cookie = 1;
873 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
874 	} else
875 #endif
876 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
877 
878 	lopt = inet_csk(sk)->icsk_accept_queue.listen_opt;
879 	if (!lopt->synflood_warned) {
880 		lopt->synflood_warned = 1;
881 		pr_info("%s: Possible SYN flooding on port %d. %s.  Check SNMP counters.\n",
882 			proto, ntohs(tcp_hdr(skb)->dest), msg);
883 	}
884 	return want_cookie;
885 }
886 EXPORT_SYMBOL(tcp_syn_flood_action);
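/* For reference (not part of this file): whether cookies are actually sent
 * is controlled by the net.ipv4.tcp_syncookies sysctl behind
 * sysctl_tcp_syncookies, e.g.:
 *
 *	# echo 1 > /proc/sys/net/ipv4/tcp_syncookies
 *
 * With cookies disabled this function still logs the warning above but
 * returns 0, and the caller simply drops the SYN.
 */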
887 
888 /*
889  * Save and compile IPv4 options into the request_sock if needed.
890  */
891 static struct ip_options_rcu *tcp_v4_save_options(struct sock *sk,
892 						  struct sk_buff *skb)
893 {
894 	const struct ip_options *opt = &(IPCB(skb)->opt);
895 	struct ip_options_rcu *dopt = NULL;
896 
897 	if (opt && opt->optlen) {
898 		int opt_size = sizeof(*dopt) + opt->optlen;
899 
900 		dopt = kmalloc(opt_size, GFP_ATOMIC);
901 		if (dopt) {
902 			if (ip_options_echo(&dopt->opt, skb)) {
903 				kfree(dopt);
904 				dopt = NULL;
905 			}
906 		}
907 	}
908 	return dopt;
909 }
910 
911 #ifdef CONFIG_TCP_MD5SIG
912 /*
913  * RFC2385 MD5 checksumming requires a mapping of
914  * IP address->MD5 Key.
915  * We need to maintain these in the sk structure.
916  */
917 
918 /* Find the Key structure for an address.  */
919 struct tcp_md5sig_key *tcp_md5_do_lookup(struct sock *sk,
920 					 const union tcp_md5_addr *addr,
921 					 int family)
922 {
923 	struct tcp_sock *tp = tcp_sk(sk);
924 	struct tcp_md5sig_key *key;
925 	struct hlist_node *pos;
926 	unsigned int size = sizeof(struct in_addr);
927 	struct tcp_md5sig_info *md5sig;
928 
929 	/* caller either holds rcu_read_lock() or socket lock */
930 	md5sig = rcu_dereference_check(tp->md5sig_info,
931 				       sock_owned_by_user(sk) ||
932 				       lockdep_is_held(&sk->sk_lock.slock));
933 	if (!md5sig)
934 		return NULL;
935 #if IS_ENABLED(CONFIG_IPV6)
936 	if (family == AF_INET6)
937 		size = sizeof(struct in6_addr);
938 #endif
939 	hlist_for_each_entry_rcu(key, pos, &md5sig->head, node) {
940 		if (key->family != family)
941 			continue;
942 		if (!memcmp(&key->addr, addr, size))
943 			return key;
944 	}
945 	return NULL;
946 }
947 EXPORT_SYMBOL(tcp_md5_do_lookup);
948 
949 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
950 					 struct sock *addr_sk)
951 {
952 	union tcp_md5_addr *addr;
953 
954 	addr = (union tcp_md5_addr *)&inet_sk(addr_sk)->inet_daddr;
955 	return tcp_md5_do_lookup(sk, addr, AF_INET);
956 }
957 EXPORT_SYMBOL(tcp_v4_md5_lookup);
958 
959 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
960 						      struct request_sock *req)
961 {
962 	union tcp_md5_addr *addr;
963 
964 	addr = (union tcp_md5_addr *)&inet_rsk(req)->rmt_addr;
965 	return tcp_md5_do_lookup(sk, addr, AF_INET);
966 }
967 
968 /* This can be called on a newly created socket, from other files */
969 int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
970 		   int family, const u8 *newkey, u8 newkeylen, gfp_t gfp)
971 {
972 	/* Add Key to the list */
973 	struct tcp_md5sig_key *key;
974 	struct tcp_sock *tp = tcp_sk(sk);
975 	struct tcp_md5sig_info *md5sig;
976 
977 	key = tcp_md5_do_lookup(sk, addr, family);
978 	if (key) {
979 		/* Pre-existing entry - just update that one. */
980 		memcpy(key->key, newkey, newkeylen);
981 		key->keylen = newkeylen;
982 		return 0;
983 	}
984 
985 	md5sig = rcu_dereference_protected(tp->md5sig_info,
986 					   sock_owned_by_user(sk));
987 	if (!md5sig) {
988 		md5sig = kmalloc(sizeof(*md5sig), gfp);
989 		if (!md5sig)
990 			return -ENOMEM;
991 
992 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
993 		INIT_HLIST_HEAD(&md5sig->head);
994 		rcu_assign_pointer(tp->md5sig_info, md5sig);
995 	}
996 
997 	key = sock_kmalloc(sk, sizeof(*key), gfp);
998 	if (!key)
999 		return -ENOMEM;
1000 	if (hlist_empty(&md5sig->head) && !tcp_alloc_md5sig_pool(sk)) {
1001 		sock_kfree_s(sk, key, sizeof(*key));
1002 		return -ENOMEM;
1003 	}
1004 
1005 	memcpy(key->key, newkey, newkeylen);
1006 	key->keylen = newkeylen;
1007 	key->family = family;
1008 	memcpy(&key->addr, addr,
1009 	       (family == AF_INET6) ? sizeof(struct in6_addr) :
1010 				      sizeof(struct in_addr));
1011 	hlist_add_head_rcu(&key->node, &md5sig->head);
1012 	return 0;
1013 }
1014 EXPORT_SYMBOL(tcp_md5_do_add);
1015 
1016 int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family)
1017 {
1018 	struct tcp_sock *tp = tcp_sk(sk);
1019 	struct tcp_md5sig_key *key;
1020 	struct tcp_md5sig_info *md5sig;
1021 
1022 	key = tcp_md5_do_lookup(sk, addr, family);
1023 	if (!key)
1024 		return -ENOENT;
1025 	hlist_del_rcu(&key->node);
1026 	atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1027 	kfree_rcu(key, rcu);
1028 	md5sig = rcu_dereference_protected(tp->md5sig_info,
1029 					   sock_owned_by_user(sk));
1030 	if (hlist_empty(&md5sig->head))
1031 		tcp_free_md5sig_pool();
1032 	return 0;
1033 }
1034 EXPORT_SYMBOL(tcp_md5_do_del);
1035 
1036 void tcp_clear_md5_list(struct sock *sk)
1037 {
1038 	struct tcp_sock *tp = tcp_sk(sk);
1039 	struct tcp_md5sig_key *key;
1040 	struct hlist_node *pos, *n;
1041 	struct tcp_md5sig_info *md5sig;
1042 
1043 	md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
1044 
1045 	if (!hlist_empty(&md5sig->head))
1046 		tcp_free_md5sig_pool();
1047 	hlist_for_each_entry_safe(key, pos, n, &md5sig->head, node) {
1048 		hlist_del_rcu(&key->node);
1049 		atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
1050 		kfree_rcu(key, rcu);
1051 	}
1052 }
1053 
1054 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
1055 				 int optlen)
1056 {
1057 	struct tcp_md5sig cmd;
1058 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1059 
1060 	if (optlen < sizeof(cmd))
1061 		return -EINVAL;
1062 
1063 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1064 		return -EFAULT;
1065 
1066 	if (sin->sin_family != AF_INET)
1067 		return -EINVAL;
1068 
1069 	if (!cmd.tcpm_key || !cmd.tcpm_keylen)
1070 		return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1071 				      AF_INET);
1072 
1073 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1074 		return -EINVAL;
1075 
1076 	return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin->sin_addr.s_addr,
1077 			      AF_INET, cmd.tcpm_key, cmd.tcpm_keylen,
1078 			      GFP_KERNEL);
1079 }
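/* Illustrative user-space usage (a sketch assuming the standard TCP_MD5SIG
 * socket option from <linux/tcp.h>; not part of this file): the command
 * parsed above is produced by something like
 *
 *	struct tcp_md5sig md5 = {};
 *	struct sockaddr_in *peer = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	peer->sin_family = AF_INET;
 *	inet_pton(AF_INET, "198.51.100.7", &peer->sin_addr);
 *	md5.tcpm_keylen = 6;
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * A zero tcpm_keylen deletes any key configured for that peer, which is
 * the tcp_md5_do_del() branch above.
 */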
1080 
1081 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1082 					__be32 daddr, __be32 saddr, int nbytes)
1083 {
1084 	struct tcp4_pseudohdr *bp;
1085 	struct scatterlist sg;
1086 
1087 	bp = &hp->md5_blk.ip4;
1088 
1089 	/*
1090 	 * 1. the TCP pseudo-header (in the order: source IP address,
1091 	 * destination IP address, zero-padded protocol number, and
1092 	 * segment length)
1093 	 */
1094 	bp->saddr = saddr;
1095 	bp->daddr = daddr;
1096 	bp->pad = 0;
1097 	bp->protocol = IPPROTO_TCP;
1098 	bp->len = cpu_to_be16(nbytes);
1099 
1100 	sg_init_one(&sg, bp, sizeof(*bp));
1101 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1102 }
1103 
1104 static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
1105 			       __be32 daddr, __be32 saddr, const struct tcphdr *th)
1106 {
1107 	struct tcp_md5sig_pool *hp;
1108 	struct hash_desc *desc;
1109 
1110 	hp = tcp_get_md5sig_pool();
1111 	if (!hp)
1112 		goto clear_hash_noput;
1113 	desc = &hp->md5_desc;
1114 
1115 	if (crypto_hash_init(desc))
1116 		goto clear_hash;
1117 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1118 		goto clear_hash;
1119 	if (tcp_md5_hash_header(hp, th))
1120 		goto clear_hash;
1121 	if (tcp_md5_hash_key(hp, key))
1122 		goto clear_hash;
1123 	if (crypto_hash_final(desc, md5_hash))
1124 		goto clear_hash;
1125 
1126 	tcp_put_md5sig_pool();
1127 	return 0;
1128 
1129 clear_hash:
1130 	tcp_put_md5sig_pool();
1131 clear_hash_noput:
1132 	memset(md5_hash, 0, 16);
1133 	return 1;
1134 }
1135 
1136 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1137 			const struct sock *sk, const struct request_sock *req,
1138 			const struct sk_buff *skb)
1139 {
1140 	struct tcp_md5sig_pool *hp;
1141 	struct hash_desc *desc;
1142 	const struct tcphdr *th = tcp_hdr(skb);
1143 	__be32 saddr, daddr;
1144 
1145 	if (sk) {
1146 		saddr = inet_sk(sk)->inet_saddr;
1147 		daddr = inet_sk(sk)->inet_daddr;
1148 	} else if (req) {
1149 		saddr = inet_rsk(req)->loc_addr;
1150 		daddr = inet_rsk(req)->rmt_addr;
1151 	} else {
1152 		const struct iphdr *iph = ip_hdr(skb);
1153 		saddr = iph->saddr;
1154 		daddr = iph->daddr;
1155 	}
1156 
1157 	hp = tcp_get_md5sig_pool();
1158 	if (!hp)
1159 		goto clear_hash_noput;
1160 	desc = &hp->md5_desc;
1161 
1162 	if (crypto_hash_init(desc))
1163 		goto clear_hash;
1164 
1165 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1166 		goto clear_hash;
1167 	if (tcp_md5_hash_header(hp, th))
1168 		goto clear_hash;
1169 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1170 		goto clear_hash;
1171 	if (tcp_md5_hash_key(hp, key))
1172 		goto clear_hash;
1173 	if (crypto_hash_final(desc, md5_hash))
1174 		goto clear_hash;
1175 
1176 	tcp_put_md5sig_pool();
1177 	return 0;
1178 
1179 clear_hash:
1180 	tcp_put_md5sig_pool();
1181 clear_hash_noput:
1182 	memset(md5_hash, 0, 16);
1183 	return 1;
1184 }
1185 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
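/* For orientation (a descriptive sketch of what the helpers above feed to
 * MD5, per RFC 2385): the digest input is, in order,
 *
 *	MD5( pseudo-header,
 *	     TCP header with check = 0 and options excluded,
 *	     segment payload,
 *	     key )
 *
 * tcp_v4_md5_hash_hdr() is the data-less variant used for replies (RST,
 * ACK) built without a full socket.
 */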
1186 
1187 static int tcp_v4_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
1188 {
1189 	/*
1190 	 * This gets called for each TCP segment that arrives
1191 	 * so we want to be efficient.
1192 	 * We have 3 drop cases:
1193 	 * o No MD5 hash and one expected.
1194 	 * o MD5 hash and we're not expecting one.
1195 	 * o MD5 hash and it's wrong.
1196 	 */
1197 	const __u8 *hash_location = NULL;
1198 	struct tcp_md5sig_key *hash_expected;
1199 	const struct iphdr *iph = ip_hdr(skb);
1200 	const struct tcphdr *th = tcp_hdr(skb);
1201 	int genhash;
1202 	unsigned char newhash[16];
1203 
1204 	hash_expected = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&iph->saddr,
1205 					  AF_INET);
1206 	hash_location = tcp_parse_md5sig_option(th);
1207 
1208 	/* We've parsed the options - do we have a hash? */
1209 	if (!hash_expected && !hash_location)
1210 		return 0;
1211 
1212 	if (hash_expected && !hash_location) {
1213 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1214 		return 1;
1215 	}
1216 
1217 	if (!hash_expected && hash_location) {
1218 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1219 		return 1;
1220 	}
1221 
1222 	/* Okay, so this is hash_expected and hash_location -
1223 	 * so we need to calculate the checksum.
1224 	 */
1225 	genhash = tcp_v4_md5_hash_skb(newhash,
1226 				      hash_expected,
1227 				      NULL, NULL, skb);
1228 
1229 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1230 		if (net_ratelimit()) {
1231 			pr_info("MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1232 				&iph->saddr, ntohs(th->source),
1233 				&iph->daddr, ntohs(th->dest),
1234 				genhash ? " tcp_v4_calc_md5_hash failed" : "");
1235 		}
1236 		return 1;
1237 	}
1238 	return 0;
1239 }
1240 
1241 #endif
1242 
1243 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1244 	.family		=	PF_INET,
1245 	.obj_size	=	sizeof(struct tcp_request_sock),
1246 	.rtx_syn_ack	=	tcp_v4_rtx_synack,
1247 	.send_ack	=	tcp_v4_reqsk_send_ack,
1248 	.destructor	=	tcp_v4_reqsk_destructor,
1249 	.send_reset	=	tcp_v4_send_reset,
1250 	.syn_ack_timeout = 	tcp_syn_ack_timeout,
1251 };
1252 
1253 #ifdef CONFIG_TCP_MD5SIG
1254 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1255 	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1256 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1257 };
1258 #endif
1259 
1260 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1261 {
1262 	struct tcp_extend_values tmp_ext;
1263 	struct tcp_options_received tmp_opt;
1264 	const u8 *hash_location;
1265 	struct request_sock *req;
1266 	struct inet_request_sock *ireq;
1267 	struct tcp_sock *tp = tcp_sk(sk);
1268 	struct dst_entry *dst = NULL;
1269 	__be32 saddr = ip_hdr(skb)->saddr;
1270 	__be32 daddr = ip_hdr(skb)->daddr;
1271 	__u32 isn = TCP_SKB_CB(skb)->when;
1272 	int want_cookie = 0;
1273 
1274 	/* Never answer SYNs sent to broadcast or multicast addresses */
1275 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1276 		goto drop;
1277 
1278 	/* TW buckets are converted to open requests without
1279 	 * limitation: they conserve resources and the peer is
1280 	 * evidently a real one.
1281 	 */
1282 	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1283 		want_cookie = tcp_syn_flood_action(sk, skb, "TCP");
1284 		if (!want_cookie)
1285 			goto drop;
1286 	}
1287 
1288 	/* The accept backlog is full. If we have already queued enough
1289 	 * warm entries in the syn queue, drop this request. That is better
1290 	 * than clogging the syn queue with openreqs with exponentially
1291 	 * increasing timeouts.
1292 	 */
1293 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1294 		goto drop;
1295 
1296 	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1297 	if (!req)
1298 		goto drop;
1299 
1300 #ifdef CONFIG_TCP_MD5SIG
1301 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1302 #endif
1303 
1304 	tcp_clear_options(&tmp_opt);
1305 	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1306 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1307 	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1308 
1309 	if (tmp_opt.cookie_plus > 0 &&
1310 	    tmp_opt.saw_tstamp &&
1311 	    !tp->rx_opt.cookie_out_never &&
1312 	    (sysctl_tcp_cookie_size > 0 ||
1313 	     (tp->cookie_values != NULL &&
1314 	      tp->cookie_values->cookie_desired > 0))) {
1315 		u8 *c;
1316 		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1317 		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1318 
1319 		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1320 			goto drop_and_release;
1321 
1322 		/* Secret recipe starts with IP addresses */
1323 		*mess++ ^= (__force u32)daddr;
1324 		*mess++ ^= (__force u32)saddr;
1325 
1326 		/* plus variable length Initiator Cookie */
1327 		c = (u8 *)mess;
1328 		while (l-- > 0)
1329 			*c++ ^= *hash_location++;
1330 
1331 		want_cookie = 0;	/* not our kind of cookie */
1332 		tmp_ext.cookie_out_never = 0; /* false */
1333 		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1334 	} else if (!tp->rx_opt.cookie_in_always) {
1335 		/* redundant indications, but ensure initialization. */
1336 		tmp_ext.cookie_out_never = 1; /* true */
1337 		tmp_ext.cookie_plus = 0;
1338 	} else {
1339 		goto drop_and_release;
1340 	}
1341 	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1342 
1343 	if (want_cookie && !tmp_opt.saw_tstamp)
1344 		tcp_clear_options(&tmp_opt);
1345 
1346 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1347 	tcp_openreq_init(req, &tmp_opt, skb);
1348 
1349 	ireq = inet_rsk(req);
1350 	ireq->loc_addr = daddr;
1351 	ireq->rmt_addr = saddr;
1352 	ireq->no_srccheck = inet_sk(sk)->transparent;
1353 	ireq->opt = tcp_v4_save_options(sk, skb);
1354 
1355 	if (security_inet_conn_request(sk, skb, req))
1356 		goto drop_and_free;
1357 
1358 	if (!want_cookie || tmp_opt.tstamp_ok)
1359 		TCP_ECN_create_request(req, tcp_hdr(skb));
1360 
1361 	if (want_cookie) {
1362 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1363 		req->cookie_ts = tmp_opt.tstamp_ok;
1364 	} else if (!isn) {
1365 		struct inet_peer *peer = NULL;
1366 		struct flowi4 fl4;
1367 
1368 		/* VJ's idea. We save last timestamp seen
1369 		 * from the destination in peer table, when entering
1370 		 * state TIME-WAIT, and check against it before
1371 		 * accepting new connection request.
1372 		 *
1373 		 * If "isn" is not zero, this request hit alive
1374 		 * timewait bucket, so that all the necessary checks
1375 		 * are made in the function processing timewait state.
1376 		 */
1377 		if (tmp_opt.saw_tstamp &&
1378 		    tcp_death_row.sysctl_tw_recycle &&
1379 		    (dst = inet_csk_route_req(sk, &fl4, req)) != NULL &&
1380 		    fl4.daddr == saddr &&
1381 		    (peer = rt_get_peer((struct rtable *)dst, fl4.daddr)) != NULL) {
1382 			inet_peer_refcheck(peer);
1383 			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1384 			    (s32)(peer->tcp_ts - req->ts_recent) >
1385 							TCP_PAWS_WINDOW) {
1386 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1387 				goto drop_and_release;
1388 			}
1389 		}
1390 		/* Kill the following clause, if you dislike this way. */
1391 		else if (!sysctl_tcp_syncookies &&
1392 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1393 			  (sysctl_max_syn_backlog >> 2)) &&
1394 			 (!peer || !peer->tcp_ts_stamp) &&
1395 			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1396 			/* Without syncookies, the last quarter of the
1397 			 * backlog is filled with destinations proven
1398 			 * to be alive.
1399 			 * It means that we continue to communicate
1400 			 * with destinations already remembered
1401 			 * at the moment of the synflood.
1402 			 */
1403 			LIMIT_NETDEBUG(KERN_DEBUG pr_fmt("drop open request from %pI4/%u\n"),
1404 				       &saddr, ntohs(tcp_hdr(skb)->source));
1405 			goto drop_and_release;
1406 		}
1407 
1408 		isn = tcp_v4_init_sequence(skb);
1409 	}
1410 	tcp_rsk(req)->snt_isn = isn;
1411 	tcp_rsk(req)->snt_synack = tcp_time_stamp;
1412 
1413 	if (tcp_v4_send_synack(sk, dst, req,
1414 			       (struct request_values *)&tmp_ext) ||
1415 	    want_cookie)
1416 		goto drop_and_free;
1417 
1418 	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1419 	return 0;
1420 
1421 drop_and_release:
1422 	dst_release(dst);
1423 drop_and_free:
1424 	reqsk_free(req);
1425 drop:
1426 	return 0;
1427 }
1428 EXPORT_SYMBOL(tcp_v4_conn_request);
1429 
1430 
1431 /*
1432  * The three way handshake has completed - we got a valid synack -
1433  * now create the new socket.
1434  */
1435 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1436 				  struct request_sock *req,
1437 				  struct dst_entry *dst)
1438 {
1439 	struct inet_request_sock *ireq;
1440 	struct inet_sock *newinet;
1441 	struct tcp_sock *newtp;
1442 	struct sock *newsk;
1443 #ifdef CONFIG_TCP_MD5SIG
1444 	struct tcp_md5sig_key *key;
1445 #endif
1446 	struct ip_options_rcu *inet_opt;
1447 
1448 	if (sk_acceptq_is_full(sk))
1449 		goto exit_overflow;
1450 
1451 	newsk = tcp_create_openreq_child(sk, req, skb);
1452 	if (!newsk)
1453 		goto exit_nonewsk;
1454 
1455 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1456 
1457 	newtp		      = tcp_sk(newsk);
1458 	newinet		      = inet_sk(newsk);
1459 	ireq		      = inet_rsk(req);
1460 	newinet->inet_daddr   = ireq->rmt_addr;
1461 	newinet->inet_rcv_saddr = ireq->loc_addr;
1462 	newinet->inet_saddr	      = ireq->loc_addr;
1463 	inet_opt	      = ireq->opt;
1464 	rcu_assign_pointer(newinet->inet_opt, inet_opt);
1465 	ireq->opt	      = NULL;
1466 	newinet->mc_index     = inet_iif(skb);
1467 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1468 	newinet->rcv_tos      = ip_hdr(skb)->tos;
1469 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1470 	if (inet_opt)
1471 		inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
1472 	newinet->inet_id = newtp->write_seq ^ jiffies;
1473 
1474 	if (!dst) {
1475 		dst = inet_csk_route_child_sock(sk, newsk, req);
1476 		if (!dst)
1477 			goto put_and_exit;
1478 	} else {
1479 		/* syncookie case : see end of cookie_v4_check() */
1480 	}
1481 	sk_setup_caps(newsk, dst);
1482 
1483 	tcp_mtup_init(newsk);
1484 	tcp_sync_mss(newsk, dst_mtu(dst));
1485 	newtp->advmss = dst_metric_advmss(dst);
1486 	if (tcp_sk(sk)->rx_opt.user_mss &&
1487 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1488 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1489 
1490 	tcp_initialize_rcv_mss(newsk);
1491 	if (tcp_rsk(req)->snt_synack)
1492 		tcp_valid_rtt_meas(newsk,
1493 		    tcp_time_stamp - tcp_rsk(req)->snt_synack);
1494 	newtp->total_retrans = req->retrans;
1495 
1496 #ifdef CONFIG_TCP_MD5SIG
1497 	/* Copy over the MD5 key from the original socket */
1498 	key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&newinet->inet_daddr,
1499 				AF_INET);
1500 	if (key != NULL) {
1501 		/*
1502 		 * We're using one, so create a matching key
1503 		 * on the newsk structure. If we fail to get
1504 		 * memory, then we end up not copying the key
1505 		 * across. Shucks.
1506 		 */
1507 		tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newinet->inet_daddr,
1508 			       AF_INET, key->key, key->keylen, GFP_ATOMIC);
1509 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1510 	}
1511 #endif
1512 
1513 	if (__inet_inherit_port(sk, newsk) < 0)
1514 		goto put_and_exit;
1515 	__inet_hash_nolisten(newsk, NULL);
1516 
1517 	return newsk;
1518 
1519 exit_overflow:
1520 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1521 exit_nonewsk:
1522 	dst_release(dst);
1523 exit:
1524 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1525 	return NULL;
1526 put_and_exit:
1527 	inet_csk_prepare_forced_close(newsk);
1528 	tcp_done(newsk);
1529 	goto exit;
1530 }
1531 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
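/* Illustrative passive-open flow (a sketch of typical usage, not part of
 * this file): the listener that ends up here is created with the ordinary
 * socket calls, e.g.
 *
 *	struct sockaddr_in addr = {
 *		.sin_family      = AF_INET,
 *		.sin_port        = htons(8080),
 *		.sin_addr.s_addr = htonl(INADDR_ANY),
 *	};
 *	int lfd = socket(AF_INET, SOCK_STREAM, 0);
 *	bind(lfd, (struct sockaddr *)&addr, sizeof(addr));
 *	listen(lfd, 128);
 *	int cfd = accept(lfd, NULL, NULL);
 *
 * The incoming SYN is handled by tcp_v4_conn_request() above, the final ACK
 * of the handshake is matched to the request_sock in tcp_v4_hnd_req() below
 * via tcp_check_req(), and tcp_v4_syn_recv_sock() creates the child socket
 * that accept() eventually returns.
 */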
1532 
1533 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1534 {
1535 	struct tcphdr *th = tcp_hdr(skb);
1536 	const struct iphdr *iph = ip_hdr(skb);
1537 	struct sock *nsk;
1538 	struct request_sock **prev;
1539 	/* Find possible connection requests. */
1540 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1541 						       iph->saddr, iph->daddr);
1542 	if (req)
1543 		return tcp_check_req(sk, skb, req, prev);
1544 
1545 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1546 			th->source, iph->daddr, th->dest, inet_iif(skb));
1547 
1548 	if (nsk) {
1549 		if (nsk->sk_state != TCP_TIME_WAIT) {
1550 			bh_lock_sock(nsk);
1551 			return nsk;
1552 		}
1553 		inet_twsk_put(inet_twsk(nsk));
1554 		return NULL;
1555 	}
1556 
1557 #ifdef CONFIG_SYN_COOKIES
1558 	if (!th->syn)
1559 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1560 #endif
1561 	return sk;
1562 }
1563 
1564 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1565 {
1566 	const struct iphdr *iph = ip_hdr(skb);
1567 
1568 	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1569 		if (!tcp_v4_check(skb->len, iph->saddr,
1570 				  iph->daddr, skb->csum)) {
1571 			skb->ip_summed = CHECKSUM_UNNECESSARY;
1572 			return 0;
1573 		}
1574 	}
1575 
1576 	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1577 				       skb->len, IPPROTO_TCP, 0);
1578 
1579 	if (skb->len <= 76) {
1580 		return __skb_checksum_complete(skb);
1581 	}
1582 	return 0;
1583 }
1584 
1585 
1586 /* The socket must have its spinlock held when we get
1587  * here.
1588  *
1589  * We have a potential double-lock case here, so even when
1590  * doing backlog processing we use the BH locking scheme.
1591  * This is because we cannot sleep with the original spinlock
1592  * held.
1593  */
1594 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1595 {
1596 	struct sock *rsk;
1597 #ifdef CONFIG_TCP_MD5SIG
1598 	/*
1599 	 * We really want to reject the packet as early as possible
1600 	 * if:
1601 	 *  o We're expecting an MD5'd packet and there is no MD5 tcp option
1602 	 *  o There is an MD5 option and we're not expecting one
1603 	 */
1604 	if (tcp_v4_inbound_md5_hash(sk, skb))
1605 		goto discard;
1606 #endif
1607 
1608 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1609 		sock_rps_save_rxhash(sk, skb);
1610 		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1611 			rsk = sk;
1612 			goto reset;
1613 		}
1614 		return 0;
1615 	}
1616 
1617 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1618 		goto csum_err;
1619 
1620 	if (sk->sk_state == TCP_LISTEN) {
1621 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1622 		if (!nsk)
1623 			goto discard;
1624 
1625 		if (nsk != sk) {
1626 			sock_rps_save_rxhash(nsk, skb);
1627 			if (tcp_child_process(sk, nsk, skb)) {
1628 				rsk = nsk;
1629 				goto reset;
1630 			}
1631 			return 0;
1632 		}
1633 	} else
1634 		sock_rps_save_rxhash(sk, skb);
1635 
1636 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1637 		rsk = sk;
1638 		goto reset;
1639 	}
1640 	return 0;
1641 
1642 reset:
1643 	tcp_v4_send_reset(rsk, skb);
1644 discard:
1645 	kfree_skb(skb);
1646 	/* Be careful here. If this function gets more complicated and
1647 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1648 	 * might be destroyed here. This current version compiles correctly,
1649 	 * but you have been warned.
1650 	 */
1651 	return 0;
1652 
1653 csum_err:
1654 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1655 	goto discard;
1656 }
1657 EXPORT_SYMBOL(tcp_v4_do_rcv);
1658 
1659 /*
1660  *	From tcp_input.c
1661  */
1662 
1663 int tcp_v4_rcv(struct sk_buff *skb)
1664 {
1665 	const struct iphdr *iph;
1666 	const struct tcphdr *th;
1667 	struct sock *sk;
1668 	int ret;
1669 	struct net *net = dev_net(skb->dev);
1670 
1671 	if (skb->pkt_type != PACKET_HOST)
1672 		goto discard_it;
1673 
1674 	/* Count it even if it's bad */
1675 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1676 
1677 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1678 		goto discard_it;
1679 
1680 	th = tcp_hdr(skb);
1681 
1682 	if (th->doff < sizeof(struct tcphdr) / 4)
1683 		goto bad_packet;
1684 	if (!pskb_may_pull(skb, th->doff * 4))
1685 		goto discard_it;
1686 
1687 	/* An explanation is required here, I think.
1688 	 * Packet length and doff are validated by header prediction,
1689 	 * provided the case of th->doff == 0 is eliminated.
1690 	 * So, we defer the checks. */
1691 	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1692 		goto bad_packet;
1693 
1694 	th = tcp_hdr(skb);
1695 	iph = ip_hdr(skb);
1696 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1697 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1698 				    skb->len - th->doff * 4);
1699 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1700 	TCP_SKB_CB(skb)->when	 = 0;
1701 	TCP_SKB_CB(skb)->ip_dsfield = ipv4_get_dsfield(iph);
1702 	TCP_SKB_CB(skb)->sacked	 = 0;
1703 
1704 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1705 	if (!sk)
1706 		goto no_tcp_socket;
1707 
1708 process:
1709 	if (sk->sk_state == TCP_TIME_WAIT)
1710 		goto do_time_wait;
1711 
1712 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1713 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1714 		goto discard_and_relse;
1715 	}
1716 
1717 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1718 		goto discard_and_relse;
1719 	nf_reset(skb);
1720 
1721 	if (sk_filter(sk, skb))
1722 		goto discard_and_relse;
1723 
1724 	skb->dev = NULL;
1725 
1726 	bh_lock_sock_nested(sk);
1727 	ret = 0;
1728 	if (!sock_owned_by_user(sk)) {
1729 #ifdef CONFIG_NET_DMA
1730 		struct tcp_sock *tp = tcp_sk(sk);
1731 		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1732 			tp->ucopy.dma_chan = net_dma_find_channel();
1733 		if (tp->ucopy.dma_chan)
1734 			ret = tcp_v4_do_rcv(sk, skb);
1735 		else
1736 #endif
1737 		{
1738 			if (!tcp_prequeue(sk, skb))
1739 				ret = tcp_v4_do_rcv(sk, skb);
1740 		}
1741 	} else if (unlikely(sk_add_backlog(sk, skb))) {
1742 		bh_unlock_sock(sk);
1743 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1744 		goto discard_and_relse;
1745 	}
1746 	bh_unlock_sock(sk);
1747 
1748 	sock_put(sk);
1749 
1750 	return ret;
1751 
1752 no_tcp_socket:
1753 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1754 		goto discard_it;
1755 
1756 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1757 bad_packet:
1758 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1759 	} else {
1760 		tcp_v4_send_reset(NULL, skb);
1761 	}
1762 
1763 discard_it:
1764 	/* Discard frame. */
1765 	kfree_skb(skb);
1766 	return 0;
1767 
1768 discard_and_relse:
1769 	sock_put(sk);
1770 	goto discard_it;
1771 
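	/* TIME_WAIT handling: tcp_timewait_state_process() decides the fate
	 * of the segment.  A valid SYN (TCP_TW_SYN) may be redirected to a
	 * listening socket on the same port, reusing the old connection's
	 * slot; otherwise the timewait code answers with an ACK or a RST.
	 */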
1772 do_time_wait:
1773 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1774 		inet_twsk_put(inet_twsk(sk));
1775 		goto discard_it;
1776 	}
1777 
1778 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1779 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1780 		inet_twsk_put(inet_twsk(sk));
1781 		goto discard_it;
1782 	}
1783 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1784 	case TCP_TW_SYN: {
1785 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1786 							&tcp_hashinfo,
1787 							iph->daddr, th->dest,
1788 							inet_iif(skb));
1789 		if (sk2) {
1790 			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1791 			inet_twsk_put(inet_twsk(sk));
1792 			sk = sk2;
1793 			goto process;
1794 		}
1795 		/* Fall through to ACK */
1796 	}
1797 	case TCP_TW_ACK:
1798 		tcp_v4_timewait_ack(sk, skb);
1799 		break;
1800 	case TCP_TW_RST:
1801 		goto no_tcp_socket;
1802 	case TCP_TW_SUCCESS:;
1803 	}
1804 	goto discard_it;
1805 }
1806 
1807 struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1808 {
1809 	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1810 	struct inet_sock *inet = inet_sk(sk);
1811 	struct inet_peer *peer;
1812 
1813 	if (!rt ||
1814 	    inet->cork.fl.u.ip4.daddr != inet->inet_daddr) {
1815 		peer = inet_getpeer_v4(inet->inet_daddr, 1);
1816 		*release_it = true;
1817 	} else {
1818 		if (!rt->peer)
1819 			rt_bind_peer(rt, inet->inet_daddr, 1);
1820 		peer = rt->peer;
1821 		*release_it = false;
1822 	}
1823 
1824 	return peer;
1825 }
1826 EXPORT_SYMBOL(tcp_v4_get_peer);
1827 
1828 void *tcp_v4_tw_get_peer(struct sock *sk)
1829 {
1830 	const struct inet_timewait_sock *tw = inet_twsk(sk);
1831 
1832 	return inet_getpeer_v4(tw->tw_daddr, 1);
1833 }
1834 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1835 
1836 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1837 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1838 	.twsk_unique	= tcp_twsk_unique,
1839 	.twsk_destructor= tcp_twsk_destructor,
1840 	.twsk_getpeer	= tcp_v4_tw_get_peer,
1841 };
1842 
1843 const struct inet_connection_sock_af_ops ipv4_specific = {
1844 	.queue_xmit	   = ip_queue_xmit,
1845 	.send_check	   = tcp_v4_send_check,
1846 	.rebuild_header	   = inet_sk_rebuild_header,
1847 	.conn_request	   = tcp_v4_conn_request,
1848 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1849 	.get_peer	   = tcp_v4_get_peer,
1850 	.net_header_len	   = sizeof(struct iphdr),
1851 	.setsockopt	   = ip_setsockopt,
1852 	.getsockopt	   = ip_getsockopt,
1853 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1854 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1855 	.bind_conflict	   = inet_csk_bind_conflict,
1856 #ifdef CONFIG_COMPAT
1857 	.compat_setsockopt = compat_ip_setsockopt,
1858 	.compat_getsockopt = compat_ip_getsockopt,
1859 #endif
1860 };
1861 EXPORT_SYMBOL(ipv4_specific);
1862 
1863 #ifdef CONFIG_TCP_MD5SIG
1864 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1865 	.md5_lookup		= tcp_v4_md5_lookup,
1866 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1867 	.md5_parse		= tcp_v4_parse_md5_keys,
1868 };
1869 #endif
1870 
1871 /* NOTE: A lot of fields are set to zero explicitly by the call to
1872  *       sk_alloc(), so they need not be initialized here.
1873  */
1874 static int tcp_v4_init_sock(struct sock *sk)
1875 {
1876 	struct inet_connection_sock *icsk = inet_csk(sk);
1877 	struct tcp_sock *tp = tcp_sk(sk);
1878 
1879 	skb_queue_head_init(&tp->out_of_order_queue);
1880 	tcp_init_xmit_timers(sk);
1881 	tcp_prequeue_init(tp);
1882 
1883 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1884 	tp->mdev = TCP_TIMEOUT_INIT;
1885 
1886 	/* So many TCP implementations out there (incorrectly) count the
1887 	 * initial SYN frame in their delayed-ACK and congestion control
1888 	 * algorithms that we must have the following bandaid to talk
1889 	 * efficiently to them.  -DaveM
1890 	 */
1891 	tp->snd_cwnd = TCP_INIT_CWND;
1892 
1893 	/* See draft-stevens-tcpca-spec-01 for discussion of the
1894 	 * initialization of these values.
1895 	 */
1896 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1897 	tp->snd_cwnd_clamp = ~0;
1898 	tp->mss_cache = TCP_MSS_DEFAULT;
1899 
1900 	tp->reordering = sysctl_tcp_reordering;
1901 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1902 
1903 	sk->sk_state = TCP_CLOSE;
1904 
1905 	sk->sk_write_space = sk_stream_write_space;
1906 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1907 
1908 	icsk->icsk_af_ops = &ipv4_specific;
1909 	icsk->icsk_sync_mss = tcp_sync_mss;
1910 #ifdef CONFIG_TCP_MD5SIG
1911 	tp->af_specific = &tcp_sock_ipv4_specific;
1912 #endif
1913 
1914 	/* TCP Cookie Transactions */
1915 	if (sysctl_tcp_cookie_size > 0) {
1916 		/* Default, cookies without s_data_payload. */
1917 		tp->cookie_values =
1918 			kzalloc(sizeof(*tp->cookie_values),
1919 				sk->sk_allocation);
1920 		if (tp->cookie_values != NULL)
1921 			kref_init(&tp->cookie_values->kref);
1922 	}
1923 	/* Presumed zeroed, in order of appearance:
1924 	 *	cookie_in_always, cookie_out_never,
1925 	 *	s_data_constant, s_data_in, s_data_out
1926 	 */
1927 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1928 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1929 
1930 	local_bh_disable();
1931 	sock_update_memcg(sk);
1932 	sk_sockets_allocated_inc(sk);
1933 	local_bh_enable();
1934 
1935 	return 0;
1936 }
1937 
1938 void tcp_v4_destroy_sock(struct sock *sk)
1939 {
1940 	struct tcp_sock *tp = tcp_sk(sk);
1941 
1942 	tcp_clear_xmit_timers(sk);
1943 
1944 	tcp_cleanup_congestion_control(sk);
1945 
1946 	/* Clean up the write buffer. */
1947 	tcp_write_queue_purge(sk);
1948 
1949 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1950 	__skb_queue_purge(&tp->out_of_order_queue);
1951 
1952 #ifdef CONFIG_TCP_MD5SIG
1953 	/* Clean up the MD5 key list, if any */
1954 	if (tp->md5sig_info) {
1955 		tcp_clear_md5_list(sk);
1956 		kfree_rcu(tp->md5sig_info, rcu);
1957 		tp->md5sig_info = NULL;
1958 	}
1959 #endif
1960 
1961 #ifdef CONFIG_NET_DMA
1962 	/* Cleans up our sk_async_wait_queue */
1963 	__skb_queue_purge(&sk->sk_async_wait_queue);
1964 #endif
1965 
1966 	/* Clean up the prequeue; it should already be empty. */
1967 	__skb_queue_purge(&tp->ucopy.prequeue);
1968 
1969 	/* Clean up a referenced TCP bind bucket. */
1970 	if (inet_csk(sk)->icsk_bind_hash)
1971 		inet_put_port(sk);
1972 
1973 	/*
1974 	 * If a cached sendmsg page exists, free it.
1975 	 */
1976 	if (sk->sk_sndmsg_page) {
1977 		__free_page(sk->sk_sndmsg_page);
1978 		sk->sk_sndmsg_page = NULL;
1979 	}
1980 
1981 	/* TCP Cookie Transactions */
1982 	if (tp->cookie_values != NULL) {
1983 		kref_put(&tp->cookie_values->kref,
1984 			 tcp_cookie_values_release);
1985 		tp->cookie_values = NULL;
1986 	}
1987 
1988 	sk_sockets_allocated_dec(sk);
1989 	sock_release_memcg(sk);
1990 }
1991 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1992 
1993 #ifdef CONFIG_PROC_FS
1994 /* Proc filesystem TCP sock list dumping. */
1995 
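/* The established hash chains are "nulls" lists: the terminating pointer
 * encodes a chain marker instead of being a plain NULL (needed because the
 * slab uses SLAB_DESTROY_BY_RCU), which is why the helpers below use the
 * hlist_nulls primitives and is_a_nulls() rather than comparing with NULL.
 */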
1996 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1997 {
1998 	return hlist_nulls_empty(head) ? NULL :
1999 		list_entry(head->first, struct inet_timewait_sock, tw_node);
2000 }
2001 
2002 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
2003 {
2004 	return !is_a_nulls(tw->tw_node.next) ?
2005 		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
2006 }
2007 
2008 /*
2009  * Get the next listening socket following cur.  If cur is NULL, get the
2010  * first socket starting from the bucket given in st->bucket; when
2011  * st->bucket is zero, the very first socket in the hash table is returned.
2012  */
2013 static void *listening_get_next(struct seq_file *seq, void *cur)
2014 {
2015 	struct inet_connection_sock *icsk;
2016 	struct hlist_nulls_node *node;
2017 	struct sock *sk = cur;
2018 	struct inet_listen_hashbucket *ilb;
2019 	struct tcp_iter_state *st = seq->private;
2020 	struct net *net = seq_file_net(seq);
2021 
2022 	if (!sk) {
2023 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2024 		spin_lock_bh(&ilb->lock);
2025 		sk = sk_nulls_head(&ilb->head);
2026 		st->offset = 0;
2027 		goto get_sk;
2028 	}
2029 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
2030 	++st->num;
2031 	++st->offset;
2032 
2033 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
2034 		struct request_sock *req = cur;
2035 
2036 		icsk = inet_csk(st->syn_wait_sk);
2037 		req = req->dl_next;
2038 		while (1) {
2039 			while (req) {
2040 				if (req->rsk_ops->family == st->family) {
2041 					cur = req;
2042 					goto out;
2043 				}
2044 				req = req->dl_next;
2045 			}
2046 			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
2047 				break;
2048 get_req:
2049 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2050 		}
2051 		sk	  = sk_nulls_next(st->syn_wait_sk);
2052 		st->state = TCP_SEQ_STATE_LISTENING;
2053 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2054 	} else {
2055 		icsk = inet_csk(sk);
2056 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2057 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
2058 			goto start_req;
2059 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2060 		sk = sk_nulls_next(sk);
2061 	}
2062 get_sk:
2063 	sk_nulls_for_each_from(sk, node) {
2064 		if (!net_eq(sock_net(sk), net))
2065 			continue;
2066 		if (sk->sk_family == st->family) {
2067 			cur = sk;
2068 			goto out;
2069 		}
2070 		icsk = inet_csk(sk);
2071 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2072 		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2073 start_req:
2074 			st->uid		= sock_i_uid(sk);
2075 			st->syn_wait_sk = sk;
2076 			st->state	= TCP_SEQ_STATE_OPENREQ;
2077 			st->sbucket	= 0;
2078 			goto get_req;
2079 		}
2080 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2081 	}
2082 	spin_unlock_bh(&ilb->lock);
2083 	st->offset = 0;
2084 	if (++st->bucket < INET_LHTABLE_SIZE) {
2085 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2086 		spin_lock_bh(&ilb->lock);
2087 		sk = sk_nulls_head(&ilb->head);
2088 		goto get_sk;
2089 	}
2090 	cur = NULL;
2091 out:
2092 	return cur;
2093 }
2094 
2095 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2096 {
2097 	struct tcp_iter_state *st = seq->private;
2098 	void *rc;
2099 
2100 	st->bucket = 0;
2101 	st->offset = 0;
2102 	rc = listening_get_next(seq, NULL);
2103 
2104 	while (rc && *pos) {
2105 		rc = listening_get_next(seq, rc);
2106 		--*pos;
2107 	}
2108 	return rc;
2109 }
2110 
2111 static inline int empty_bucket(struct tcp_iter_state *st)
2112 {
2113 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2114 		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2115 }
2116 
2117 /*
2118  * Get first established socket starting from bucket given in st->bucket.
2119  * If st->bucket is zero, the very first socket in the hash is returned.
2120  */
2121 static void *established_get_first(struct seq_file *seq)
2122 {
2123 	struct tcp_iter_state *st = seq->private;
2124 	struct net *net = seq_file_net(seq);
2125 	void *rc = NULL;
2126 
2127 	st->offset = 0;
2128 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2129 		struct sock *sk;
2130 		struct hlist_nulls_node *node;
2131 		struct inet_timewait_sock *tw;
2132 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2133 
2134 		/* Lockless fast path for the common case of empty buckets */
2135 		if (empty_bucket(st))
2136 			continue;
2137 
2138 		spin_lock_bh(lock);
2139 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2140 			if (sk->sk_family != st->family ||
2141 			    !net_eq(sock_net(sk), net)) {
2142 				continue;
2143 			}
2144 			rc = sk;
2145 			goto out;
2146 		}
2147 		st->state = TCP_SEQ_STATE_TIME_WAIT;
2148 		inet_twsk_for_each(tw, node,
2149 				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2150 			if (tw->tw_family != st->family ||
2151 			    !net_eq(twsk_net(tw), net)) {
2152 				continue;
2153 			}
2154 			rc = tw;
2155 			goto out;
2156 		}
2157 		spin_unlock_bh(lock);
2158 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2159 	}
2160 out:
2161 	return rc;
2162 }
2163 
2164 static void *established_get_next(struct seq_file *seq, void *cur)
2165 {
2166 	struct sock *sk = cur;
2167 	struct inet_timewait_sock *tw;
2168 	struct hlist_nulls_node *node;
2169 	struct tcp_iter_state *st = seq->private;
2170 	struct net *net = seq_file_net(seq);
2171 
2172 	++st->num;
2173 	++st->offset;
2174 
2175 	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2176 		tw = cur;
2177 		tw = tw_next(tw);
2178 get_tw:
2179 		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2180 			tw = tw_next(tw);
2181 		}
2182 		if (tw) {
2183 			cur = tw;
2184 			goto out;
2185 		}
2186 		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2187 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2188 
2189 		/* Look for the next non-empty bucket */
2190 		st->offset = 0;
2191 		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2192 				empty_bucket(st))
2193 			;
2194 		if (st->bucket > tcp_hashinfo.ehash_mask)
2195 			return NULL;
2196 
2197 		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2198 		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2199 	} else
2200 		sk = sk_nulls_next(sk);
2201 
2202 	sk_nulls_for_each_from(sk, node) {
2203 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2204 			goto found;
2205 	}
2206 
2207 	st->state = TCP_SEQ_STATE_TIME_WAIT;
2208 	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2209 	goto get_tw;
2210 found:
2211 	cur = sk;
2212 out:
2213 	return cur;
2214 }
2215 
2216 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2217 {
2218 	struct tcp_iter_state *st = seq->private;
2219 	void *rc;
2220 
2221 	st->bucket = 0;
2222 	rc = established_get_first(seq);
2223 
2224 	while (rc && pos) {
2225 		rc = established_get_next(seq, rc);
2226 		--pos;
2227 	}
2228 	return rc;
2229 }
2230 
2231 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2232 {
2233 	void *rc;
2234 	struct tcp_iter_state *st = seq->private;
2235 
2236 	st->state = TCP_SEQ_STATE_LISTENING;
2237 	rc	  = listening_get_idx(seq, &pos);
2238 
2239 	if (!rc) {
2240 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2241 		rc	  = established_get_idx(seq, pos);
2242 	}
2243 
2244 	return rc;
2245 }
2246 
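/* tcp_seq_start() records the position of the previous read in
 * st->last_pos, so that a /proc/net/tcp dump read in several chunks can
 * resume from the saved bucket/offset via tcp_seek_last_pos() below,
 * instead of re-walking the hash tables from the start on every read().
 */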
2247 static void *tcp_seek_last_pos(struct seq_file *seq)
2248 {
2249 	struct tcp_iter_state *st = seq->private;
2250 	int offset = st->offset;
2251 	int orig_num = st->num;
2252 	void *rc = NULL;
2253 
2254 	switch (st->state) {
2255 	case TCP_SEQ_STATE_OPENREQ:
2256 	case TCP_SEQ_STATE_LISTENING:
2257 		if (st->bucket >= INET_LHTABLE_SIZE)
2258 			break;
2259 		st->state = TCP_SEQ_STATE_LISTENING;
2260 		rc = listening_get_next(seq, NULL);
2261 		while (offset-- && rc)
2262 			rc = listening_get_next(seq, rc);
2263 		if (rc)
2264 			break;
2265 		st->bucket = 0;
2266 		/* Fallthrough */
2267 	case TCP_SEQ_STATE_ESTABLISHED:
2268 	case TCP_SEQ_STATE_TIME_WAIT:
2269 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2270 		if (st->bucket > tcp_hashinfo.ehash_mask)
2271 			break;
2272 		rc = established_get_first(seq);
2273 		while (offset-- && rc)
2274 			rc = established_get_next(seq, rc);
2275 	}
2276 
2277 	st->num = orig_num;
2278 
2279 	return rc;
2280 }
2281 
2282 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2283 {
2284 	struct tcp_iter_state *st = seq->private;
2285 	void *rc;
2286 
2287 	if (*pos && *pos == st->last_pos) {
2288 		rc = tcp_seek_last_pos(seq);
2289 		if (rc)
2290 			goto out;
2291 	}
2292 
2293 	st->state = TCP_SEQ_STATE_LISTENING;
2294 	st->num = 0;
2295 	st->bucket = 0;
2296 	st->offset = 0;
2297 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2298 
2299 out:
2300 	st->last_pos = *pos;
2301 	return rc;
2302 }
2303 
2304 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2305 {
2306 	struct tcp_iter_state *st = seq->private;
2307 	void *rc = NULL;
2308 
2309 	if (v == SEQ_START_TOKEN) {
2310 		rc = tcp_get_idx(seq, 0);
2311 		goto out;
2312 	}
2313 
2314 	switch (st->state) {
2315 	case TCP_SEQ_STATE_OPENREQ:
2316 	case TCP_SEQ_STATE_LISTENING:
2317 		rc = listening_get_next(seq, v);
2318 		if (!rc) {
2319 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2320 			st->bucket = 0;
2321 			st->offset = 0;
2322 			rc	  = established_get_first(seq);
2323 		}
2324 		break;
2325 	case TCP_SEQ_STATE_ESTABLISHED:
2326 	case TCP_SEQ_STATE_TIME_WAIT:
2327 		rc = established_get_next(seq, v);
2328 		break;
2329 	}
2330 out:
2331 	++*pos;
2332 	st->last_pos = *pos;
2333 	return rc;
2334 }
2335 
2336 static void tcp_seq_stop(struct seq_file *seq, void *v)
2337 {
2338 	struct tcp_iter_state *st = seq->private;
2339 
2340 	switch (st->state) {
2341 	case TCP_SEQ_STATE_OPENREQ:
2342 		if (v) {
2343 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2344 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2345 		}
2346 	case TCP_SEQ_STATE_LISTENING:
2347 		if (v != SEQ_START_TOKEN)
2348 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2349 		break;
2350 	case TCP_SEQ_STATE_TIME_WAIT:
2351 	case TCP_SEQ_STATE_ESTABLISHED:
2352 		if (v)
2353 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2354 		break;
2355 	}
2356 }
2357 
2358 int tcp_seq_open(struct inode *inode, struct file *file)
2359 {
2360 	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2361 	struct tcp_iter_state *s;
2362 	int err;
2363 
2364 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2365 			  sizeof(struct tcp_iter_state));
2366 	if (err < 0)
2367 		return err;
2368 
2369 	s = ((struct seq_file *)file->private_data)->private;
2370 	s->family		= afinfo->family;
2371 	s->last_pos 		= 0;
2372 	return 0;
2373 }
2374 EXPORT_SYMBOL(tcp_seq_open);
2375 
2376 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2377 {
2378 	int rc = 0;
2379 	struct proc_dir_entry *p;
2380 
2381 	afinfo->seq_ops.start		= tcp_seq_start;
2382 	afinfo->seq_ops.next		= tcp_seq_next;
2383 	afinfo->seq_ops.stop		= tcp_seq_stop;
2384 
2385 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2386 			     afinfo->seq_fops, afinfo);
2387 	if (!p)
2388 		rc = -ENOMEM;
2389 	return rc;
2390 }
2391 EXPORT_SYMBOL(tcp_proc_register);
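/* tcp_proc_register() plugs the generic iterator above into a per-netns
 * /proc/net entry.  The IPv4 instance is tcp4_seq_afinfo further below,
 * registered for every network namespace from tcp4_proc_init_net().
 */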
2392 
2393 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2394 {
2395 	proc_net_remove(net, afinfo->name);
2396 }
2397 EXPORT_SYMBOL(tcp_proc_unregister);
2398 
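/* The three helpers below each format one line of /proc/net/tcp: for a
 * pending open request, a listening/established socket and a TIME_WAIT
 * socket respectively.  The columns correspond to the header printed in
 * tcp4_seq_show() (sl, local_address, rem_address, st, tx_queue, rx_queue,
 * tr, tm->when, retrnsmt, uid, timeout, inode), followed by the reference
 * count, the socket pointer and, for full sockets, extra timer and
 * congestion-control fields.
 */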
2399 static void get_openreq4(const struct sock *sk, const struct request_sock *req,
2400 			 struct seq_file *f, int i, int uid, int *len)
2401 {
2402 	const struct inet_request_sock *ireq = inet_rsk(req);
2403 	int ttd = req->expires - jiffies;
2404 
2405 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2406 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %pK%n",
2407 		i,
2408 		ireq->loc_addr,
2409 		ntohs(inet_sk(sk)->inet_sport),
2410 		ireq->rmt_addr,
2411 		ntohs(ireq->rmt_port),
2412 		TCP_SYN_RECV,
2413 		0, 0, /* could print option size, but that is af dependent. */
2414 		1,    /* timers active (only the expire timer) */
2415 		jiffies_to_clock_t(ttd),
2416 		req->retrans,
2417 		uid,
2418 		0,  /* non standard timer */
2419 		0, /* open_requests have no inode */
2420 		atomic_read(&sk->sk_refcnt),
2421 		req,
2422 		len);
2423 }
2424 
2425 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2426 {
2427 	int timer_active;
2428 	unsigned long timer_expires;
2429 	const struct tcp_sock *tp = tcp_sk(sk);
2430 	const struct inet_connection_sock *icsk = inet_csk(sk);
2431 	const struct inet_sock *inet = inet_sk(sk);
2432 	__be32 dest = inet->inet_daddr;
2433 	__be32 src = inet->inet_rcv_saddr;
2434 	__u16 destp = ntohs(inet->inet_dport);
2435 	__u16 srcp = ntohs(inet->inet_sport);
2436 	int rx_queue;
2437 
2438 	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2439 		timer_active	= 1;
2440 		timer_expires	= icsk->icsk_timeout;
2441 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2442 		timer_active	= 4;
2443 		timer_expires	= icsk->icsk_timeout;
2444 	} else if (timer_pending(&sk->sk_timer)) {
2445 		timer_active	= 2;
2446 		timer_expires	= sk->sk_timer.expires;
2447 	} else {
2448 		timer_active	= 0;
2449 		timer_expires = jiffies;
2450 	}
2451 
2452 	if (sk->sk_state == TCP_LISTEN)
2453 		rx_queue = sk->sk_ack_backlog;
2454 	else
2455 		/*
2456 		 * because we don't lock the socket, we might find a transient negative value
2457 		 */
2458 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2459 
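	/* tx_queue is the amount of unacknowledged data (write_seq - snd_una);
	 * rx_queue is data received but not yet read by the application
	 * (rcv_nxt - copied_seq), or the accept backlog length for a
	 * listening socket.
	 */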
2460 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2461 			"%08X %5d %8d %lu %d %pK %lu %lu %u %u %d%n",
2462 		i, src, srcp, dest, destp, sk->sk_state,
2463 		tp->write_seq - tp->snd_una,
2464 		rx_queue,
2465 		timer_active,
2466 		jiffies_to_clock_t(timer_expires - jiffies),
2467 		icsk->icsk_retransmits,
2468 		sock_i_uid(sk),
2469 		icsk->icsk_probes_out,
2470 		sock_i_ino(sk),
2471 		atomic_read(&sk->sk_refcnt), sk,
2472 		jiffies_to_clock_t(icsk->icsk_rto),
2473 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2474 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2475 		tp->snd_cwnd,
2476 		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2477 		len);
2478 }
2479 
2480 static void get_timewait4_sock(const struct inet_timewait_sock *tw,
2481 			       struct seq_file *f, int i, int *len)
2482 {
2483 	__be32 dest, src;
2484 	__u16 destp, srcp;
2485 	int ttd = tw->tw_ttd - jiffies;
2486 
2487 	if (ttd < 0)
2488 		ttd = 0;
2489 
2490 	dest  = tw->tw_daddr;
2491 	src   = tw->tw_rcv_saddr;
2492 	destp = ntohs(tw->tw_dport);
2493 	srcp  = ntohs(tw->tw_sport);
2494 
2495 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2496 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK%n",
2497 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2498 		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2499 		atomic_read(&tw->tw_refcnt), tw, len);
2500 }
2501 
2502 #define TMPSZ 150
2503 
2504 static int tcp4_seq_show(struct seq_file *seq, void *v)
2505 {
2506 	struct tcp_iter_state *st;
2507 	int len;
2508 
2509 	if (v == SEQ_START_TOKEN) {
2510 		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2511 			   "  sl  local_address rem_address   st tx_queue "
2512 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2513 			   "inode");
2514 		goto out;
2515 	}
2516 	st = seq->private;
2517 
2518 	switch (st->state) {
2519 	case TCP_SEQ_STATE_LISTENING:
2520 	case TCP_SEQ_STATE_ESTABLISHED:
2521 		get_tcp4_sock(v, seq, st->num, &len);
2522 		break;
2523 	case TCP_SEQ_STATE_OPENREQ:
2524 		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2525 		break;
2526 	case TCP_SEQ_STATE_TIME_WAIT:
2527 		get_timewait4_sock(v, seq, st->num, &len);
2528 		break;
2529 	}
2530 	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2531 out:
2532 	return 0;
2533 }
2534 
2535 static const struct file_operations tcp_afinfo_seq_fops = {
2536 	.owner   = THIS_MODULE,
2537 	.open    = tcp_seq_open,
2538 	.read    = seq_read,
2539 	.llseek  = seq_lseek,
2540 	.release = seq_release_net
2541 };
2542 
2543 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2544 	.name		= "tcp",
2545 	.family		= AF_INET,
2546 	.seq_fops	= &tcp_afinfo_seq_fops,
2547 	.seq_ops	= {
2548 		.show		= tcp4_seq_show,
2549 	},
2550 };
2551 
2552 static int __net_init tcp4_proc_init_net(struct net *net)
2553 {
2554 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2555 }
2556 
2557 static void __net_exit tcp4_proc_exit_net(struct net *net)
2558 {
2559 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2560 }
2561 
2562 static struct pernet_operations tcp4_net_ops = {
2563 	.init = tcp4_proc_init_net,
2564 	.exit = tcp4_proc_exit_net,
2565 };
2566 
2567 int __init tcp4_proc_init(void)
2568 {
2569 	return register_pernet_subsys(&tcp4_net_ops);
2570 }
2571 
2572 void tcp4_proc_exit(void)
2573 {
2574 	unregister_pernet_subsys(&tcp4_net_ops);
2575 }
2576 #endif /* CONFIG_PROC_FS */
2577 
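/* GRO receive for IPv4 TCP: segments arriving with CHECKSUM_COMPLETE are
 * verified against the pseudo-header here; a failed check, or a packet
 * with CHECKSUM_NONE, sets the flush flag so the segment is not merged.
 * Already-verified packets go straight to the generic tcp_gro_receive().
 */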
2578 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2579 {
2580 	const struct iphdr *iph = skb_gro_network_header(skb);
2581 
2582 	switch (skb->ip_summed) {
2583 	case CHECKSUM_COMPLETE:
2584 		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2585 				  skb->csum)) {
2586 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2587 			break;
2588 		}
2589 
2590 		/* fall through */
2591 	case CHECKSUM_NONE:
2592 		NAPI_GRO_CB(skb)->flush = 1;
2593 		return NULL;
2594 	}
2595 
2596 	return tcp_gro_receive(head, skb);
2597 }
2598 
2599 int tcp4_gro_complete(struct sk_buff *skb)
2600 {
2601 	const struct iphdr *iph = ip_hdr(skb);
2602 	struct tcphdr *th = tcp_hdr(skb);
2603 
2604 	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2605 				  iph->saddr, iph->daddr, 0);
2606 	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2607 
2608 	return tcp_gro_complete(skb);
2609 }
2610 
2611 struct proto tcp_prot = {
2612 	.name			= "TCP",
2613 	.owner			= THIS_MODULE,
2614 	.close			= tcp_close,
2615 	.connect		= tcp_v4_connect,
2616 	.disconnect		= tcp_disconnect,
2617 	.accept			= inet_csk_accept,
2618 	.ioctl			= tcp_ioctl,
2619 	.init			= tcp_v4_init_sock,
2620 	.destroy		= tcp_v4_destroy_sock,
2621 	.shutdown		= tcp_shutdown,
2622 	.setsockopt		= tcp_setsockopt,
2623 	.getsockopt		= tcp_getsockopt,
2624 	.recvmsg		= tcp_recvmsg,
2625 	.sendmsg		= tcp_sendmsg,
2626 	.sendpage		= tcp_sendpage,
2627 	.backlog_rcv		= tcp_v4_do_rcv,
2628 	.hash			= inet_hash,
2629 	.unhash			= inet_unhash,
2630 	.get_port		= inet_csk_get_port,
2631 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2632 	.sockets_allocated	= &tcp_sockets_allocated,
2633 	.orphan_count		= &tcp_orphan_count,
2634 	.memory_allocated	= &tcp_memory_allocated,
2635 	.memory_pressure	= &tcp_memory_pressure,
2636 	.sysctl_wmem		= sysctl_tcp_wmem,
2637 	.sysctl_rmem		= sysctl_tcp_rmem,
2638 	.max_header		= MAX_TCP_HEADER,
2639 	.obj_size		= sizeof(struct tcp_sock),
2640 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2641 	.twsk_prot		= &tcp_timewait_sock_ops,
2642 	.rsk_prot		= &tcp_request_sock_ops,
2643 	.h.hashinfo		= &tcp_hashinfo,
2644 	.no_autobind		= true,
2645 #ifdef CONFIG_COMPAT
2646 	.compat_setsockopt	= compat_tcp_setsockopt,
2647 	.compat_getsockopt	= compat_tcp_getsockopt,
2648 #endif
2649 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
2650 	.init_cgroup		= tcp_init_cgroup,
2651 	.destroy_cgroup		= tcp_destroy_cgroup,
2652 	.proto_cgroup		= tcp_proto_cgroup,
2653 #endif
2654 };
2655 EXPORT_SYMBOL(tcp_prot);
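/* tcp_prot is the IPv4 struct proto instance: the socket layer dispatches
 * protocol-level operations (connect, sendmsg, recvmsg, close, ...) through
 * this table.  It is registered with the core networking code from the
 * af_inet initialisation path at boot.
 */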
2656 
2657 static int __net_init tcp_sk_init(struct net *net)
2658 {
2659 	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2660 				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2661 }
2662 
2663 static void __net_exit tcp_sk_exit(struct net *net)
2664 {
2665 	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2666 }
2667 
2668 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2669 {
2670 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2671 }
2672 
2673 static struct pernet_operations __net_initdata tcp_sk_ops = {
2674        .init	   = tcp_sk_init,
2675        .exit	   = tcp_sk_exit,
2676        .exit_batch = tcp_sk_exit_batch,
2677 };
2678 
2679 void __init tcp_v4_init(void)
2680 {
2681 	inet_hashinfo_init(&tcp_hashinfo);
2682 	if (register_pernet_subsys(&tcp_sk_ops))
2683 		panic("Failed to create the TCP control socket.\n");
2684 }
2685