1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  *		IPv4 specific functions
9  *
10  *
11  *		code split from:
12  *		linux/ipv4/tcp.c
13  *		linux/ipv4/tcp_input.c
14  *		linux/ipv4/tcp_output.c
15  *
16  *		See tcp.c for author information
17  *
18  *	This program is free software; you can redistribute it and/or
19  *      modify it under the terms of the GNU General Public License
20  *      as published by the Free Software Foundation; either version
21  *      2 of the License, or (at your option) any later version.
22  */
23 
24 /*
25  * Changes:
26  *		David S. Miller	:	New socket lookup architecture.
27  *					This code is dedicated to John Dyson.
28  *		David S. Miller :	Change semantics of established hash,
29  *					half is devoted to TIME_WAIT sockets
30  *					and the rest go in the other half.
31  *		Andi Kleen :		Add support for syncookies and fixed
32  *					some bugs: ip options weren't passed to
33  *					the TCP layer, missed a check for an
34  *					ACK bit.
35  *		Andi Kleen :		Implemented fast path mtu discovery.
36  *	     				Fixed many serious bugs in the
37  *					request_sock handling and moved
38  *					most of it into the af independent code.
39  *					Added tail drop and some other bugfixes.
40  *					Added new listen semantics.
41  *		Mike McLagan	:	Routing by source
42  *	Juan Jose Ciarlante:		ip_dynaddr bits
43  *		Andi Kleen:		various fixes.
44  *	Vitaly E. Lavrov	:	Transparent proxy revived after year
45  *					coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allows both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 
54 #include <linux/bottom_half.h>
55 #include <linux/types.h>
56 #include <linux/fcntl.h>
57 #include <linux/module.h>
58 #include <linux/random.h>
59 #include <linux/cache.h>
60 #include <linux/jhash.h>
61 #include <linux/init.h>
62 #include <linux/times.h>
63 #include <linux/slab.h>
64 
65 #include <net/net_namespace.h>
66 #include <net/icmp.h>
67 #include <net/inet_hashtables.h>
68 #include <net/tcp.h>
69 #include <net/transp_v6.h>
70 #include <net/ipv6.h>
71 #include <net/inet_common.h>
72 #include <net/timewait_sock.h>
73 #include <net/xfrm.h>
74 #include <net/netdma.h>
75 
76 #include <linux/inet.h>
77 #include <linux/ipv6.h>
78 #include <linux/stddef.h>
79 #include <linux/proc_fs.h>
80 #include <linux/seq_file.h>
81 
82 #include <linux/crypto.h>
83 #include <linux/scatterlist.h>
84 
85 int sysctl_tcp_tw_reuse __read_mostly;
86 int sysctl_tcp_low_latency __read_mostly;
87 EXPORT_SYMBOL(sysctl_tcp_low_latency);
88 
89 
90 #ifdef CONFIG_TCP_MD5SIG
91 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
92 						   __be32 addr);
93 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
94 			       __be32 daddr, __be32 saddr, struct tcphdr *th);
95 #else
96 static inline
97 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
98 {
99 	return NULL;
100 }
101 #endif
102 
103 struct inet_hashinfo tcp_hashinfo;
104 EXPORT_SYMBOL(tcp_hashinfo);
105 
106 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
107 {
108 	return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
109 					  ip_hdr(skb)->saddr,
110 					  tcp_hdr(skb)->dest,
111 					  tcp_hdr(skb)->source);
112 }
113 
114 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
115 {
116 	const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
117 	struct tcp_sock *tp = tcp_sk(sk);
118 
119 	/* With PAWS, it is safe from the viewpoint
120 	   of data integrity. Even without PAWS it is safe provided sequence
121 	   spaces do not overlap, i.e. at data rates <= 80 Mbit/sec.
122 
123 	   Actually, the idea is close to VJ's one, only the timestamp cache is
124 	   held not per host but per port pair, and the TW bucket is used as
125 	   the state holder.
126 
127 	   If the TW bucket has already been destroyed we fall back to VJ's
128 	   scheme and use the initial timestamp retrieved from the peer table.
129 	 */
130 	if (tcptw->tw_ts_recent_stamp &&
131 	    (twp == NULL || (sysctl_tcp_tw_reuse &&
132 			     get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
133 		tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
134 		if (tp->write_seq == 0)
135 			tp->write_seq = 1;
136 		tp->rx_opt.ts_recent	   = tcptw->tw_ts_recent;
137 		tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
138 		sock_hold(sktw);
139 		return 1;
140 	}
141 
142 	return 0;
143 }
144 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
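/* Illustrative sketch (not part of the logic above, all numbers made up):
 * the reuse test in tcp_twsk_unique() lets a new connection take over a
 * TIME-WAIT port pair only when the old incarnation's timestamps are known,
 * and it then starts the new sequence space beyond anything the old one
 * could have used:
 *
 *	u32 tw_snd_nxt = 1000000;                // old connection's snd_nxt
 *	u32 write_seq  = tw_snd_nxt + 65535 + 2; // = 1065537, clears the old
 *	                                         //   64K window plus SYN/FIN
 *	if (write_seq == 0)                      // 0 means "pick a fresh ISN
 *		write_seq = 1;                   //   later", so avoid it
 *
 * Only the arithmetic mirrors the code above; the values are hypothetical.
 */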
145 
146 /* This will initiate an outgoing connection. */
147 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
148 {
149 	struct inet_sock *inet = inet_sk(sk);
150 	struct tcp_sock *tp = tcp_sk(sk);
151 	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
152 	__be16 orig_sport, orig_dport;
153 	struct rtable *rt;
154 	__be32 daddr, nexthop;
155 	int err;
156 
157 	if (addr_len < sizeof(struct sockaddr_in))
158 		return -EINVAL;
159 
160 	if (usin->sin_family != AF_INET)
161 		return -EAFNOSUPPORT;
162 
163 	nexthop = daddr = usin->sin_addr.s_addr;
164 	if (inet->opt && inet->opt->srr) {
165 		if (!daddr)
166 			return -EINVAL;
167 		nexthop = inet->opt->faddr;
168 	}
169 
170 	orig_sport = inet->inet_sport;
171 	orig_dport = usin->sin_port;
172 	rt = ip_route_connect(nexthop, inet->inet_saddr,
173 			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
174 			      IPPROTO_TCP,
175 			      orig_sport, orig_dport, sk, true);
176 	if (IS_ERR(rt)) {
177 		err = PTR_ERR(rt);
178 		if (err == -ENETUNREACH)
179 			IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
180 		return err;
181 	}
182 
183 	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
184 		ip_rt_put(rt);
185 		return -ENETUNREACH;
186 	}
187 
188 	if (!inet->opt || !inet->opt->srr)
189 		daddr = rt->rt_dst;
190 
191 	if (!inet->inet_saddr)
192 		inet->inet_saddr = rt->rt_src;
193 	inet->inet_rcv_saddr = inet->inet_saddr;
194 
195 	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
196 		/* Reset inherited state */
197 		tp->rx_opt.ts_recent	   = 0;
198 		tp->rx_opt.ts_recent_stamp = 0;
199 		tp->write_seq		   = 0;
200 	}
201 
202 	if (tcp_death_row.sysctl_tw_recycle &&
203 	    !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
204 		struct inet_peer *peer = rt_get_peer(rt);
205 		/*
206 		 * VJ's idea. We save the last timestamp seen from
207 		 * the destination in the peer table when entering state
208 		 * TIME-WAIT, and initialize rx_opt.ts_recent from it
209 		 * when trying a new connection.
210 		 */
211 		if (peer) {
212 			inet_peer_refcheck(peer);
213 			if ((u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
214 				tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
215 				tp->rx_opt.ts_recent = peer->tcp_ts;
216 			}
217 		}
218 	}
219 
220 	inet->inet_dport = usin->sin_port;
221 	inet->inet_daddr = daddr;
222 
223 	inet_csk(sk)->icsk_ext_hdr_len = 0;
224 	if (inet->opt)
225 		inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
226 
227 	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
228 
229 	/* Socket identity is still unknown (sport may be zero).
230 	 * However we set state to SYN-SENT and, without releasing the socket
231 	 * lock, select a source port, enter ourselves into the hash tables and
232 	 * complete initialization after this.
233 	 */
234 	tcp_set_state(sk, TCP_SYN_SENT);
235 	err = inet_hash_connect(&tcp_death_row, sk);
236 	if (err)
237 		goto failure;
238 
239 	rt = ip_route_newports(rt, IPPROTO_TCP,
240 			       orig_sport, orig_dport,
241 			       inet->inet_sport, inet->inet_dport, sk);
242 	if (IS_ERR(rt)) {
243 		err = PTR_ERR(rt);
244 		rt = NULL;
245 		goto failure;
246 	}
247 	/* OK, now commit destination to socket.  */
248 	sk->sk_gso_type = SKB_GSO_TCPV4;
249 	sk_setup_caps(sk, &rt->dst);
250 
251 	if (!tp->write_seq)
252 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
253 							   inet->inet_daddr,
254 							   inet->inet_sport,
255 							   usin->sin_port);
256 
257 	inet->inet_id = tp->write_seq ^ jiffies;
258 
259 	err = tcp_connect(sk);
260 	rt = NULL;
261 	if (err)
262 		goto failure;
263 
264 	return 0;
265 
266 failure:
267 	/*
268 	 * This unhashes the socket and releases the local port,
269 	 * if necessary.
270 	 */
271 	tcp_set_state(sk, TCP_CLOSE);
272 	ip_rt_put(rt);
273 	sk->sk_route_caps = 0;
274 	inet->inet_dport = 0;
275 	return err;
276 }
277 EXPORT_SYMBOL(tcp_v4_connect);
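/* A minimal user-space sketch of what ends up in tcp_v4_connect(): the
 * connect(2) below reaches this function via inet_stream_connect().  The
 * destination address and port are arbitrary example values.
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	struct sockaddr_in dst = {
 *		.sin_family = AF_INET,
 *		.sin_port   = htons(80),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);
 *	connect(fd, (struct sockaddr *)&dst, sizeof(dst));
 *
 * On success the socket is in SYN-SENT with a route, source address and
 * source port committed, and tcp_connect() has queued the SYN.
 */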
278 
279 /*
280  * This routine does path mtu discovery as defined in RFC1191.
281  */
282 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
283 {
284 	struct dst_entry *dst;
285 	struct inet_sock *inet = inet_sk(sk);
286 
287 	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
288 	 * sent out by Linux are always < 576 bytes, so they should go through
289 	 * unfragmented).
290 	 */
291 	if (sk->sk_state == TCP_LISTEN)
292 		return;
293 
294 	/* We don't check in the dst entry if pmtu discovery is forbidden
295 	 * on this route. We just assume that no packet-too-big packets
296 	 * are sent back when pmtu discovery is not active.
297 	 * There is a small race when the user changes this flag in the
298 	 * route, but I think that's acceptable.
299 	 */
300 	if ((dst = __sk_dst_check(sk, 0)) == NULL)
301 		return;
302 
303 	dst->ops->update_pmtu(dst, mtu);
304 
305 	/* Something is about to go wrong... Remember the soft error
306 	 * for the case that this connection will not be able to recover.
307 	 */
308 	if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
309 		sk->sk_err_soft = EMSGSIZE;
310 
311 	mtu = dst_mtu(dst);
312 
313 	if (inet->pmtudisc != IP_PMTUDISC_DONT &&
314 	    inet_csk(sk)->icsk_pmtu_cookie > mtu) {
315 		tcp_sync_mss(sk, mtu);
316 
317 		/* Resend the TCP packet because it's
318 		 * clear that the old packet has been
319 		 * dropped. This is the new "fast" path mtu
320 		 * discovery.
321 		 */
322 		tcp_simple_retransmit(sk);
323 	} /* else let the usual retransmit timer handle it */
324 }
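/* Worked example (hypothetical numbers): if an ICMP frag-needed reports
 * mtu = 1400 on a path that previously carried a 1500-byte MTU, then
 * tcp_sync_mss() shrinks the cached MSS roughly as
 *
 *	new_mss = 1400 - sizeof(struct iphdr) - sizeof(struct tcphdr)
 *	        = 1400 - 20 - 20 = 1360   (less any IP/TCP options in use)
 *
 * and tcp_simple_retransmit() resends the queued segments that no longer
 * fit, instead of waiting for the retransmit timer.
 */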
325 
326 /*
327  * This routine is called by the ICMP module when it gets some
328  * sort of error condition.  If err < 0 then the socket should
329  * be closed and the error returned to the user.  If err > 0
330  * it's just the icmp type << 8 | icmp code.  After adjustment
331  * header points to the first 8 bytes of the tcp header.  We need
332  * to find the appropriate port.
333  *
334  * The locking strategy used here is very "optimistic". When
335  * someone else accesses the socket the ICMP is just dropped
336  * and for some paths there is no check at all.
337  * A more general error queue to queue errors for later handling
338  * is probably better.
339  *
340  */
341 
342 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
343 {
344 	struct iphdr *iph = (struct iphdr *)icmp_skb->data;
345 	struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
346 	struct inet_connection_sock *icsk;
347 	struct tcp_sock *tp;
348 	struct inet_sock *inet;
349 	const int type = icmp_hdr(icmp_skb)->type;
350 	const int code = icmp_hdr(icmp_skb)->code;
351 	struct sock *sk;
352 	struct sk_buff *skb;
353 	__u32 seq;
354 	__u32 remaining;
355 	int err;
356 	struct net *net = dev_net(icmp_skb->dev);
357 
358 	if (icmp_skb->len < (iph->ihl << 2) + 8) {
359 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
360 		return;
361 	}
362 
363 	sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
364 			iph->saddr, th->source, inet_iif(icmp_skb));
365 	if (!sk) {
366 		ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
367 		return;
368 	}
369 	if (sk->sk_state == TCP_TIME_WAIT) {
370 		inet_twsk_put(inet_twsk(sk));
371 		return;
372 	}
373 
374 	bh_lock_sock(sk);
375 	/* If too many ICMPs get dropped on busy
376 	 * servers this needs to be solved differently.
377 	 */
378 	if (sock_owned_by_user(sk))
379 		NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
380 
381 	if (sk->sk_state == TCP_CLOSE)
382 		goto out;
383 
384 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
385 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
386 		goto out;
387 	}
388 
389 	icsk = inet_csk(sk);
390 	tp = tcp_sk(sk);
391 	seq = ntohl(th->seq);
392 	if (sk->sk_state != TCP_LISTEN &&
393 	    !between(seq, tp->snd_una, tp->snd_nxt)) {
394 		NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
395 		goto out;
396 	}
397 
398 	switch (type) {
399 	case ICMP_SOURCE_QUENCH:
400 		/* Just silently ignore these. */
401 		goto out;
402 	case ICMP_PARAMETERPROB:
403 		err = EPROTO;
404 		break;
405 	case ICMP_DEST_UNREACH:
406 		if (code > NR_ICMP_UNREACH)
407 			goto out;
408 
409 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
410 			if (!sock_owned_by_user(sk))
411 				do_pmtu_discovery(sk, iph, info);
412 			goto out;
413 		}
414 
415 		err = icmp_err_convert[code].errno;
416 		/* check if icmp_skb allows revert of backoff
417 		 * (see draft-zimmermann-tcp-lcd) */
418 		if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
419 			break;
420 		if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
421 		    !icsk->icsk_backoff)
422 			break;
423 
424 		if (sock_owned_by_user(sk))
425 			break;
426 
427 		icsk->icsk_backoff--;
428 		inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
429 					 icsk->icsk_backoff;
430 		tcp_bound_rto(sk);
431 
432 		skb = tcp_write_queue_head(sk);
433 		BUG_ON(!skb);
434 
435 		remaining = icsk->icsk_rto - min(icsk->icsk_rto,
436 				tcp_time_stamp - TCP_SKB_CB(skb)->when);
437 
438 		if (remaining) {
439 			inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
440 						  remaining, TCP_RTO_MAX);
441 		} else {
442 			/* RTO revert clocked out retransmission.
443 			 * Will retransmit now */
444 			tcp_retransmit_timer(sk);
445 		}
446 
447 		break;
448 	case ICMP_TIME_EXCEEDED:
449 		err = EHOSTUNREACH;
450 		break;
451 	default:
452 		goto out;
453 	}
454 
455 	switch (sk->sk_state) {
456 		struct request_sock *req, **prev;
457 	case TCP_LISTEN:
458 		if (sock_owned_by_user(sk))
459 			goto out;
460 
461 		req = inet_csk_search_req(sk, &prev, th->dest,
462 					  iph->daddr, iph->saddr);
463 		if (!req)
464 			goto out;
465 
466 		/* ICMPs are not backlogged, hence we cannot get
467 		   an established socket here.
468 		 */
469 		WARN_ON(req->sk);
470 
471 		if (seq != tcp_rsk(req)->snt_isn) {
472 			NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
473 			goto out;
474 		}
475 
476 		/*
477 		 * Still in SYN_RECV, just remove it silently.
478 		 * There is no good way to pass the error to the newly
479 		 * created socket, and POSIX does not want network
480 		 * errors returned from accept().
481 		 */
482 		inet_csk_reqsk_queue_drop(sk, req, prev);
483 		goto out;
484 
485 	case TCP_SYN_SENT:
486 	case TCP_SYN_RECV:  /* Cannot happen.
487 			       Well, it can, e.g. if SYNs crossed.
488 			     */
489 		if (!sock_owned_by_user(sk)) {
490 			sk->sk_err = err;
491 
492 			sk->sk_error_report(sk);
493 
494 			tcp_done(sk);
495 		} else {
496 			sk->sk_err_soft = err;
497 		}
498 		goto out;
499 	}
500 
501 	/* If we've already connected we will keep trying
502 	 * until we time out, or the user gives up.
503 	 *
504 	 * RFC 1122 4.2.3.9 allows considering as hard errors
505 	 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
506 	 * but it is obsoleted by pmtu discovery).
507 	 *
508 	 * Note that in the modern internet, where routing is unreliable
509 	 * and broken firewalls sit in every dark corner sending random
510 	 * errors ordered by their masters, even these two messages finally
511 	 * lose their original sense (even Linux sends invalid PORT_UNREACHs).
512 	 *
513 	 * Now we are in compliance with RFCs.
514 	 *							--ANK (980905)
515 	 */
516 
517 	inet = inet_sk(sk);
518 	if (!sock_owned_by_user(sk) && inet->recverr) {
519 		sk->sk_err = err;
520 		sk->sk_error_report(sk);
521 	} else	{ /* Only an error on timeout */
522 		sk->sk_err_soft = err;
523 	}
524 
525 out:
526 	bh_unlock_sock(sk);
527 	sock_put(sk);
528 }
529 
530 static void __tcp_v4_send_check(struct sk_buff *skb,
531 				__be32 saddr, __be32 daddr)
532 {
533 	struct tcphdr *th = tcp_hdr(skb);
534 
535 	if (skb->ip_summed == CHECKSUM_PARTIAL) {
536 		th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
537 		skb->csum_start = skb_transport_header(skb) - skb->head;
538 		skb->csum_offset = offsetof(struct tcphdr, check);
539 	} else {
540 		th->check = tcp_v4_check(skb->len, saddr, daddr,
541 					 csum_partial(th,
542 						      th->doff << 2,
543 						      skb->csum));
544 	}
545 }
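/* Sketch of the two cases above.  For CHECKSUM_PARTIAL the driver or
 * hardware is expected to finish the sum, so only the pseudo-header
 * portion is filled in:
 *
 *	th->check        = ~csum(saddr, daddr, IPPROTO_TCP, skb->len);
 *	skb->csum_start  = offset of the TCP header inside skb->head;
 *	skb->csum_offset = offsetof(struct tcphdr, check);   // = 16
 *
 * In the software case the header is summed with csum_partial(), combined
 * with the payload sum already accumulated in skb->csum, and folded into
 * th->check immediately.
 */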
546 
547 /* This routine computes an IPv4 TCP checksum. */
548 void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
549 {
550 	struct inet_sock *inet = inet_sk(sk);
551 
552 	__tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
553 }
554 EXPORT_SYMBOL(tcp_v4_send_check);
555 
556 int tcp_v4_gso_send_check(struct sk_buff *skb)
557 {
558 	const struct iphdr *iph;
559 	struct tcphdr *th;
560 
561 	if (!pskb_may_pull(skb, sizeof(*th)))
562 		return -EINVAL;
563 
564 	iph = ip_hdr(skb);
565 	th = tcp_hdr(skb);
566 
567 	th->check = 0;
568 	skb->ip_summed = CHECKSUM_PARTIAL;
569 	__tcp_v4_send_check(skb, iph->saddr, iph->daddr);
570 	return 0;
571 }
572 
573 /*
574  *	This routine will send an RST to the other tcp.
575  *
576  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
577  *		      for the reset?
578  *	Answer: if a packet caused an RST, it is not for a socket
579  *		existing in our system; if it is matched to a socket,
580  *		it is just a duplicate segment or a bug in the other
581  *		side's TCP. So we build the reply based only on
582  *		parameters that arrived with the segment.
583  *	Exception: precedence violation. We do not implement it in any case.
584  */
585 
586 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
587 {
588 	struct tcphdr *th = tcp_hdr(skb);
589 	struct {
590 		struct tcphdr th;
591 #ifdef CONFIG_TCP_MD5SIG
592 		__be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
593 #endif
594 	} rep;
595 	struct ip_reply_arg arg;
596 #ifdef CONFIG_TCP_MD5SIG
597 	struct tcp_md5sig_key *key;
598 #endif
599 	struct net *net;
600 
601 	/* Never send a reset in response to a reset. */
602 	if (th->rst)
603 		return;
604 
605 	if (skb_rtable(skb)->rt_type != RTN_LOCAL)
606 		return;
607 
608 	/* Swap the send and the receive. */
609 	memset(&rep, 0, sizeof(rep));
610 	rep.th.dest   = th->source;
611 	rep.th.source = th->dest;
612 	rep.th.doff   = sizeof(struct tcphdr) / 4;
613 	rep.th.rst    = 1;
614 
615 	if (th->ack) {
616 		rep.th.seq = th->ack_seq;
617 	} else {
618 		rep.th.ack = 1;
619 		rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
620 				       skb->len - (th->doff << 2));
621 	}
622 
623 	memset(&arg, 0, sizeof(arg));
624 	arg.iov[0].iov_base = (unsigned char *)&rep;
625 	arg.iov[0].iov_len  = sizeof(rep.th);
626 
627 #ifdef CONFIG_TCP_MD5SIG
628 	key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
629 	if (key) {
630 		rep.opt[0] = htonl((TCPOPT_NOP << 24) |
631 				   (TCPOPT_NOP << 16) |
632 				   (TCPOPT_MD5SIG << 8) |
633 				   TCPOLEN_MD5SIG);
634 		/* Update length and the length the header thinks exists */
635 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
636 		rep.th.doff = arg.iov[0].iov_len / 4;
637 
638 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
639 				     key, ip_hdr(skb)->saddr,
640 				     ip_hdr(skb)->daddr, &rep.th);
641 	}
642 #endif
643 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
644 				      ip_hdr(skb)->saddr, /* XXX */
645 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
646 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
647 	arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
648 
649 	net = dev_net(skb_dst(skb)->dev);
650 	ip_send_reply(net->ipv4.tcp_sock, skb,
651 		      &arg, arg.iov[0].iov_len);
652 
653 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
654 	TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
655 }
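/* Worked example for the sequence numbers chosen above (values are
 * hypothetical): suppose a stray data segment arrives with seq = 5000,
 * 100 bytes of payload, no SYN/FIN, and no ACK bit.  The reset then
 * carries
 *
 *	rep.th.ack     = 1;
 *	rep.th.ack_seq = htonl(5000 + 0 + 0 + 100);	// = htonl(5100)
 *	rep.th.seq     = 0;				// from the memset
 *
 * whereas if the offending segment had the ACK bit set, the reset simply
 * reuses its ack_seq as rep.th.seq, per RFC 793's rules for RST generation.
 */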
656 
657 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
658    outside socket context, is ugly, certainly. What can I do?
659  */
660 
661 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
662 			    u32 win, u32 ts, int oif,
663 			    struct tcp_md5sig_key *key,
664 			    int reply_flags)
665 {
666 	struct tcphdr *th = tcp_hdr(skb);
667 	struct {
668 		struct tcphdr th;
669 		__be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
670 #ifdef CONFIG_TCP_MD5SIG
671 			   + (TCPOLEN_MD5SIG_ALIGNED >> 2)
672 #endif
673 			];
674 	} rep;
675 	struct ip_reply_arg arg;
676 	struct net *net = dev_net(skb_dst(skb)->dev);
677 
678 	memset(&rep.th, 0, sizeof(struct tcphdr));
679 	memset(&arg, 0, sizeof(arg));
680 
681 	arg.iov[0].iov_base = (unsigned char *)&rep;
682 	arg.iov[0].iov_len  = sizeof(rep.th);
683 	if (ts) {
684 		rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
685 				   (TCPOPT_TIMESTAMP << 8) |
686 				   TCPOLEN_TIMESTAMP);
687 		rep.opt[1] = htonl(tcp_time_stamp);
688 		rep.opt[2] = htonl(ts);
689 		arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
690 	}
691 
692 	/* Swap the send and the receive. */
693 	rep.th.dest    = th->source;
694 	rep.th.source  = th->dest;
695 	rep.th.doff    = arg.iov[0].iov_len / 4;
696 	rep.th.seq     = htonl(seq);
697 	rep.th.ack_seq = htonl(ack);
698 	rep.th.ack     = 1;
699 	rep.th.window  = htons(win);
700 
701 #ifdef CONFIG_TCP_MD5SIG
702 	if (key) {
703 		int offset = (ts) ? 3 : 0;
704 
705 		rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
706 					  (TCPOPT_NOP << 16) |
707 					  (TCPOPT_MD5SIG << 8) |
708 					  TCPOLEN_MD5SIG);
709 		arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
710 		rep.th.doff = arg.iov[0].iov_len/4;
711 
712 		tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
713 				    key, ip_hdr(skb)->saddr,
714 				    ip_hdr(skb)->daddr, &rep.th);
715 	}
716 #endif
717 	arg.flags = reply_flags;
718 	arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
719 				      ip_hdr(skb)->saddr, /* XXX */
720 				      arg.iov[0].iov_len, IPPROTO_TCP, 0);
721 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
722 	if (oif)
723 		arg.bound_dev_if = oif;
724 
725 	ip_send_reply(net->ipv4.tcp_sock, skb,
726 		      &arg, arg.iov[0].iov_len);
727 
728 	TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
729 }
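/* Layout sketch of the option block built above when both a timestamp and
 * an MD5 signature are present (indices are 32-bit words of rep.opt[]):
 *
 *	opt[0]    NOP, NOP, TIMESTAMP (8), len 10   -> 12 bytes aligned
 *	opt[1]    TSval (tcp_time_stamp)
 *	opt[2]    TSecr (peer's ts)
 *	opt[3]    NOP, NOP, MD5SIG (19), len 18     -> 20 bytes aligned
 *	opt[4..7] 16-byte MD5 digest written by tcp_v4_md5_hash_hdr()
 *
 * which is why the MD5 option starts at word offset 3 when ts != 0 and at
 * offset 0 otherwise.
 */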
730 
731 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
732 {
733 	struct inet_timewait_sock *tw = inet_twsk(sk);
734 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
735 
736 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
737 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
738 			tcptw->tw_ts_recent,
739 			tw->tw_bound_dev_if,
740 			tcp_twsk_md5_key(tcptw),
741 			tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
742 			);
743 
744 	inet_twsk_put(tw);
745 }
746 
747 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
748 				  struct request_sock *req)
749 {
750 	tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
751 			tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
752 			req->ts_recent,
753 			0,
754 			tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
755 			inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
756 }
757 
758 /*
759  *	Send a SYN-ACK after having received a SYN.
760  *	This still operates on a request_sock only, not on a big
761  *	socket.
762  */
763 static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
764 			      struct request_sock *req,
765 			      struct request_values *rvp)
766 {
767 	const struct inet_request_sock *ireq = inet_rsk(req);
768 	int err = -1;
769 	struct sk_buff * skb;
770 
771 	/* First, grab a route. */
772 	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
773 		return -1;
774 
775 	skb = tcp_make_synack(sk, dst, req, rvp);
776 
777 	if (skb) {
778 		__tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
779 
780 		err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
781 					    ireq->rmt_addr,
782 					    ireq->opt);
783 		err = net_xmit_eval(err);
784 	}
785 
786 	dst_release(dst);
787 	return err;
788 }
789 
790 static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
791 			      struct request_values *rvp)
792 {
793 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
794 	return tcp_v4_send_synack(sk, NULL, req, rvp);
795 }
796 
797 /*
798  *	IPv4 request_sock destructor.
799  */
800 static void tcp_v4_reqsk_destructor(struct request_sock *req)
801 {
802 	kfree(inet_rsk(req)->opt);
803 }
804 
805 static void syn_flood_warning(const struct sk_buff *skb)
806 {
807 	const char *msg;
808 
809 #ifdef CONFIG_SYN_COOKIES
810 	if (sysctl_tcp_syncookies)
811 		msg = "Sending cookies";
812 	else
813 #endif
814 		msg = "Dropping request";
815 
816 	pr_info("TCP: Possible SYN flooding on port %d. %s.\n",
817 				ntohs(tcp_hdr(skb)->dest), msg);
818 }
819 
820 /*
821  * Save and compile IPv4 options into the request_sock if needed.
822  */
823 static struct ip_options *tcp_v4_save_options(struct sock *sk,
824 					      struct sk_buff *skb)
825 {
826 	struct ip_options *opt = &(IPCB(skb)->opt);
827 	struct ip_options *dopt = NULL;
828 
829 	if (opt && opt->optlen) {
830 		int opt_size = optlength(opt);
831 		dopt = kmalloc(opt_size, GFP_ATOMIC);
832 		if (dopt) {
833 			if (ip_options_echo(dopt, skb)) {
834 				kfree(dopt);
835 				dopt = NULL;
836 			}
837 		}
838 	}
839 	return dopt;
840 }
841 
842 #ifdef CONFIG_TCP_MD5SIG
843 /*
844  * RFC2385 MD5 checksumming requires a mapping of
845  * IP address->MD5 Key.
846  * We need to maintain these in the sk structure.
847  */
848 
849 /* Find the Key structure for an address.  */
850 static struct tcp_md5sig_key *
851 			tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
852 {
853 	struct tcp_sock *tp = tcp_sk(sk);
854 	int i;
855 
856 	if (!tp->md5sig_info || !tp->md5sig_info->entries4)
857 		return NULL;
858 	for (i = 0; i < tp->md5sig_info->entries4; i++) {
859 		if (tp->md5sig_info->keys4[i].addr == addr)
860 			return &tp->md5sig_info->keys4[i].base;
861 	}
862 	return NULL;
863 }
864 
865 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
866 					 struct sock *addr_sk)
867 {
868 	return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
869 }
870 EXPORT_SYMBOL(tcp_v4_md5_lookup);
871 
872 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
873 						      struct request_sock *req)
874 {
875 	return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
876 }
877 
878 /* This can be called on a newly created socket, from other files */
879 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
880 		      u8 *newkey, u8 newkeylen)
881 {
882 	/* Add Key to the list */
883 	struct tcp_md5sig_key *key;
884 	struct tcp_sock *tp = tcp_sk(sk);
885 	struct tcp4_md5sig_key *keys;
886 
887 	key = tcp_v4_md5_do_lookup(sk, addr);
888 	if (key) {
889 		/* Pre-existing entry - just update that one. */
890 		kfree(key->key);
891 		key->key = newkey;
892 		key->keylen = newkeylen;
893 	} else {
894 		struct tcp_md5sig_info *md5sig;
895 
896 		if (!tp->md5sig_info) {
897 			tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
898 						  GFP_ATOMIC);
899 			if (!tp->md5sig_info) {
900 				kfree(newkey);
901 				return -ENOMEM;
902 			}
903 			sk_nocaps_add(sk, NETIF_F_GSO_MASK);
904 		}
905 		if (tcp_alloc_md5sig_pool(sk) == NULL) {
906 			kfree(newkey);
907 			return -ENOMEM;
908 		}
909 		md5sig = tp->md5sig_info;
910 
911 		if (md5sig->alloced4 == md5sig->entries4) {
912 			keys = kmalloc((sizeof(*keys) *
913 					(md5sig->entries4 + 1)), GFP_ATOMIC);
914 			if (!keys) {
915 				kfree(newkey);
916 				tcp_free_md5sig_pool();
917 				return -ENOMEM;
918 			}
919 
920 			if (md5sig->entries4)
921 				memcpy(keys, md5sig->keys4,
922 				       sizeof(*keys) * md5sig->entries4);
923 
924 			/* Free old key list, and reference new one */
925 			kfree(md5sig->keys4);
926 			md5sig->keys4 = keys;
927 			md5sig->alloced4++;
928 		}
929 		md5sig->entries4++;
930 		md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
931 		md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
932 		md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
933 	}
934 	return 0;
935 }
936 EXPORT_SYMBOL(tcp_v4_md5_do_add);
937 
938 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
939 			       u8 *newkey, u8 newkeylen)
940 {
941 	return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
942 				 newkey, newkeylen);
943 }
944 
945 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
946 {
947 	struct tcp_sock *tp = tcp_sk(sk);
948 	int i;
949 
950 	for (i = 0; i < tp->md5sig_info->entries4; i++) {
951 		if (tp->md5sig_info->keys4[i].addr == addr) {
952 			/* Free the key */
953 			kfree(tp->md5sig_info->keys4[i].base.key);
954 			tp->md5sig_info->entries4--;
955 
956 			if (tp->md5sig_info->entries4 == 0) {
957 				kfree(tp->md5sig_info->keys4);
958 				tp->md5sig_info->keys4 = NULL;
959 				tp->md5sig_info->alloced4 = 0;
960 			} else if (tp->md5sig_info->entries4 != i) {
961 				/* Need to do some manipulation */
962 				memmove(&tp->md5sig_info->keys4[i],
963 					&tp->md5sig_info->keys4[i+1],
964 					(tp->md5sig_info->entries4 - i) *
965 					 sizeof(struct tcp4_md5sig_key));
966 			}
967 			tcp_free_md5sig_pool();
968 			return 0;
969 		}
970 	}
971 	return -ENOENT;
972 }
973 EXPORT_SYMBOL(tcp_v4_md5_do_del);
974 
975 static void tcp_v4_clear_md5_list(struct sock *sk)
976 {
977 	struct tcp_sock *tp = tcp_sk(sk);
978 
979 	/* Free each key, then the key array itself,
980 	 * and finally drop our reference on the shared
981 	 * MD5 crypto pool.
982 	 */
983 	if (tp->md5sig_info->entries4) {
984 		int i;
985 		for (i = 0; i < tp->md5sig_info->entries4; i++)
986 			kfree(tp->md5sig_info->keys4[i].base.key);
987 		tp->md5sig_info->entries4 = 0;
988 		tcp_free_md5sig_pool();
989 	}
990 	if (tp->md5sig_info->keys4) {
991 		kfree(tp->md5sig_info->keys4);
992 		tp->md5sig_info->keys4 = NULL;
993 		tp->md5sig_info->alloced4  = 0;
994 	}
995 }
996 
997 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
998 				 int optlen)
999 {
1000 	struct tcp_md5sig cmd;
1001 	struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
1002 	u8 *newkey;
1003 
1004 	if (optlen < sizeof(cmd))
1005 		return -EINVAL;
1006 
1007 	if (copy_from_user(&cmd, optval, sizeof(cmd)))
1008 		return -EFAULT;
1009 
1010 	if (sin->sin_family != AF_INET)
1011 		return -EINVAL;
1012 
1013 	if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1014 		if (!tcp_sk(sk)->md5sig_info)
1015 			return -ENOENT;
1016 		return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1017 	}
1018 
1019 	if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1020 		return -EINVAL;
1021 
1022 	if (!tcp_sk(sk)->md5sig_info) {
1023 		struct tcp_sock *tp = tcp_sk(sk);
1024 		struct tcp_md5sig_info *p;
1025 
1026 		p = kzalloc(sizeof(*p), sk->sk_allocation);
1027 		if (!p)
1028 			return -EINVAL;
1029 
1030 		tp->md5sig_info = p;
1031 		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
1032 	}
1033 
1034 	newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1035 	if (!newkey)
1036 		return -ENOMEM;
1037 	return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1038 				 newkey, cmd.tcpm_keylen);
1039 }
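/* User-space sketch of what feeds this parser (illustrative only; the peer
 * address and key are made up):
 *
 *	struct tcp_md5sig md5 = { .tcpm_keylen = 6 };
 *	struct sockaddr_in *a = (struct sockaddr_in *)&md5.tcpm_addr;
 *
 *	a->sin_family = AF_INET;
 *	inet_pton(AF_INET, "192.0.2.7", &a->sin_addr);
 *	memcpy(md5.tcpm_key, "secret", 6);
 *	setsockopt(fd, IPPROTO_TCP, TCP_MD5SIG, &md5, sizeof(md5));
 *
 * Passing a zero tcpm_keylen instead deletes the key for that peer, which
 * is the tcp_v4_md5_do_del() path above.
 */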
1040 
1041 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1042 					__be32 daddr, __be32 saddr, int nbytes)
1043 {
1044 	struct tcp4_pseudohdr *bp;
1045 	struct scatterlist sg;
1046 
1047 	bp = &hp->md5_blk.ip4;
1048 
1049 	/*
1050 	 * 1. the TCP pseudo-header (in the order: source IP address,
1051 	 * destination IP address, zero-padded protocol number, and
1052 	 * segment length)
1053 	 */
1054 	bp->saddr = saddr;
1055 	bp->daddr = daddr;
1056 	bp->pad = 0;
1057 	bp->protocol = IPPROTO_TCP;
1058 	bp->len = cpu_to_be16(nbytes);
1059 
1060 	sg_init_one(&sg, bp, sizeof(*bp));
1061 	return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1062 }
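/* The 12-byte block hashed above mirrors the classic TCP pseudo-header,
 * which RFC 2385 signs first, before the TCP header, payload and key:
 *
 *	+--------+--------+--------+--------+
 *	|          source address           |  bp->saddr
 *	+--------+--------+--------+--------+
 *	|        destination address        |  bp->daddr
 *	+--------+--------+--------+--------+
 *	|  zero  | proto 6|   segment len    |  bp->pad, bp->protocol, bp->len
 *	+--------+--------+--------+--------+
 */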
1063 
1064 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1065 			       __be32 daddr, __be32 saddr, struct tcphdr *th)
1066 {
1067 	struct tcp_md5sig_pool *hp;
1068 	struct hash_desc *desc;
1069 
1070 	hp = tcp_get_md5sig_pool();
1071 	if (!hp)
1072 		goto clear_hash_noput;
1073 	desc = &hp->md5_desc;
1074 
1075 	if (crypto_hash_init(desc))
1076 		goto clear_hash;
1077 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1078 		goto clear_hash;
1079 	if (tcp_md5_hash_header(hp, th))
1080 		goto clear_hash;
1081 	if (tcp_md5_hash_key(hp, key))
1082 		goto clear_hash;
1083 	if (crypto_hash_final(desc, md5_hash))
1084 		goto clear_hash;
1085 
1086 	tcp_put_md5sig_pool();
1087 	return 0;
1088 
1089 clear_hash:
1090 	tcp_put_md5sig_pool();
1091 clear_hash_noput:
1092 	memset(md5_hash, 0, 16);
1093 	return 1;
1094 }
1095 
1096 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1097 			struct sock *sk, struct request_sock *req,
1098 			struct sk_buff *skb)
1099 {
1100 	struct tcp_md5sig_pool *hp;
1101 	struct hash_desc *desc;
1102 	struct tcphdr *th = tcp_hdr(skb);
1103 	__be32 saddr, daddr;
1104 
1105 	if (sk) {
1106 		saddr = inet_sk(sk)->inet_saddr;
1107 		daddr = inet_sk(sk)->inet_daddr;
1108 	} else if (req) {
1109 		saddr = inet_rsk(req)->loc_addr;
1110 		daddr = inet_rsk(req)->rmt_addr;
1111 	} else {
1112 		const struct iphdr *iph = ip_hdr(skb);
1113 		saddr = iph->saddr;
1114 		daddr = iph->daddr;
1115 	}
1116 
1117 	hp = tcp_get_md5sig_pool();
1118 	if (!hp)
1119 		goto clear_hash_noput;
1120 	desc = &hp->md5_desc;
1121 
1122 	if (crypto_hash_init(desc))
1123 		goto clear_hash;
1124 
1125 	if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1126 		goto clear_hash;
1127 	if (tcp_md5_hash_header(hp, th))
1128 		goto clear_hash;
1129 	if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1130 		goto clear_hash;
1131 	if (tcp_md5_hash_key(hp, key))
1132 		goto clear_hash;
1133 	if (crypto_hash_final(desc, md5_hash))
1134 		goto clear_hash;
1135 
1136 	tcp_put_md5sig_pool();
1137 	return 0;
1138 
1139 clear_hash:
1140 	tcp_put_md5sig_pool();
1141 clear_hash_noput:
1142 	memset(md5_hash, 0, 16);
1143 	return 1;
1144 }
1145 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1146 
1147 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1148 {
1149 	/*
1150 	 * This gets called for each TCP segment that arrives,
1151 	 * so we want to be efficient.
1152 	 * We have 3 drop cases:
1153 	 * o No MD5 hash and one expected.
1154 	 * o MD5 hash and we're not expecting one.
1155 	 * o MD5 hash and it's wrong.
1156 	 */
1157 	__u8 *hash_location = NULL;
1158 	struct tcp_md5sig_key *hash_expected;
1159 	const struct iphdr *iph = ip_hdr(skb);
1160 	struct tcphdr *th = tcp_hdr(skb);
1161 	int genhash;
1162 	unsigned char newhash[16];
1163 
1164 	hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1165 	hash_location = tcp_parse_md5sig_option(th);
1166 
1167 	/* We've parsed the options - do we have a hash? */
1168 	if (!hash_expected && !hash_location)
1169 		return 0;
1170 
1171 	if (hash_expected && !hash_location) {
1172 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1173 		return 1;
1174 	}
1175 
1176 	if (!hash_expected && hash_location) {
1177 		NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1178 		return 1;
1179 	}
1180 
1181 	/* Okay, so this is hash_expected and hash_location -
1182 	 * so we need to calculate the checksum.
1183 	 */
1184 	genhash = tcp_v4_md5_hash_skb(newhash,
1185 				      hash_expected,
1186 				      NULL, NULL, skb);
1187 
1188 	if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1189 		if (net_ratelimit()) {
1190 			printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1191 			       &iph->saddr, ntohs(th->source),
1192 			       &iph->daddr, ntohs(th->dest),
1193 			       genhash ? " tcp_v4_calc_md5_hash failed" : "");
1194 		}
1195 		return 1;
1196 	}
1197 	return 0;
1198 }
1199 
1200 #endif
1201 
1202 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1203 	.family		=	PF_INET,
1204 	.obj_size	=	sizeof(struct tcp_request_sock),
1205 	.rtx_syn_ack	=	tcp_v4_rtx_synack,
1206 	.send_ack	=	tcp_v4_reqsk_send_ack,
1207 	.destructor	=	tcp_v4_reqsk_destructor,
1208 	.send_reset	=	tcp_v4_send_reset,
1209 	.syn_ack_timeout = 	tcp_syn_ack_timeout,
1210 };
1211 
1212 #ifdef CONFIG_TCP_MD5SIG
1213 static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
1214 	.md5_lookup	=	tcp_v4_reqsk_md5_lookup,
1215 	.calc_md5_hash	=	tcp_v4_md5_hash_skb,
1216 };
1217 #endif
1218 
1219 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1220 {
1221 	struct tcp_extend_values tmp_ext;
1222 	struct tcp_options_received tmp_opt;
1223 	u8 *hash_location;
1224 	struct request_sock *req;
1225 	struct inet_request_sock *ireq;
1226 	struct tcp_sock *tp = tcp_sk(sk);
1227 	struct dst_entry *dst = NULL;
1228 	__be32 saddr = ip_hdr(skb)->saddr;
1229 	__be32 daddr = ip_hdr(skb)->daddr;
1230 	__u32 isn = TCP_SKB_CB(skb)->when;
1231 #ifdef CONFIG_SYN_COOKIES
1232 	int want_cookie = 0;
1233 #else
1234 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1235 #endif
1236 
1237 	/* Never answer SYNs sent to broadcast or multicast */
1238 	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1239 		goto drop;
1240 
1241 	/* TW buckets are converted to open requests without
1242 	 * limitations; they conserve resources and the peer is
1243 	 * evidently a real one.
1244 	 */
1245 	if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1246 		if (net_ratelimit())
1247 			syn_flood_warning(skb);
1248 #ifdef CONFIG_SYN_COOKIES
1249 		if (sysctl_tcp_syncookies) {
1250 			want_cookie = 1;
1251 		} else
1252 #endif
1253 		goto drop;
1254 	}
1255 
1256 	/* Accept backlog is full. If we have already queued enough
1257 	 * warm entries in the SYN queue, drop the request. It is better than
1258 	 * clogging the SYN queue with open requests whose timeouts grow
1259 	 * exponentially.
1260 	 */
1261 	if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1262 		goto drop;
1263 
1264 	req = inet_reqsk_alloc(&tcp_request_sock_ops);
1265 	if (!req)
1266 		goto drop;
1267 
1268 #ifdef CONFIG_TCP_MD5SIG
1269 	tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1270 #endif
1271 
1272 	tcp_clear_options(&tmp_opt);
1273 	tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1274 	tmp_opt.user_mss  = tp->rx_opt.user_mss;
1275 	tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
1276 
1277 	if (tmp_opt.cookie_plus > 0 &&
1278 	    tmp_opt.saw_tstamp &&
1279 	    !tp->rx_opt.cookie_out_never &&
1280 	    (sysctl_tcp_cookie_size > 0 ||
1281 	     (tp->cookie_values != NULL &&
1282 	      tp->cookie_values->cookie_desired > 0))) {
1283 		u8 *c;
1284 		u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
1285 		int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
1286 
1287 		if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
1288 			goto drop_and_release;
1289 
1290 		/* Secret recipe starts with IP addresses */
1291 		*mess++ ^= (__force u32)daddr;
1292 		*mess++ ^= (__force u32)saddr;
1293 
1294 		/* plus variable length Initiator Cookie */
1295 		c = (u8 *)mess;
1296 		while (l-- > 0)
1297 			*c++ ^= *hash_location++;
1298 
1299 #ifdef CONFIG_SYN_COOKIES
1300 		want_cookie = 0;	/* not our kind of cookie */
1301 #endif
1302 		tmp_ext.cookie_out_never = 0; /* false */
1303 		tmp_ext.cookie_plus = tmp_opt.cookie_plus;
1304 	} else if (!tp->rx_opt.cookie_in_always) {
1305 		/* redundant indications, but ensure initialization. */
1306 		tmp_ext.cookie_out_never = 1; /* true */
1307 		tmp_ext.cookie_plus = 0;
1308 	} else {
1309 		goto drop_and_release;
1310 	}
1311 	tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
1312 
1313 	if (want_cookie && !tmp_opt.saw_tstamp)
1314 		tcp_clear_options(&tmp_opt);
1315 
1316 	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1317 	tcp_openreq_init(req, &tmp_opt, skb);
1318 
1319 	ireq = inet_rsk(req);
1320 	ireq->loc_addr = daddr;
1321 	ireq->rmt_addr = saddr;
1322 	ireq->no_srccheck = inet_sk(sk)->transparent;
1323 	ireq->opt = tcp_v4_save_options(sk, skb);
1324 
1325 	if (security_inet_conn_request(sk, skb, req))
1326 		goto drop_and_free;
1327 
1328 	if (!want_cookie || tmp_opt.tstamp_ok)
1329 		TCP_ECN_create_request(req, tcp_hdr(skb));
1330 
1331 	if (want_cookie) {
1332 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1333 		req->cookie_ts = tmp_opt.tstamp_ok;
1334 	} else if (!isn) {
1335 		struct inet_peer *peer = NULL;
1336 
1337 		/* VJ's idea. We save the last timestamp seen
1338 		 * from the destination in the peer table when entering
1339 		 * state TIME-WAIT, and check against it before
1340 		 * accepting a new connection request.
1341 		 *
1342 		 * If "isn" is not zero, this request hit an alive
1343 		 * timewait bucket, so all the necessary checks
1344 		 * are made in the function processing the timewait state.
1345 		 */
1346 		if (tmp_opt.saw_tstamp &&
1347 		    tcp_death_row.sysctl_tw_recycle &&
1348 		    (dst = inet_csk_route_req(sk, req)) != NULL &&
1349 		    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1350 		    peer->daddr.addr.a4 == saddr) {
1351 			inet_peer_refcheck(peer);
1352 			if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1353 			    (s32)(peer->tcp_ts - req->ts_recent) >
1354 							TCP_PAWS_WINDOW) {
1355 				NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1356 				goto drop_and_release;
1357 			}
1358 		}
1359 		/* Kill the following clause, if you dislike this way. */
1360 		else if (!sysctl_tcp_syncookies &&
1361 			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1362 			  (sysctl_max_syn_backlog >> 2)) &&
1363 			 (!peer || !peer->tcp_ts_stamp) &&
1364 			 (!dst || !dst_metric(dst, RTAX_RTT))) {
1365 			/* Without syncookies, the last quarter of the
1366 			 * backlog is reserved for destinations proven
1367 			 * to be alive.
1368 			 * It means that we continue to communicate only
1369 			 * with destinations already remembered by the
1370 			 * time the synflood started.
1371 			 */
1372 			LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1373 				       &saddr, ntohs(tcp_hdr(skb)->source));
1374 			goto drop_and_release;
1375 		}
1376 
1377 		isn = tcp_v4_init_sequence(skb);
1378 	}
1379 	tcp_rsk(req)->snt_isn = isn;
1380 
1381 	if (tcp_v4_send_synack(sk, dst, req,
1382 			       (struct request_values *)&tmp_ext) ||
1383 	    want_cookie)
1384 		goto drop_and_free;
1385 
1386 	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1387 	return 0;
1388 
1389 drop_and_release:
1390 	dst_release(dst);
1391 drop_and_free:
1392 	reqsk_free(req);
1393 drop:
1394 	return 0;
1395 }
1396 EXPORT_SYMBOL(tcp_v4_conn_request);
1397 
1398 
1399 /*
1400  * The three way handshake has completed - we got a valid synack -
1401  * now create the new socket.
1402  */
1403 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1404 				  struct request_sock *req,
1405 				  struct dst_entry *dst)
1406 {
1407 	struct inet_request_sock *ireq;
1408 	struct inet_sock *newinet;
1409 	struct tcp_sock *newtp;
1410 	struct sock *newsk;
1411 #ifdef CONFIG_TCP_MD5SIG
1412 	struct tcp_md5sig_key *key;
1413 #endif
1414 
1415 	if (sk_acceptq_is_full(sk))
1416 		goto exit_overflow;
1417 
1418 	if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1419 		goto exit;
1420 
1421 	newsk = tcp_create_openreq_child(sk, req, skb);
1422 	if (!newsk)
1423 		goto exit_nonewsk;
1424 
1425 	newsk->sk_gso_type = SKB_GSO_TCPV4;
1426 	sk_setup_caps(newsk, dst);
1427 
1428 	newtp		      = tcp_sk(newsk);
1429 	newinet		      = inet_sk(newsk);
1430 	ireq		      = inet_rsk(req);
1431 	newinet->inet_daddr   = ireq->rmt_addr;
1432 	newinet->inet_rcv_saddr = ireq->loc_addr;
1433 	newinet->inet_saddr	      = ireq->loc_addr;
1434 	newinet->opt	      = ireq->opt;
1435 	ireq->opt	      = NULL;
1436 	newinet->mc_index     = inet_iif(skb);
1437 	newinet->mc_ttl	      = ip_hdr(skb)->ttl;
1438 	inet_csk(newsk)->icsk_ext_hdr_len = 0;
1439 	if (newinet->opt)
1440 		inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1441 	newinet->inet_id = newtp->write_seq ^ jiffies;
1442 
1443 	tcp_mtup_init(newsk);
1444 	tcp_sync_mss(newsk, dst_mtu(dst));
1445 	newtp->advmss = dst_metric_advmss(dst);
1446 	if (tcp_sk(sk)->rx_opt.user_mss &&
1447 	    tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1448 		newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1449 
1450 	tcp_initialize_rcv_mss(newsk);
1451 
1452 #ifdef CONFIG_TCP_MD5SIG
1453 	/* Copy over the MD5 key from the original socket */
1454 	key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1455 	if (key != NULL) {
1456 		/*
1457 		 * We're using one, so create a matching key
1458 		 * on the newsk structure. If we fail to get
1459 		 * memory, then we end up not copying the key
1460 		 * across. Shucks.
1461 		 */
1462 		char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1463 		if (newkey != NULL)
1464 			tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1465 					  newkey, key->keylen);
1466 		sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
1467 	}
1468 #endif
1469 
1470 	if (__inet_inherit_port(sk, newsk) < 0) {
1471 		sock_put(newsk);
1472 		goto exit;
1473 	}
1474 	__inet_hash_nolisten(newsk, NULL);
1475 
1476 	return newsk;
1477 
1478 exit_overflow:
1479 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1480 exit_nonewsk:
1481 	dst_release(dst);
1482 exit:
1483 	NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1484 	return NULL;
1485 }
1486 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
1487 
1488 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1489 {
1490 	struct tcphdr *th = tcp_hdr(skb);
1491 	const struct iphdr *iph = ip_hdr(skb);
1492 	struct sock *nsk;
1493 	struct request_sock **prev;
1494 	/* Find possible connection requests. */
1495 	struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1496 						       iph->saddr, iph->daddr);
1497 	if (req)
1498 		return tcp_check_req(sk, skb, req, prev);
1499 
1500 	nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1501 			th->source, iph->daddr, th->dest, inet_iif(skb));
1502 
1503 	if (nsk) {
1504 		if (nsk->sk_state != TCP_TIME_WAIT) {
1505 			bh_lock_sock(nsk);
1506 			return nsk;
1507 		}
1508 		inet_twsk_put(inet_twsk(nsk));
1509 		return NULL;
1510 	}
1511 
1512 #ifdef CONFIG_SYN_COOKIES
1513 	if (!th->syn)
1514 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1515 #endif
1516 	return sk;
1517 }
1518 
1519 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1520 {
1521 	const struct iphdr *iph = ip_hdr(skb);
1522 
1523 	if (skb->ip_summed == CHECKSUM_COMPLETE) {
1524 		if (!tcp_v4_check(skb->len, iph->saddr,
1525 				  iph->daddr, skb->csum)) {
1526 			skb->ip_summed = CHECKSUM_UNNECESSARY;
1527 			return 0;
1528 		}
1529 	}
1530 
1531 	skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1532 				       skb->len, IPPROTO_TCP, 0);
1533 
1534 	if (skb->len <= 76) {
1535 		return __skb_checksum_complete(skb);
1536 	}
1537 	return 0;
1538 }
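/* Sketch of the three outcomes above (packet sizes are just examples):
 * a CHECKSUM_COMPLETE packet whose hardware sum verifies against the
 * pseudo-header is marked CHECKSUM_UNNECESSARY and costs nothing more;
 * a small packet, e.g. a 60-byte pure ACK (<= 76 bytes), is checksummed
 * right away via __skb_checksum_complete(); a 1500-byte data segment only
 * gets the pseudo-header sum stored in skb->csum here, and the full
 * verification is deferred to tcp_checksum_complete() later in the
 * receive path.
 */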
1539 
1540 
1541 /* The socket must have its spinlock held when we get
1542  * here.
1543  *
1544  * We have a potential double-lock case here, so even when
1545  * doing backlog processing we use the BH locking scheme.
1546  * This is because we cannot sleep with the original spinlock
1547  * held.
1548  */
1549 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1550 {
1551 	struct sock *rsk;
1552 #ifdef CONFIG_TCP_MD5SIG
1553 	/*
1554 	 * We really want to reject the packet as early as possible
1555 	 * if:
1556 	 *  o We're expecting an MD5'd packet and there is no MD5 TCP option
1557 	 *  o There is an MD5 option and we're not expecting one
1558 	 */
1559 	if (tcp_v4_inbound_md5_hash(sk, skb))
1560 		goto discard;
1561 #endif
1562 
1563 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1564 		sock_rps_save_rxhash(sk, skb->rxhash);
1565 		if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1566 			rsk = sk;
1567 			goto reset;
1568 		}
1569 		return 0;
1570 	}
1571 
1572 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1573 		goto csum_err;
1574 
1575 	if (sk->sk_state == TCP_LISTEN) {
1576 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1577 		if (!nsk)
1578 			goto discard;
1579 
1580 		if (nsk != sk) {
1581 			if (tcp_child_process(sk, nsk, skb)) {
1582 				rsk = nsk;
1583 				goto reset;
1584 			}
1585 			return 0;
1586 		}
1587 	} else
1588 		sock_rps_save_rxhash(sk, skb->rxhash);
1589 
1590 	if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1591 		rsk = sk;
1592 		goto reset;
1593 	}
1594 	return 0;
1595 
1596 reset:
1597 	tcp_v4_send_reset(rsk, skb);
1598 discard:
1599 	kfree_skb(skb);
1600 	/* Be careful here. If this function gets more complicated and
1601 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1602 	 * might be destroyed here. This current version compiles correctly,
1603 	 * but you have been warned.
1604 	 */
1605 	return 0;
1606 
1607 csum_err:
1608 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1609 	goto discard;
1610 }
1611 EXPORT_SYMBOL(tcp_v4_do_rcv);
1612 
1613 /*
1614  *	From tcp_input.c
1615  */
1616 
1617 int tcp_v4_rcv(struct sk_buff *skb)
1618 {
1619 	const struct iphdr *iph;
1620 	struct tcphdr *th;
1621 	struct sock *sk;
1622 	int ret;
1623 	struct net *net = dev_net(skb->dev);
1624 
1625 	if (skb->pkt_type != PACKET_HOST)
1626 		goto discard_it;
1627 
1628 	/* Count it even if it's bad */
1629 	TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1630 
1631 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1632 		goto discard_it;
1633 
1634 	th = tcp_hdr(skb);
1635 
1636 	if (th->doff < sizeof(struct tcphdr) / 4)
1637 		goto bad_packet;
1638 	if (!pskb_may_pull(skb, th->doff * 4))
1639 		goto discard_it;
1640 
1641 	/* An explanation is required here, I think.
1642 	 * Packet length and doff are validated by header prediction,
1643 	 * provided the case of th->doff==0 is eliminated.
1644 	 * So, we defer the checks. */
1645 	if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1646 		goto bad_packet;
1647 
1648 	th = tcp_hdr(skb);
1649 	iph = ip_hdr(skb);
1650 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1651 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1652 				    skb->len - th->doff * 4);
1653 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1654 	TCP_SKB_CB(skb)->when	 = 0;
1655 	TCP_SKB_CB(skb)->flags	 = iph->tos;
1656 	TCP_SKB_CB(skb)->sacked	 = 0;
1657 
1658 	sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1659 	if (!sk)
1660 		goto no_tcp_socket;
1661 
1662 process:
1663 	if (sk->sk_state == TCP_TIME_WAIT)
1664 		goto do_time_wait;
1665 
1666 	if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
1667 		NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
1668 		goto discard_and_relse;
1669 	}
1670 
1671 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1672 		goto discard_and_relse;
1673 	nf_reset(skb);
1674 
1675 	if (sk_filter(sk, skb))
1676 		goto discard_and_relse;
1677 
1678 	skb->dev = NULL;
1679 
1680 	bh_lock_sock_nested(sk);
1681 	ret = 0;
1682 	if (!sock_owned_by_user(sk)) {
1683 #ifdef CONFIG_NET_DMA
1684 		struct tcp_sock *tp = tcp_sk(sk);
1685 		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1686 			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1687 		if (tp->ucopy.dma_chan)
1688 			ret = tcp_v4_do_rcv(sk, skb);
1689 		else
1690 #endif
1691 		{
1692 			if (!tcp_prequeue(sk, skb))
1693 				ret = tcp_v4_do_rcv(sk, skb);
1694 		}
1695 	} else if (unlikely(sk_add_backlog(sk, skb))) {
1696 		bh_unlock_sock(sk);
1697 		NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
1698 		goto discard_and_relse;
1699 	}
1700 	bh_unlock_sock(sk);
1701 
1702 	sock_put(sk);
1703 
1704 	return ret;
1705 
1706 no_tcp_socket:
1707 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1708 		goto discard_it;
1709 
1710 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1711 bad_packet:
1712 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1713 	} else {
1714 		tcp_v4_send_reset(NULL, skb);
1715 	}
1716 
1717 discard_it:
1718 	/* Discard frame. */
1719 	kfree_skb(skb);
1720 	return 0;
1721 
1722 discard_and_relse:
1723 	sock_put(sk);
1724 	goto discard_it;
1725 
1726 do_time_wait:
1727 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1728 		inet_twsk_put(inet_twsk(sk));
1729 		goto discard_it;
1730 	}
1731 
1732 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1733 		TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1734 		inet_twsk_put(inet_twsk(sk));
1735 		goto discard_it;
1736 	}
1737 	switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1738 	case TCP_TW_SYN: {
1739 		struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1740 							&tcp_hashinfo,
1741 							iph->daddr, th->dest,
1742 							inet_iif(skb));
1743 		if (sk2) {
1744 			inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1745 			inet_twsk_put(inet_twsk(sk));
1746 			sk = sk2;
1747 			goto process;
1748 		}
1749 		/* Fall through to ACK */
1750 	}
1751 	case TCP_TW_ACK:
1752 		tcp_v4_timewait_ack(sk, skb);
1753 		break;
1754 	case TCP_TW_RST:
1755 		goto no_tcp_socket;
1756 	case TCP_TW_SUCCESS:;
1757 	}
1758 	goto discard_it;
1759 }
1760 
1761 struct inet_peer *tcp_v4_get_peer(struct sock *sk, bool *release_it)
1762 {
1763 	struct rtable *rt = (struct rtable *) __sk_dst_get(sk);
1764 	struct inet_sock *inet = inet_sk(sk);
1765 	struct inet_peer *peer;
1766 
1767 	if (!rt || rt->rt_dst != inet->inet_daddr) {
1768 		peer = inet_getpeer_v4(inet->inet_daddr, 1);
1769 		*release_it = true;
1770 	} else {
1771 		if (!rt->peer)
1772 			rt_bind_peer(rt, 1);
1773 		peer = rt->peer;
1774 		*release_it = false;
1775 	}
1776 
1777 	return peer;
1778 }
1779 EXPORT_SYMBOL(tcp_v4_get_peer);
1780 
1781 void *tcp_v4_tw_get_peer(struct sock *sk)
1782 {
1783 	struct inet_timewait_sock *tw = inet_twsk(sk);
1784 
1785 	return inet_getpeer_v4(tw->tw_daddr, 1);
1786 }
1787 EXPORT_SYMBOL(tcp_v4_tw_get_peer);
1788 
1789 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1790 	.twsk_obj_size	= sizeof(struct tcp_timewait_sock),
1791 	.twsk_unique	= tcp_twsk_unique,
1792 	.twsk_destructor= tcp_twsk_destructor,
1793 	.twsk_getpeer	= tcp_v4_tw_get_peer,
1794 };
1795 
1796 const struct inet_connection_sock_af_ops ipv4_specific = {
1797 	.queue_xmit	   = ip_queue_xmit,
1798 	.send_check	   = tcp_v4_send_check,
1799 	.rebuild_header	   = inet_sk_rebuild_header,
1800 	.conn_request	   = tcp_v4_conn_request,
1801 	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
1802 	.get_peer	   = tcp_v4_get_peer,
1803 	.net_header_len	   = sizeof(struct iphdr),
1804 	.setsockopt	   = ip_setsockopt,
1805 	.getsockopt	   = ip_getsockopt,
1806 	.addr2sockaddr	   = inet_csk_addr2sockaddr,
1807 	.sockaddr_len	   = sizeof(struct sockaddr_in),
1808 	.bind_conflict	   = inet_csk_bind_conflict,
1809 #ifdef CONFIG_COMPAT
1810 	.compat_setsockopt = compat_ip_setsockopt,
1811 	.compat_getsockopt = compat_ip_getsockopt,
1812 #endif
1813 };
1814 EXPORT_SYMBOL(ipv4_specific);
1815 
1816 #ifdef CONFIG_TCP_MD5SIG
1817 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1818 	.md5_lookup		= tcp_v4_md5_lookup,
1819 	.calc_md5_hash		= tcp_v4_md5_hash_skb,
1820 	.md5_add		= tcp_v4_md5_add_func,
1821 	.md5_parse		= tcp_v4_parse_md5_keys,
1822 };
1823 #endif
1824 
1825 /* NOTE: A lot of things are set to zero explicitly by the call to
1826  *       sk_alloc(), so they need not be done here.
1827  */
1828 static int tcp_v4_init_sock(struct sock *sk)
1829 {
1830 	struct inet_connection_sock *icsk = inet_csk(sk);
1831 	struct tcp_sock *tp = tcp_sk(sk);
1832 
1833 	skb_queue_head_init(&tp->out_of_order_queue);
1834 	tcp_init_xmit_timers(sk);
1835 	tcp_prequeue_init(tp);
1836 
1837 	icsk->icsk_rto = TCP_TIMEOUT_INIT;
1838 	tp->mdev = TCP_TIMEOUT_INIT;
1839 
1840 	/* So many TCP implementations out there (incorrectly) count the
1841 	 * initial SYN frame in their delayed-ACK and congestion control
1842 	 * algorithms that we must have the following bandaid to talk
1843 	 * efficiently to them.  -DaveM
1844 	 */
1845 	tp->snd_cwnd = 2;
1846 
1847 	/* See draft-stevens-tcpca-spec-01 for discussion of the
1848 	 * initialization of these values.
1849 	 */
1850 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1851 	tp->snd_cwnd_clamp = ~0;
1852 	tp->mss_cache = TCP_MSS_DEFAULT;
1853 
1854 	tp->reordering = sysctl_tcp_reordering;
1855 	icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1856 
1857 	sk->sk_state = TCP_CLOSE;
1858 
1859 	sk->sk_write_space = sk_stream_write_space;
1860 	sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1861 
1862 	icsk->icsk_af_ops = &ipv4_specific;
1863 	icsk->icsk_sync_mss = tcp_sync_mss;
1864 #ifdef CONFIG_TCP_MD5SIG
1865 	tp->af_specific = &tcp_sock_ipv4_specific;
1866 #endif
1867 
1868 	/* TCP Cookie Transactions */
1869 	if (sysctl_tcp_cookie_size > 0) {
1870 		/* Default, cookies without s_data_payload. */
1871 		tp->cookie_values =
1872 			kzalloc(sizeof(*tp->cookie_values),
1873 				sk->sk_allocation);
1874 		if (tp->cookie_values != NULL)
1875 			kref_init(&tp->cookie_values->kref);
1876 	}
1877 	/* Presumed zeroed, in order of appearance:
1878 	 *	cookie_in_always, cookie_out_never,
1879 	 *	s_data_constant, s_data_in, s_data_out
1880 	 */
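	/* Start from the default (middle) values of the tcp_wmem/tcp_rmem
	 * sysctl arrays; buffer auto-tuning may grow them later.
	 */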
1881 	sk->sk_sndbuf = sysctl_tcp_wmem[1];
1882 	sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1883 
1884 	local_bh_disable();
1885 	percpu_counter_inc(&tcp_sockets_allocated);
1886 	local_bh_enable();
1887 
1888 	return 0;
1889 }
1890 
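/* Final per-socket cleanup for TCP: stop the transmit timers, release
 * congestion-control state, purge every queue that may still hold skbs,
 * drop MD5/cookie state and the bound port, and account the socket as freed.
 */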
1891 void tcp_v4_destroy_sock(struct sock *sk)
1892 {
1893 	struct tcp_sock *tp = tcp_sk(sk);
1894 
1895 	tcp_clear_xmit_timers(sk);
1896 
1897 	tcp_cleanup_congestion_control(sk);
1898 
1899 	/* Clean up the write buffer. */
1900 	tcp_write_queue_purge(sk);
1901 
1902 	/* Cleans up our, hopefully empty, out_of_order_queue. */
1903 	__skb_queue_purge(&tp->out_of_order_queue);
1904 
1905 #ifdef CONFIG_TCP_MD5SIG
1906 	/* Clean up the MD5 key list, if any */
1907 	if (tp->md5sig_info) {
1908 		tcp_v4_clear_md5_list(sk);
1909 		kfree(tp->md5sig_info);
1910 		tp->md5sig_info = NULL;
1911 	}
1912 #endif
1913 
1914 #ifdef CONFIG_NET_DMA
1915 	/* Cleans up our sk_async_wait_queue */
1916 	__skb_queue_purge(&sk->sk_async_wait_queue);
1917 #endif
1918 
1919 	/* Clean up the prequeue; it really should be empty. */
1920 	__skb_queue_purge(&tp->ucopy.prequeue);
1921 
1922 	/* Clean up a referenced TCP bind bucket. */
1923 	if (inet_csk(sk)->icsk_bind_hash)
1924 		inet_put_port(sk);
1925 
1926 	/*
1927 	 * If a sendmsg cached page exists, toss it.
1928 	 */
1929 	if (sk->sk_sndmsg_page) {
1930 		__free_page(sk->sk_sndmsg_page);
1931 		sk->sk_sndmsg_page = NULL;
1932 	}
1933 
1934 	/* TCP Cookie Transactions */
1935 	if (tp->cookie_values != NULL) {
1936 		kref_put(&tp->cookie_values->kref,
1937 			 tcp_cookie_values_release);
1938 		tp->cookie_values = NULL;
1939 	}
1940 
1941 	percpu_counter_dec(&tcp_sockets_allocated);
1942 }
1943 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1944 
1945 #ifdef CONFIG_PROC_FS
1946 /* Proc filesystem TCP sock list dumping. */
1947 
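/* Helpers for walking the nulls-terminated time-wait chain of an ehash
 * bucket: tw_head() returns the first entry (or NULL if the chain is empty),
 * tw_next() the following one.
 */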
1948 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1949 {
1950 	return hlist_nulls_empty(head) ? NULL :
1951 		list_entry(head->first, struct inet_timewait_sock, tw_node);
1952 }
1953 
1954 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1955 {
1956 	return !is_a_nulls(tw->tw_node.next) ?
1957 		hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1958 }
1959 
1960 /*
1961  * Get the next listener socket following cur.  If cur is NULL, get the first
1962  * socket, starting from the bucket given in st->bucket; when st->bucket is
1963  * zero, the very first socket in the hash table is returned.
1964  */
1965 static void *listening_get_next(struct seq_file *seq, void *cur)
1966 {
1967 	struct inet_connection_sock *icsk;
1968 	struct hlist_nulls_node *node;
1969 	struct sock *sk = cur;
1970 	struct inet_listen_hashbucket *ilb;
1971 	struct tcp_iter_state *st = seq->private;
1972 	struct net *net = seq_file_net(seq);
1973 
1974 	if (!sk) {
1975 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
1976 		spin_lock_bh(&ilb->lock);
1977 		sk = sk_nulls_head(&ilb->head);
1978 		st->offset = 0;
1979 		goto get_sk;
1980 	}
1981 	ilb = &tcp_hashinfo.listening_hash[st->bucket];
1982 	++st->num;
1983 	++st->offset;
1984 
1985 	if (st->state == TCP_SEQ_STATE_OPENREQ) {
1986 		struct request_sock *req = cur;
1987 
1988 		icsk = inet_csk(st->syn_wait_sk);
1989 		req = req->dl_next;
1990 		while (1) {
1991 			while (req) {
1992 				if (req->rsk_ops->family == st->family) {
1993 					cur = req;
1994 					goto out;
1995 				}
1996 				req = req->dl_next;
1997 			}
1998 			if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1999 				break;
2000 get_req:
2001 			req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
2002 		}
2003 		sk	  = sk_nulls_next(st->syn_wait_sk);
2004 		st->state = TCP_SEQ_STATE_LISTENING;
2005 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2006 	} else {
2007 		icsk = inet_csk(sk);
2008 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2009 		if (reqsk_queue_len(&icsk->icsk_accept_queue))
2010 			goto start_req;
2011 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2012 		sk = sk_nulls_next(sk);
2013 	}
2014 get_sk:
2015 	sk_nulls_for_each_from(sk, node) {
2016 		if (!net_eq(sock_net(sk), net))
2017 			continue;
2018 		if (sk->sk_family == st->family) {
2019 			cur = sk;
2020 			goto out;
2021 		}
2022 		icsk = inet_csk(sk);
2023 		read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2024 		if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
2025 start_req:
2026 			st->uid		= sock_i_uid(sk);
2027 			st->syn_wait_sk = sk;
2028 			st->state	= TCP_SEQ_STATE_OPENREQ;
2029 			st->sbucket	= 0;
2030 			goto get_req;
2031 		}
2032 		read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2033 	}
2034 	spin_unlock_bh(&ilb->lock);
2035 	st->offset = 0;
2036 	if (++st->bucket < INET_LHTABLE_SIZE) {
2037 		ilb = &tcp_hashinfo.listening_hash[st->bucket];
2038 		spin_lock_bh(&ilb->lock);
2039 		sk = sk_nulls_head(&ilb->head);
2040 		goto get_sk;
2041 	}
2042 	cur = NULL;
2043 out:
2044 	return cur;
2045 }
2046 
2047 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
2048 {
2049 	struct tcp_iter_state *st = seq->private;
2050 	void *rc;
2051 
2052 	st->bucket = 0;
2053 	st->offset = 0;
2054 	rc = listening_get_next(seq, NULL);
2055 
2056 	while (rc && *pos) {
2057 		rc = listening_get_next(seq, rc);
2058 		--*pos;
2059 	}
2060 	return rc;
2061 }
2062 
2063 static inline int empty_bucket(struct tcp_iter_state *st)
2064 {
2065 	return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2066 		hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2067 }
2068 
2069 /*
2070  * Get the first established socket, starting from the bucket given in st->bucket.
2071  * If st->bucket is zero, the very first socket in the hash is returned.
2072  */
2073 static void *established_get_first(struct seq_file *seq)
2074 {
2075 	struct tcp_iter_state *st = seq->private;
2076 	struct net *net = seq_file_net(seq);
2077 	void *rc = NULL;
2078 
2079 	st->offset = 0;
2080 	for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2081 		struct sock *sk;
2082 		struct hlist_nulls_node *node;
2083 		struct inet_timewait_sock *tw;
2084 		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2085 
2086 		/* Lockless fast path for the common case of empty buckets */
2087 		if (empty_bucket(st))
2088 			continue;
2089 
2090 		spin_lock_bh(lock);
2091 		sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2092 			if (sk->sk_family != st->family ||
2093 			    !net_eq(sock_net(sk), net)) {
2094 				continue;
2095 			}
2096 			rc = sk;
2097 			goto out;
2098 		}
2099 		st->state = TCP_SEQ_STATE_TIME_WAIT;
2100 		inet_twsk_for_each(tw, node,
2101 				   &tcp_hashinfo.ehash[st->bucket].twchain) {
2102 			if (tw->tw_family != st->family ||
2103 			    !net_eq(twsk_net(tw), net)) {
2104 				continue;
2105 			}
2106 			rc = tw;
2107 			goto out;
2108 		}
2109 		spin_unlock_bh(lock);
2110 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2111 	}
2112 out:
2113 	return rc;
2114 }
2115 
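/* Advance the established-table walk: step along the current bucket's
 * established chain, switch to its time-wait chain once that runs out, and
 * only then move to the next non-empty bucket (swapping the bucket locks).
 */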
2116 static void *established_get_next(struct seq_file *seq, void *cur)
2117 {
2118 	struct sock *sk = cur;
2119 	struct inet_timewait_sock *tw;
2120 	struct hlist_nulls_node *node;
2121 	struct tcp_iter_state *st = seq->private;
2122 	struct net *net = seq_file_net(seq);
2123 
2124 	++st->num;
2125 	++st->offset;
2126 
2127 	if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2128 		tw = cur;
2129 		tw = tw_next(tw);
2130 get_tw:
2131 		while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2132 			tw = tw_next(tw);
2133 		}
2134 		if (tw) {
2135 			cur = tw;
2136 			goto out;
2137 		}
2138 		spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2139 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2140 
2141 		/* Look for the next non-empty bucket */
2142 		st->offset = 0;
2143 		while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2144 				empty_bucket(st))
2145 			;
2146 		if (st->bucket > tcp_hashinfo.ehash_mask)
2147 			return NULL;
2148 
2149 		spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2150 		sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2151 	} else
2152 		sk = sk_nulls_next(sk);
2153 
2154 	sk_nulls_for_each_from(sk, node) {
2155 		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2156 			goto found;
2157 	}
2158 
2159 	st->state = TCP_SEQ_STATE_TIME_WAIT;
2160 	tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2161 	goto get_tw;
2162 found:
2163 	cur = sk;
2164 out:
2165 	return cur;
2166 }
2167 
2168 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2169 {
2170 	struct tcp_iter_state *st = seq->private;
2171 	void *rc;
2172 
2173 	st->bucket = 0;
2174 	rc = established_get_first(seq);
2175 
2176 	while (rc && pos) {
2177 		rc = established_get_next(seq, rc);
2178 		--pos;
2179 	}
2180 	return rc;
2181 }
2182 
2183 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2184 {
2185 	void *rc;
2186 	struct tcp_iter_state *st = seq->private;
2187 
2188 	st->state = TCP_SEQ_STATE_LISTENING;
2189 	rc	  = listening_get_idx(seq, &pos);
2190 
2191 	if (!rc) {
2192 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2193 		rc	  = established_get_idx(seq, pos);
2194 	}
2195 
2196 	return rc;
2197 }
2198 
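/* Resume the walk at the bucket/offset recorded by the previous read instead
 * of rescanning from the start.  st->num is preserved so the printed row
 * numbering stays continuous across reads.
 */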
2199 static void *tcp_seek_last_pos(struct seq_file *seq)
2200 {
2201 	struct tcp_iter_state *st = seq->private;
2202 	int offset = st->offset;
2203 	int orig_num = st->num;
2204 	void *rc = NULL;
2205 
2206 	switch (st->state) {
2207 	case TCP_SEQ_STATE_OPENREQ:
2208 	case TCP_SEQ_STATE_LISTENING:
2209 		if (st->bucket >= INET_LHTABLE_SIZE)
2210 			break;
2211 		st->state = TCP_SEQ_STATE_LISTENING;
2212 		rc = listening_get_next(seq, NULL);
2213 		while (offset-- && rc)
2214 			rc = listening_get_next(seq, rc);
2215 		if (rc)
2216 			break;
2217 		st->bucket = 0;
2218 		/* Fallthrough */
2219 	case TCP_SEQ_STATE_ESTABLISHED:
2220 	case TCP_SEQ_STATE_TIME_WAIT:
2221 		st->state = TCP_SEQ_STATE_ESTABLISHED;
2222 		if (st->bucket > tcp_hashinfo.ehash_mask)
2223 			break;
2224 		rc = established_get_first(seq);
2225 		while (offset-- && rc)
2226 			rc = established_get_next(seq, rc);
2227 	}
2228 
2229 	st->num = orig_num;
2230 
2231 	return rc;
2232 }
2233 
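/* seq_file ->start callback.  If the requested position matches the cached
 * last_pos, pick up where the previous read stopped via tcp_seek_last_pos();
 * otherwise restart from the beginning, returning SEQ_START_TOKEN for
 * position zero.
 */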
2234 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2235 {
2236 	struct tcp_iter_state *st = seq->private;
2237 	void *rc;
2238 
2239 	if (*pos && *pos == st->last_pos) {
2240 		rc = tcp_seek_last_pos(seq);
2241 		if (rc)
2242 			goto out;
2243 	}
2244 
2245 	st->state = TCP_SEQ_STATE_LISTENING;
2246 	st->num = 0;
2247 	st->bucket = 0;
2248 	st->offset = 0;
2249 	rc = *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2250 
2251 out:
2252 	st->last_pos = *pos;
2253 	return rc;
2254 }
2255 
2256 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2257 {
2258 	struct tcp_iter_state *st = seq->private;
2259 	void *rc = NULL;
2260 
2261 	if (v == SEQ_START_TOKEN) {
2262 		rc = tcp_get_idx(seq, 0);
2263 		goto out;
2264 	}
2265 
2266 	switch (st->state) {
2267 	case TCP_SEQ_STATE_OPENREQ:
2268 	case TCP_SEQ_STATE_LISTENING:
2269 		rc = listening_get_next(seq, v);
2270 		if (!rc) {
2271 			st->state = TCP_SEQ_STATE_ESTABLISHED;
2272 			st->bucket = 0;
2273 			st->offset = 0;
2274 			rc	  = established_get_first(seq);
2275 		}
2276 		break;
2277 	case TCP_SEQ_STATE_ESTABLISHED:
2278 	case TCP_SEQ_STATE_TIME_WAIT:
2279 		rc = established_get_next(seq, v);
2280 		break;
2281 	}
2282 out:
2283 	++*pos;
2284 	st->last_pos = *pos;
2285 	return rc;
2286 }
2287 
2288 static void tcp_seq_stop(struct seq_file *seq, void *v)
2289 {
2290 	struct tcp_iter_state *st = seq->private;
2291 
2292 	switch (st->state) {
2293 	case TCP_SEQ_STATE_OPENREQ:
2294 		if (v) {
2295 			struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2296 			read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2297 		}
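		/* Fall through: the listening bucket lock is held as well. */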
2298 	case TCP_SEQ_STATE_LISTENING:
2299 		if (v != SEQ_START_TOKEN)
2300 			spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2301 		break;
2302 	case TCP_SEQ_STATE_TIME_WAIT:
2303 	case TCP_SEQ_STATE_ESTABLISHED:
2304 		if (v)
2305 			spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2306 		break;
2307 	}
2308 }
2309 
2310 static int tcp_seq_open(struct inode *inode, struct file *file)
2311 {
2312 	struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2313 	struct tcp_iter_state *s;
2314 	int err;
2315 
2316 	err = seq_open_net(inode, file, &afinfo->seq_ops,
2317 			  sizeof(struct tcp_iter_state));
2318 	if (err < 0)
2319 		return err;
2320 
2321 	s = ((struct seq_file *)file->private_data)->private;
2322 	s->family		= afinfo->family;
2323 	s->last_pos 		= 0;
2324 	return 0;
2325 }
2326 
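/* Register a per-namespace /proc/net entry (e.g. "tcp") backed by the
 * iterator above.  Callers fill in only ->show and the afinfo fields;
 * see tcp4_proc_init_net() below for the IPv4 usage.
 */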
2327 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2328 {
2329 	int rc = 0;
2330 	struct proc_dir_entry *p;
2331 
2332 	afinfo->seq_fops.open		= tcp_seq_open;
2333 	afinfo->seq_fops.read		= seq_read;
2334 	afinfo->seq_fops.llseek		= seq_lseek;
2335 	afinfo->seq_fops.release	= seq_release_net;
2336 
2337 	afinfo->seq_ops.start		= tcp_seq_start;
2338 	afinfo->seq_ops.next		= tcp_seq_next;
2339 	afinfo->seq_ops.stop		= tcp_seq_stop;
2340 
2341 	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2342 			     &afinfo->seq_fops, afinfo);
2343 	if (!p)
2344 		rc = -ENOMEM;
2345 	return rc;
2346 }
2347 EXPORT_SYMBOL(tcp_proc_register);
2348 
2349 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2350 {
2351 	proc_net_remove(net, afinfo->name);
2352 }
2353 EXPORT_SYMBOL(tcp_proc_unregister);
2354 
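/* Format one /proc/net/tcp row for an embryonic (SYN_RECV) request socket.
 * *len receives the number of characters written (via %n) so the caller can
 * pad the line to TMPSZ.
 */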
2355 static void get_openreq4(struct sock *sk, struct request_sock *req,
2356 			 struct seq_file *f, int i, int uid, int *len)
2357 {
2358 	const struct inet_request_sock *ireq = inet_rsk(req);
2359 	int ttd = req->expires - jiffies;
2360 
2361 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2362 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2363 		i,
2364 		ireq->loc_addr,
2365 		ntohs(inet_sk(sk)->inet_sport),
2366 		ireq->rmt_addr,
2367 		ntohs(ireq->rmt_port),
2368 		TCP_SYN_RECV,
2369 		0, 0, /* could print option size, but that is af dependent. */
2370 		1,    /* timers active (only the expire timer) */
2371 		jiffies_to_clock_t(ttd),
2372 		req->retrans,
2373 		uid,
2374 		0,  /* non-standard timer */
2375 		0, /* open_requests have no inode */
2376 		atomic_read(&sk->sk_refcnt),
2377 		req,
2378 		len);
2379 }
2380 
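/* Format one /proc/net/tcp row for a full socket.  The "tr" column encodes
 * the pending timer: 1 retransmit, 2 sk_timer (keepalive), 4 zero-window
 * probe, 0 none.
 */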
2381 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2382 {
2383 	int timer_active;
2384 	unsigned long timer_expires;
2385 	struct tcp_sock *tp = tcp_sk(sk);
2386 	const struct inet_connection_sock *icsk = inet_csk(sk);
2387 	struct inet_sock *inet = inet_sk(sk);
2388 	__be32 dest = inet->inet_daddr;
2389 	__be32 src = inet->inet_rcv_saddr;
2390 	__u16 destp = ntohs(inet->inet_dport);
2391 	__u16 srcp = ntohs(inet->inet_sport);
2392 	int rx_queue;
2393 
2394 	if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2395 		timer_active	= 1;
2396 		timer_expires	= icsk->icsk_timeout;
2397 	} else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2398 		timer_active	= 4;
2399 		timer_expires	= icsk->icsk_timeout;
2400 	} else if (timer_pending(&sk->sk_timer)) {
2401 		timer_active	= 2;
2402 		timer_expires	= sk->sk_timer.expires;
2403 	} else {
2404 		timer_active	= 0;
2405 		timer_expires = jiffies;
2406 	}
2407 
2408 	if (sk->sk_state == TCP_LISTEN)
2409 		rx_queue = sk->sk_ack_backlog;
2410 	else
2411 		/*
2412 		 * Because we don't lock the socket, we might find a transient negative value.
2413 		 */
2414 		rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
2415 
2416 	seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2417 			"%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2418 		i, src, srcp, dest, destp, sk->sk_state,
2419 		tp->write_seq - tp->snd_una,
2420 		rx_queue,
2421 		timer_active,
2422 		jiffies_to_clock_t(timer_expires - jiffies),
2423 		icsk->icsk_retransmits,
2424 		sock_i_uid(sk),
2425 		icsk->icsk_probes_out,
2426 		sock_i_ino(sk),
2427 		atomic_read(&sk->sk_refcnt), sk,
2428 		jiffies_to_clock_t(icsk->icsk_rto),
2429 		jiffies_to_clock_t(icsk->icsk_ack.ato),
2430 		(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2431 		tp->snd_cwnd,
2432 		tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2433 		len);
2434 }
2435 
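/* Format one /proc/net/tcp row for a TIME_WAIT socket; most columns are
 * printed as zero since such a socket carries no queues or counters, only
 * the remaining time-wait lifetime (tw_ttd).
 */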
2436 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2437 			       struct seq_file *f, int i, int *len)
2438 {
2439 	__be32 dest, src;
2440 	__u16 destp, srcp;
2441 	int ttd = tw->tw_ttd - jiffies;
2442 
2443 	if (ttd < 0)
2444 		ttd = 0;
2445 
2446 	dest  = tw->tw_daddr;
2447 	src   = tw->tw_rcv_saddr;
2448 	destp = ntohs(tw->tw_dport);
2449 	srcp  = ntohs(tw->tw_sport);
2450 
2451 	seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2452 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2453 		i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2454 		3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2455 		atomic_read(&tw->tw_refcnt), tw, len);
2456 }
2457 
2458 #define TMPSZ 150
2459 
2460 static int tcp4_seq_show(struct seq_file *seq, void *v)
2461 {
2462 	struct tcp_iter_state *st;
2463 	int len;
2464 
2465 	if (v == SEQ_START_TOKEN) {
2466 		seq_printf(seq, "%-*s\n", TMPSZ - 1,
2467 			   "  sl  local_address rem_address   st tx_queue "
2468 			   "rx_queue tr tm->when retrnsmt   uid  timeout "
2469 			   "inode");
2470 		goto out;
2471 	}
2472 	st = seq->private;
2473 
2474 	switch (st->state) {
2475 	case TCP_SEQ_STATE_LISTENING:
2476 	case TCP_SEQ_STATE_ESTABLISHED:
2477 		get_tcp4_sock(v, seq, st->num, &len);
2478 		break;
2479 	case TCP_SEQ_STATE_OPENREQ:
2480 		get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2481 		break;
2482 	case TCP_SEQ_STATE_TIME_WAIT:
2483 		get_timewait4_sock(v, seq, st->num, &len);
2484 		break;
2485 	}
2486 	seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2487 out:
2488 	return 0;
2489 }
2490 
2491 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2492 	.name		= "tcp",
2493 	.family		= AF_INET,
2494 	.seq_fops	= {
2495 		.owner		= THIS_MODULE,
2496 	},
2497 	.seq_ops	= {
2498 		.show		= tcp4_seq_show,
2499 	},
2500 };
2501 
2502 static int __net_init tcp4_proc_init_net(struct net *net)
2503 {
2504 	return tcp_proc_register(net, &tcp4_seq_afinfo);
2505 }
2506 
2507 static void __net_exit tcp4_proc_exit_net(struct net *net)
2508 {
2509 	tcp_proc_unregister(net, &tcp4_seq_afinfo);
2510 }
2511 
2512 static struct pernet_operations tcp4_net_ops = {
2513 	.init = tcp4_proc_init_net,
2514 	.exit = tcp4_proc_exit_net,
2515 };
2516 
2517 int __init tcp4_proc_init(void)
2518 {
2519 	return register_pernet_subsys(&tcp4_net_ops);
2520 }
2521 
2522 void tcp4_proc_exit(void)
2523 {
2524 	unregister_pernet_subsys(&tcp4_net_ops);
2525 }
2526 #endif /* CONFIG_PROC_FS */
2527 
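/* GRO receive hook for IPv4/TCP: verify (or accept) the hardware checksum
 * against the pseudo-header before handing the segment to the generic
 * tcp_gro_receive(); packets without a usable checksum are flushed.
 */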
2528 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2529 {
2530 	struct iphdr *iph = skb_gro_network_header(skb);
2531 
2532 	switch (skb->ip_summed) {
2533 	case CHECKSUM_COMPLETE:
2534 		if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2535 				  skb->csum)) {
2536 			skb->ip_summed = CHECKSUM_UNNECESSARY;
2537 			break;
2538 		}
2539 
2540 		/* fall through */
2541 	case CHECKSUM_NONE:
2542 		NAPI_GRO_CB(skb)->flush = 1;
2543 		return NULL;
2544 	}
2545 
2546 	return tcp_gro_receive(head, skb);
2547 }
2548 
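/* GRO complete hook: store the pseudo-header checksum in th->check and mark
 * the merged skb as SKB_GSO_TCPV4 so it can be resegmented by GSO later.
 */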
2549 int tcp4_gro_complete(struct sk_buff *skb)
2550 {
2551 	struct iphdr *iph = ip_hdr(skb);
2552 	struct tcphdr *th = tcp_hdr(skb);
2553 
2554 	th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2555 				  iph->saddr, iph->daddr, 0);
2556 	skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2557 
2558 	return tcp_gro_complete(skb);
2559 }
2560 
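/* The proto instance shared by all IPv4 TCP sockets; it wires the generic
 * TCP entry points (connect, sendmsg, recvmsg, ...) to the hash tables,
 * memory accounting and sysctl limits used throughout this file.
 */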
2561 struct proto tcp_prot = {
2562 	.name			= "TCP",
2563 	.owner			= THIS_MODULE,
2564 	.close			= tcp_close,
2565 	.connect		= tcp_v4_connect,
2566 	.disconnect		= tcp_disconnect,
2567 	.accept			= inet_csk_accept,
2568 	.ioctl			= tcp_ioctl,
2569 	.init			= tcp_v4_init_sock,
2570 	.destroy		= tcp_v4_destroy_sock,
2571 	.shutdown		= tcp_shutdown,
2572 	.setsockopt		= tcp_setsockopt,
2573 	.getsockopt		= tcp_getsockopt,
2574 	.recvmsg		= tcp_recvmsg,
2575 	.sendmsg		= tcp_sendmsg,
2576 	.sendpage		= tcp_sendpage,
2577 	.backlog_rcv		= tcp_v4_do_rcv,
2578 	.hash			= inet_hash,
2579 	.unhash			= inet_unhash,
2580 	.get_port		= inet_csk_get_port,
2581 	.enter_memory_pressure	= tcp_enter_memory_pressure,
2582 	.sockets_allocated	= &tcp_sockets_allocated,
2583 	.orphan_count		= &tcp_orphan_count,
2584 	.memory_allocated	= &tcp_memory_allocated,
2585 	.memory_pressure	= &tcp_memory_pressure,
2586 	.sysctl_mem		= sysctl_tcp_mem,
2587 	.sysctl_wmem		= sysctl_tcp_wmem,
2588 	.sysctl_rmem		= sysctl_tcp_rmem,
2589 	.max_header		= MAX_TCP_HEADER,
2590 	.obj_size		= sizeof(struct tcp_sock),
2591 	.slab_flags		= SLAB_DESTROY_BY_RCU,
2592 	.twsk_prot		= &tcp_timewait_sock_ops,
2593 	.rsk_prot		= &tcp_request_sock_ops,
2594 	.h.hashinfo		= &tcp_hashinfo,
2595 	.no_autobind		= true,
2596 #ifdef CONFIG_COMPAT
2597 	.compat_setsockopt	= compat_tcp_setsockopt,
2598 	.compat_getsockopt	= compat_tcp_getsockopt,
2599 #endif
2600 };
2601 EXPORT_SYMBOL(tcp_prot);
2602 
2603 
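/* Per-namespace init/teardown: create the kernel control socket
 * (net->ipv4.tcp_sock) used when TCP must transmit without an owning
 * socket, e.g. the resets sent by tcp_v4_send_reset().  The batch exit
 * additionally purges TIME_WAIT sockets of dying namespaces.
 */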
2604 static int __net_init tcp_sk_init(struct net *net)
2605 {
2606 	return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2607 				    PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2608 }
2609 
2610 static void __net_exit tcp_sk_exit(struct net *net)
2611 {
2612 	inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2613 }
2614 
2615 static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
2616 {
2617 	inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
2618 }
2619 
2620 static struct pernet_operations __net_initdata tcp_sk_ops = {
2621        .init	   = tcp_sk_init,
2622        .exit	   = tcp_sk_exit,
2623        .exit_batch = tcp_sk_exit_batch,
2624 };
2625 
2626 void __init tcp_v4_init(void)
2627 {
2628 	inet_hashinfo_init(&tcp_hashinfo);
2629 	if (register_pernet_subsys(&tcp_sk_ops))
2630 		panic("Failed to create the TCP control socket.\n");
2631 }
2632