1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:	$Id: tcp_ipv4.c,v 1.237.2.1 2002/01/15 08:49:49 davem Exp $
9  *
10  *		IPv4 specific functions
11  *
12  *
13  *		code split from:
14  *		linux/ipv4/tcp.c
15  *		linux/ipv4/tcp_input.c
16  *		linux/ipv4/tcp_output.c
17  *
18  *		See tcp.c for author information
19  *
20  *	This program is free software; you can redistribute it and/or
21  *      modify it under the terms of the GNU General Public License
22  *      as published by the Free Software Foundation; either version
23  *      2 of the License, or (at your option) any later version.
24  */
25 
26 /*
27  * Changes:
28  *		David S. Miller	:	New socket lookup architecture.
29  *					This code is dedicated to John Dyson.
30  *		David S. Miller :	Change semantics of established hash,
31  *					half is devoted to TIME_WAIT sockets
32  *					and the rest go in the other half.
33  *		Andi Kleen :		Add support for syncookies and fixed
34  *					some bugs: ip options weren't passed to
35  *					the TCP layer, missed a check for an ACK bit.
36  *		Andi Kleen :		Implemented fast path mtu discovery.
37  *	     				Fixed many serious bugs in the
38  *					open_request handling and moved
39  *					most of it into the af independent code.
40  *					Added tail drop and some other bugfixes.
41  *					Added new listen semantics.
42  *		Mike McLagan	:	Routing by source
43  *	Juan Jose Ciarlante:		ip_dynaddr bits
44  *		Andi Kleen:		various fixes.
45  *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
46  *	Andi Kleen		:	Fix new listen.
47  *	Andi Kleen		:	Fix accept error reporting.
48  *	YOSHIFUJI Hideaki @USAGI and:	Support IPV6_V6ONLY socket option, which
49  *	Alexey Kuznetsov		allow both IPv4 and IPv6 sockets to bind
50  *					a single port at the same time.
51  */
52 
53 #include <linux/config.h>
54 
55 #include <linux/types.h>
56 #include <linux/fcntl.h>
57 #include <linux/random.h>
58 #include <linux/cache.h>
59 #include <linux/jhash.h>
60 #include <linux/init.h>
61 
62 #include <net/icmp.h>
63 #include <net/tcp.h>
64 #include <net/ipv6.h>
65 #include <net/inet_common.h>
66 
67 #include <linux/inet.h>
68 #include <linux/stddef.h>
69 #include <linux/ipsec.h>
70 
71 extern int sysctl_ip_dynaddr;
72 extern int sysctl_ip_default_ttl;
73 int sysctl_tcp_tw_reuse = 0;
74 int sysctl_tcp_low_latency = 0;
75 
76 /* Check TCP sequence numbers in ICMP packets. */
77 #define ICMP_MIN_LENGTH 8
78 
79 /* Socket used for sending RSTs */
80 static struct inode tcp_inode;
81 static struct socket *tcp_socket=&tcp_inode.u.socket_i;
82 
83 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
84 		       struct sk_buff *skb);
85 
86 /*
87  * ALL members must be initialised to prevent gcc-2.7.2.3 miscompilation
88  */
89 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
90 	__tcp_ehash:          NULL,
91 	__tcp_bhash:          NULL,
92 	__tcp_bhash_size:     0,
93 	__tcp_ehash_size:     0,
94 	__tcp_listening_hash: { NULL, },
95 	__tcp_lhash_lock:     RW_LOCK_UNLOCKED,
96 	__tcp_lhash_users:    ATOMIC_INIT(0),
97 	__tcp_lhash_wait:
98 	  __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
99 	__tcp_portalloc_lock: SPIN_LOCK_UNLOCKED
100 };
101 
102 /*
103  * This array holds the first and last local port number.
104  * For high-usage systems, use sysctl to change this to
105  * 32768-61000
106  */
107 int sysctl_local_port_range[2] = { 1024, 4999 };
108 int tcp_port_rover = (1024 - 1);
109 
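/* Fold the connection 4-tuple into an index into the established hash
 * table; the final mask assumes tcp_ehash_size is a power of two.
 */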
110 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
111 				 __u32 faddr, __u16 fport)
112 {
113 	int h = ((laddr ^ lport) ^ (faddr ^ fport));
114 	h ^= h>>16;
115 	h ^= h>>8;
116 	return h & (tcp_ehash_size - 1);
117 }
118 
119 static __inline__ int tcp_sk_hashfn(struct sock *sk)
120 {
121 	__u32 laddr = sk->rcv_saddr;
122 	__u16 lport = sk->num;
123 	__u32 faddr = sk->daddr;
124 	__u16 fport = sk->dport;
125 
126 	return tcp_hashfn(laddr, lport, faddr, fport);
127 }
128 
129 /* Allocate and initialize a new TCP local port bind bucket.
130  * The bindhash mutex for snum's hash chain must be held here.
131  */
132 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
133 					  unsigned short snum)
134 {
135 	struct tcp_bind_bucket *tb;
136 
137 	tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
138 	if(tb != NULL) {
139 		tb->port = snum;
140 		tb->fastreuse = 0;
141 		tb->owners = NULL;
142 		if((tb->next = head->chain) != NULL)
143 			tb->next->pprev = &tb->next;
144 		head->chain = tb;
145 		tb->pprev = &head->chain;
146 	}
147 	return tb;
148 }
149 
150 /* Caller must disable local BH processing. */
151 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
152 {
153 	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(child->num)];
154 	struct tcp_bind_bucket *tb;
155 
156 	spin_lock(&head->lock);
157 	tb = (struct tcp_bind_bucket *)sk->prev;
158 	if ((child->bind_next = tb->owners) != NULL)
159 		tb->owners->bind_pprev = &child->bind_next;
160 	tb->owners = child;
161 	child->bind_pprev = &tb->owners;
162 	child->prev = (struct sock *) tb;
163 	spin_unlock(&head->lock);
164 }
165 
166 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
167 {
168 	local_bh_disable();
169 	__tcp_inherit_port(sk, child);
170 	local_bh_enable();
171 }
172 
173 static inline void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, unsigned short snum)
174 {
175 	sk->num = snum;
176 	if ((sk->bind_next = tb->owners) != NULL)
177 		tb->owners->bind_pprev = &sk->bind_next;
178 	tb->owners = sk;
179 	sk->bind_pprev = &tb->owners;
180 	sk->prev = (struct sock *) tb;
181 }
182 
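/* Walk every socket already bound to this bucket's port and decide whether
 * the new socket may share it.  A conflict exists when another owner on a
 * compatible device shares (or wildcards) the local address, unless both
 * sides set SO_REUSEADDR and the existing socket is not listening.
 * Sockets with reuse > 1 are exempt from the conflict check entirely.
 */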
183 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
184 {
185 	struct sock *sk2 = tb->owners;
186 	int sk_reuse = sk->reuse;
187 
188 	for( ; sk2 != NULL; sk2 = sk2->bind_next) {
189 		if (sk != sk2 &&
190 		    sk2->reuse <= 1 &&
191 		    !ipv6_only_sock(sk2) &&
192 		    (!sk->bound_dev_if ||
193 		     !sk2->bound_dev_if ||
194 		     sk->bound_dev_if == sk2->bound_dev_if)) {
195 			if (!sk_reuse	||
196 			    !sk2->reuse	||
197 			    sk2->state == TCP_LISTEN) {
198 				if (!sk2->rcv_saddr	||
199 				    !sk->rcv_saddr	||
200 				    (sk2->rcv_saddr == sk->rcv_saddr))
201 					break;
202 			}
203 		}
204 	}
205 	return sk2 != NULL;
206 }
207 
208 /* Obtain a reference to a local port for the given sock,
209  * if snum is zero it means select any available local port.
210  */
211 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
212 {
213 	struct tcp_bind_hashbucket *head;
214 	struct tcp_bind_bucket *tb;
215 	int ret;
216 
217 	local_bh_disable();
218 	if (snum == 0) {
219 		int low = sysctl_local_port_range[0];
220 		int high = sysctl_local_port_range[1];
221 		int remaining = (high - low) + 1;
222 		int rover;
223 
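		/* Rotate through the local port range, starting just past the
		 * port handed out last time, and stop at the first port that
		 * has no bind bucket at all; any existing bucket means the
		 * port is already in use.
		 */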
224 		spin_lock(&tcp_portalloc_lock);
225 		rover = tcp_port_rover;
226 		do {	rover++;
227 			if ((rover < low) || (rover > high))
228 				rover = low;
229 			head = &tcp_bhash[tcp_bhashfn(rover)];
230 			spin_lock(&head->lock);
231 			for (tb = head->chain; tb; tb = tb->next)
232 				if (tb->port == rover)
233 					goto next;
234 			break;
235 		next:
236 			spin_unlock(&head->lock);
237 		} while (--remaining > 0);
238 		tcp_port_rover = rover;
239 		spin_unlock(&tcp_portalloc_lock);
240 
241 		/* Exhausted local port range during search? */
242 		ret = 1;
243 		if (remaining <= 0)
244 			goto fail;
245 
246 		/* OK, here is the one we will use.  HEAD is
247 		 * non-NULL and we hold its mutex.
248 		 */
249 		snum = rover;
250 		tb = NULL;
251 	} else {
252 		head = &tcp_bhash[tcp_bhashfn(snum)];
253 		spin_lock(&head->lock);
254 		for (tb = head->chain; tb != NULL; tb = tb->next)
255 			if (tb->port == snum)
256 				break;
257 	}
258 	if (tb != NULL && tb->owners != NULL) {
259 		if (sk->reuse > 1)
260 			goto success;
261 		if (tb->fastreuse > 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
262 			goto success;
263 		} else {
264 			ret = 1;
265 			if (tcp_bind_conflict(sk, tb))
266 				goto fail_unlock;
267 		}
268 	}
269 	ret = 1;
270 	if (tb == NULL &&
271 	    (tb = tcp_bucket_create(head, snum)) == NULL)
272 			goto fail_unlock;
273 	if (tb->owners == NULL) {
274 		if (sk->reuse && sk->state != TCP_LISTEN)
275 			tb->fastreuse = 1;
276 		else
277 			tb->fastreuse = 0;
278 	} else if (tb->fastreuse &&
279 		   ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
280 		tb->fastreuse = 0;
281 success:
282 	if (sk->prev == NULL)
283 		tcp_bind_hash(sk, tb, snum);
284 	BUG_TRAP(sk->prev == (struct sock *) tb);
285  	ret = 0;
286 
287 fail_unlock:
288 	spin_unlock(&head->lock);
289 fail:
290 	local_bh_enable();
291 	return ret;
292 }
293 
294 /* Get rid of any references to a local port held by the
295  * given sock.
296  */
297 inline void __tcp_put_port(struct sock *sk)
298 {
299 	struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(sk->num)];
300 	struct tcp_bind_bucket *tb;
301 
302 	spin_lock(&head->lock);
303 	tb = (struct tcp_bind_bucket *) sk->prev;
304 	if (sk->bind_next)
305 		sk->bind_next->bind_pprev = sk->bind_pprev;
306 	*(sk->bind_pprev) = sk->bind_next;
307 	sk->prev = NULL;
308 	sk->num = 0;
309 	if (tb->owners == NULL) {
310 		if (tb->next)
311 			tb->next->pprev = tb->pprev;
312 		*(tb->pprev) = tb->next;
313 		kmem_cache_free(tcp_bucket_cachep, tb);
314 	}
315 	spin_unlock(&head->lock);
316 }
317 
318 void tcp_put_port(struct sock *sk)
319 {
320 	local_bh_disable();
321 	__tcp_put_port(sk);
322 	local_bh_enable();
323 }
324 
325 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP.
326  * Look, when several writers sleep and a reader wakes them up, all but one
327  * immediately hit the write lock and grab all the cpus. Exclusive sleep solves
328  * this, _but_ remember, it adds useless work on UP machines (a wake up on each
329  * exclusive lock release). It should really be ifdefed.
330  */
331 
332 void tcp_listen_wlock(void)
333 {
334 	write_lock(&tcp_lhash_lock);
335 
336 	if (atomic_read(&tcp_lhash_users)) {
337 		DECLARE_WAITQUEUE(wait, current);
338 
339 		add_wait_queue_exclusive(&tcp_lhash_wait, &wait);
340 		for (;;) {
341 			set_current_state(TASK_UNINTERRUPTIBLE);
342 			if (atomic_read(&tcp_lhash_users) == 0)
343 				break;
344 			write_unlock_bh(&tcp_lhash_lock);
345 			schedule();
346 			write_lock_bh(&tcp_lhash_lock);
347 		}
348 
349 		__set_current_state(TASK_RUNNING);
350 		remove_wait_queue(&tcp_lhash_wait, &wait);
351 	}
352 }
353 
354 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
355 {
356 	struct sock **skp;
357 	rwlock_t *lock;
358 
359 	BUG_TRAP(sk->pprev==NULL);
360 	if(listen_possible && sk->state == TCP_LISTEN) {
361 		skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
362 		lock = &tcp_lhash_lock;
363 		tcp_listen_wlock();
364 	} else {
365 		skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
366 		lock = &tcp_ehash[sk->hashent].lock;
367 		write_lock(lock);
368 	}
369 	if((sk->next = *skp) != NULL)
370 		(*skp)->pprev = &sk->next;
371 	*skp = sk;
372 	sk->pprev = skp;
373 	sock_prot_inc_use(sk->prot);
374 	write_unlock(lock);
375 	if (listen_possible && sk->state == TCP_LISTEN)
376 		wake_up(&tcp_lhash_wait);
377 }
378 
379 static void tcp_v4_hash(struct sock *sk)
380 {
381 	if (sk->state != TCP_CLOSE) {
382 		local_bh_disable();
383 		__tcp_v4_hash(sk, 1);
384 		local_bh_enable();
385 	}
386 }
387 
388 void tcp_unhash(struct sock *sk)
389 {
390 	rwlock_t *lock;
391 
392 	if (!sk->pprev)
393 		goto ende;
394 
395 	if (sk->state == TCP_LISTEN) {
396 		local_bh_disable();
397 		tcp_listen_wlock();
398 		lock = &tcp_lhash_lock;
399 	} else {
400 		struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent];
401 		lock = &head->lock;
402 		write_lock_bh(&head->lock);
403 	}
404 
405 	if(sk->pprev) {
406 		if(sk->next)
407 			sk->next->pprev = sk->pprev;
408 		*sk->pprev = sk->next;
409 		sk->pprev = NULL;
410 		sock_prot_dec_use(sk->prot);
411 	}
412 	write_unlock_bh(lock);
413 
414  ende:
415 	if (sk->state == TCP_LISTEN)
416 		wake_up(&tcp_lhash_wait);
417 }
418 
419 /* Don't inline this cruft.  There are some nice properties to
420  * exploit here.  The BSD API does not allow a listening TCP
421  * to specify the remote port nor the remote address for the
422  * connection.  So always assume those are both wildcarded
423  * during the search since they can never be otherwise.
424  */
425 static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr, unsigned short hnum, int dif)
426 {
427 	struct sock *result = NULL;
428 	int score, hiscore;
429 
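	/* Score each candidate: +1 for a plain PF_INET socket, +2 for an
	 * exact local address match, +2 for an exact bound-device match.
	 * A score of 5 is a perfect match and ends the search early.
	 */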
430 	hiscore=-1;
431 	for(; sk; sk = sk->next) {
432 		if(sk->num == hnum && !ipv6_only_sock(sk)) {
433 			__u32 rcv_saddr = sk->rcv_saddr;
434 
435 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
436 			score = sk->family == PF_INET ? 1 : 0;
437 #else
438 			score = 1;
439 #endif
440 			if(rcv_saddr) {
441 				if (rcv_saddr != daddr)
442 					continue;
443 				score+=2;
444 			}
445 			if (sk->bound_dev_if) {
446 				if (sk->bound_dev_if != dif)
447 					continue;
448 				score+=2;
449 			}
450 			if (score == 5)
451 				return sk;
452 			if (score > hiscore) {
453 				hiscore = score;
454 				result = sk;
455 			}
456 		}
457 	}
458 	return result;
459 }
460 
461 /* Optimize the common listener case. */
462 inline struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
463 {
464 	struct sock *sk;
465 
466 	read_lock(&tcp_lhash_lock);
467 	sk = tcp_listening_hash[tcp_lhashfn(hnum)];
468 	if (sk) {
469 		if (sk->num == hnum &&
470 		    sk->next == NULL &&
471 		    (!sk->rcv_saddr || sk->rcv_saddr == daddr) &&
472 		    (sk->family == PF_INET || !ipv6_only_sock(sk)) &&
473 		    !sk->bound_dev_if)
474 			goto sherry_cache;
475 		sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif);
476 	}
477 	if (sk) {
478 sherry_cache:
479 		sock_hold(sk);
480 	}
481 	read_unlock(&tcp_lhash_lock);
482 	return sk;
483 }
484 
485 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
486  * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
487  *
488  * Local BH must be disabled here.
489  */
490 
491 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
492 						       u32 daddr, u16 hnum, int dif)
493 {
494 	struct tcp_ehash_bucket *head;
495 	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
496 	__u32 ports = TCP_COMBINED_PORTS(sport, hnum);
497 	struct sock *sk;
498 	int hash;
499 
500 	/* Optimize here for direct hit; only listening connections can
501 	 * have wildcards anyway.
502 	 */
503 	hash = tcp_hashfn(daddr, hnum, saddr, sport);
504 	head = &tcp_ehash[hash];
505 	read_lock(&head->lock);
506 	for(sk = head->chain; sk; sk = sk->next) {
507 		if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
508 			goto hit; /* You sunk my battleship! */
509 	}
510 
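	/* TIME_WAIT buckets live in the second half of the established hash
	 * table, at head + tcp_ehash_size (see the changelog above: half of
	 * the ehash is devoted to TIME_WAIT sockets).
	 */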
511 	/* Must check for a TIME_WAIT'er before going to listener hash. */
512 	for(sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next)
513 		if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
514 			goto hit;
515 	read_unlock(&head->lock);
516 
517 	return NULL;
518 
519 hit:
520 	sock_hold(sk);
521 	read_unlock(&head->lock);
522 	return sk;
523 }
524 
525 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
526 					   u32 daddr, u16 hnum, int dif)
527 {
528 	struct sock *sk;
529 
530 	sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);
531 
532 	if (sk)
533 		return sk;
534 
535 	return tcp_v4_lookup_listener(daddr, hnum, dif);
536 }
537 
538 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
539 {
540 	struct sock *sk;
541 
542 	local_bh_disable();
543 	sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
544 	local_bh_enable();
545 
546 	return sk;
547 }
548 
549 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
550 {
551 	return secure_tcp_sequence_number(skb->nh.iph->daddr,
552 					  skb->nh.iph->saddr,
553 					  skb->h.th->dest,
554 					  skb->h.th->source);
555 }
556 
557 /* called with local bh disabled */
558 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
559 				      struct tcp_tw_bucket **twp)
560 {
561 	u32 daddr = sk->rcv_saddr;
562 	u32 saddr = sk->daddr;
563 	int dif = sk->bound_dev_if;
564 	TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
565 	__u32 ports = TCP_COMBINED_PORTS(sk->dport, lport);
566 	int hash = tcp_hashfn(daddr, lport, saddr, sk->dport);
567 	struct tcp_ehash_bucket *head = &tcp_ehash[hash];
568 	struct sock *sk2, **skp;
569 	struct tcp_tw_bucket *tw;
570 
571 	write_lock(&head->lock);
572 
573 	/* Check TIME-WAIT sockets first. */
574 	for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
575 	    skp = &sk2->next) {
576 		tw = (struct tcp_tw_bucket*)sk2;
577 
578 		if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
579 			struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
580 
581 			/* With PAWS, it is safe from the viewpoint
582 			   of data integrity. Even without PAWS it
583 			   is safe provided sequence spaces do not
584 			   overlap i.e. at data rates <= 80Mbit/sec.
585 
586 			   Actually, the idea is close to VJ's, only the
587 			   timestamp cache is held not per host but
588 			   per port pair, and the TW bucket is used
589 			   as the state holder.
590 
591 			   If the TW bucket has already been destroyed we
592 			   fall back to VJ's scheme and use the initial
593 			   timestamp retrieved from the peer table.
594 			 */
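			/* Reuse of the old pair is allowed when the TW bucket
			 * recorded a timestamp and either the caller cannot
			 * pick another port (twp == NULL) or tcp_tw_reuse is
			 * enabled and at least a second has passed.  The new
			 * write_seq is pushed beyond the old send window
			 * (snd_nxt + 65535 + 2) so that old segments cannot be
			 * mistaken for new data.
			 */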
595 			if (tw->ts_recent_stamp &&
596 			    (!twp || (sysctl_tcp_tw_reuse &&
597 				      xtime.tv_sec - tw->ts_recent_stamp > 1))) {
598 				if ((tp->write_seq = tw->snd_nxt+65535+2) == 0)
599 					tp->write_seq = 1;
600 				tp->ts_recent = tw->ts_recent;
601 				tp->ts_recent_stamp = tw->ts_recent_stamp;
602 				sock_hold(sk2);
603 				skp = &head->chain;
604 				goto unique;
605 			} else
606 				goto not_unique;
607 		}
608 	}
609 	tw = NULL;
610 
611 	/* And established part... */
612 	for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) {
613 		if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
614 			goto not_unique;
615 	}
616 
617 unique:
618 	/* Must record num and sport now. Otherwise we will see
619 	 * a socket with a funny identity in the hash table. */
620 	sk->num = lport;
621 	sk->sport = htons(lport);
622 	BUG_TRAP(sk->pprev==NULL);
623 	if ((sk->next = *skp) != NULL)
624 		(*skp)->pprev = &sk->next;
625 
626 	*skp = sk;
627 	sk->pprev = skp;
628 	sk->hashent = hash;
629 	sock_prot_inc_use(sk->prot);
630 	write_unlock(&head->lock);
631 
632 	if (twp) {
633 		*twp = tw;
634 		NET_INC_STATS_BH(TimeWaitRecycled);
635 	} else if (tw) {
636 		/* Silly. Should hash-dance instead... */
637 		tcp_tw_deschedule(tw);
638 		tcp_timewait_kill(tw);
639 		NET_INC_STATS_BH(TimeWaitRecycled);
640 
641 		tcp_tw_put(tw);
642 	}
643 
644 	return 0;
645 
646 not_unique:
647 	write_unlock(&head->lock);
648 	return -EADDRNOTAVAIL;
649 }
650 
651 /*
652  * Bind a port for a connect operation and hash it.
653  */
654 static int tcp_v4_hash_connect(struct sock *sk)
655 {
656 	unsigned short snum = sk->num;
657 	struct tcp_bind_hashbucket *head;
658 	struct tcp_bind_bucket *tb;
659 
660 	if (snum == 0) {
661 		int rover;
662 		int low = sysctl_local_port_range[0];
663 		int high = sysctl_local_port_range[1];
664 		int remaining = (high - low) + 1;
665 		struct tcp_tw_bucket *tw = NULL;
666 
667 		local_bh_disable();
668 
669 		/* TODO. Actually it is not such a bad idea to remove
670 		 * tcp_portalloc_lock before the next submission to Linus.
671 		 * As soon as we touch this place at all it is time to think.
672 		 *
673 		 * Now it protects a single _advisory_ variable, tcp_port_rover,
674 		 * hence it is mostly useless.
675 		 * Code will work nicely if we just delete it, but
676 		 * I am afraid that in the contended case it will work no better
677 		 * or even worse: another cpu will just hit the same bucket
678 		 * and spin there.
679 		 * So some cpu salt could remove both contention and
680 		 * memory pingpong. Any ideas how to do this in a nice way?
681 		 */
682 		spin_lock(&tcp_portalloc_lock);
683 		rover = tcp_port_rover;
684 
685 		do {
686 			rover++;
687 			if ((rover < low) || (rover > high))
688 				rover = low;
689 			head = &tcp_bhash[tcp_bhashfn(rover)];
690 			spin_lock(&head->lock);
691 
692 			/* Does not bother with rcv_saddr checks,
693 			 * because the established check is already
694 			 * unique enough.
695 			 */
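			/* A bucket with fastreuse >= 0 was set up via bind(),
			 * so skip that port; buckets created below for
			 * automatic port selection are tagged -1 and may be
			 * shared, provided the established-hash check finds
			 * the 4-tuple unique.
			 */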
696 			for (tb = head->chain; tb; tb = tb->next) {
697 				if (tb->port == rover) {
698 					BUG_TRAP(tb->owners != NULL);
699 					if (tb->fastreuse >= 0)
700 						goto next_port;
701 					if (!__tcp_v4_check_established(sk, rover, &tw))
702 						goto ok;
703 					goto next_port;
704 				}
705 			}
706 
707 			tb = tcp_bucket_create(head, rover);
708 			if (!tb) {
709 				spin_unlock(&head->lock);
710 				break;
711 			}
712 			tb->fastreuse = -1;
713 			goto ok;
714 
715 		next_port:
716 			spin_unlock(&head->lock);
717 		} while (--remaining > 0);
718 		tcp_port_rover = rover;
719 		spin_unlock(&tcp_portalloc_lock);
720 
721 		local_bh_enable();
722 
723 		return -EADDRNOTAVAIL;
724 
725 	ok:
726 		/* All locks still held and bhs disabled */
727 		tcp_port_rover = rover;
728 		spin_unlock(&tcp_portalloc_lock);
729 
730 		tcp_bind_hash(sk, tb, rover);
731 		if (!sk->pprev) {
732 			sk->sport = htons(rover);
733 			__tcp_v4_hash(sk, 0);
734 		}
735 		spin_unlock(&head->lock);
736 
737 		if (tw) {
738 			tcp_tw_deschedule(tw);
739 			tcp_timewait_kill(tw);
740 			tcp_tw_put(tw);
741 		}
742 
743 		local_bh_enable();
744 		return 0;
745 	}
746 
747 	head  = &tcp_bhash[tcp_bhashfn(snum)];
748 	tb  = (struct tcp_bind_bucket *)sk->prev;
749 	spin_lock_bh(&head->lock);
750 	if (tb->owners == sk && sk->bind_next == NULL) {
751 		__tcp_v4_hash(sk, 0);
752 		spin_unlock_bh(&head->lock);
753 		return 0;
754 	} else {
755 		int ret;
756 		spin_unlock(&head->lock);
757 		/* No definite answer... Walk to established hash table */
758 		ret = __tcp_v4_check_established(sk, snum, NULL);
759 		local_bh_enable();
760 		return ret;
761 	}
762 }
763 
764 /* This will initiate an outgoing connection. */
765 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
766 {
767 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
768 	struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
769 	struct rtable *rt;
770 	u32 daddr, nexthop;
771 	int tmp;
772 	int err;
773 
774 	if (addr_len < sizeof(struct sockaddr_in))
775 		return(-EINVAL);
776 
777 	if (usin->sin_family != AF_INET)
778 		return(-EAFNOSUPPORT);
779 
780 	nexthop = daddr = usin->sin_addr.s_addr;
781 	if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) {
782 		if (daddr == 0)
783 			return -EINVAL;
784 		nexthop = sk->protinfo.af_inet.opt->faddr;
785 	}
786 
787 	tmp = ip_route_connect(&rt, nexthop, sk->saddr,
788 			       RT_CONN_FLAGS(sk), sk->bound_dev_if);
789 	if (tmp < 0)
790 		return tmp;
791 
792 	if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
793 		ip_rt_put(rt);
794 		return -ENETUNREACH;
795 	}
796 
797 	__sk_dst_set(sk, &rt->u.dst);
798 	sk->route_caps = rt->u.dst.dev->features;
799 
800 	if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
801 		daddr = rt->rt_dst;
802 
803 	if (!sk->saddr)
804 		sk->saddr = rt->rt_src;
805 	sk->rcv_saddr = sk->saddr;
806 
807 	if (tp->ts_recent_stamp && sk->daddr != daddr) {
808 		/* Reset inherited state */
809 		tp->ts_recent = 0;
810 		tp->ts_recent_stamp = 0;
811 		tp->write_seq = 0;
812 	}
813 
814 	if (sysctl_tcp_tw_recycle &&
815 	    !tp->ts_recent_stamp &&
816 	    rt->rt_dst == daddr) {
817 		struct inet_peer *peer = rt_get_peer(rt);
818 
819 		/* VJ's idea. We save last timestamp seen from
820 		 * the destination in the peer table when entering state TIME-WAIT,
821 		 * and initialize ts_recent from it when trying a new connection.
822 		 */
823 
824 		if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
825 			tp->ts_recent_stamp = peer->tcp_ts_stamp;
826 			tp->ts_recent = peer->tcp_ts;
827 		}
828 	}
829 
830 	sk->dport = usin->sin_port;
831 	sk->daddr = daddr;
832 
833 	tp->ext_header_len = 0;
834 	if (sk->protinfo.af_inet.opt)
835 		tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;
836 
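	/* Start from the conservative 536-byte default MSS (RFC 1122);
	 * the clamp is updated once the peer's MSS option is parsed.
	 */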
837 	tp->mss_clamp = 536;
838 
839 	/* Socket identity is still unknown (sport may be zero).
840 	 * However we set state to SYN-SENT and, without releasing the socket
841 	 * lock, select a source port, enter ourselves into the hash tables and
842 	 * complete initialization after this.
843 	 */
844 	tcp_set_state(sk, TCP_SYN_SENT);
845 	err = tcp_v4_hash_connect(sk);
846 	if (err)
847 		goto failure;
848 
849 	if (!tp->write_seq)
850 		tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
851 							   sk->sport, usin->sin_port);
852 
853 	sk->protinfo.af_inet.id = tp->write_seq^jiffies;
854 
855 	err = tcp_connect(sk);
856 	if (err)
857 		goto failure;
858 
859 	return 0;
860 
861 failure:
862 	tcp_set_state(sk, TCP_CLOSE);
863 	__sk_dst_reset(sk);
864 	sk->route_caps = 0;
865 	sk->dport = 0;
866 	return err;
867 }
868 
869 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
870 {
871 	return ((struct rtable*)skb->dst)->rt_iif;
872 }
873 
874 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
875 {
876 	return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
877 }
878 
879 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
880 					      struct open_request ***prevp,
881 					      __u16 rport,
882 					      __u32 raddr, __u32 laddr)
883 {
884 	struct tcp_listen_opt *lopt = tp->listen_opt;
885 	struct open_request *req, **prev;
886 
887 	for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
888 	     (req = *prev) != NULL;
889 	     prev = &req->dl_next) {
890 		if (req->rmt_port == rport &&
891 		    req->af.v4_req.rmt_addr == raddr &&
892 		    req->af.v4_req.loc_addr == laddr &&
893 		    TCP_INET_FAMILY(req->class->family)) {
894 			BUG_TRAP(req->sk == NULL);
895 			*prevp = prev;
896 			return req;
897 		}
898 	}
899 
900 	return NULL;
901 }
902 
903 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
904 {
905 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
906 	struct tcp_listen_opt *lopt = tp->listen_opt;
907 	u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
908 
909 	req->expires = jiffies + TCP_TIMEOUT_INIT;
910 	req->retrans = 0;
911 	req->sk = NULL;
912 	req->dl_next = lopt->syn_table[h];
913 
914 	write_lock(&tp->syn_wait_lock);
915 	lopt->syn_table[h] = req;
916 	write_unlock(&tp->syn_wait_lock);
917 
918 	tcp_synq_added(sk);
919 }
920 
921 
922 /*
923  * This routine does path mtu discovery as defined in RFC1191.
924  */
925 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
926 {
927 	struct dst_entry *dst;
928 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
929 
930 	/* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
931 	 * sent out by Linux are always < 576 bytes so they should go through
932 	 * unfragmented).
933 	 */
934 	if (sk->state == TCP_LISTEN)
935 		return;
936 
937 	/* We don't check in the dst entry if pmtu discovery is forbidden
938 	 * on this route. We just assume that no packet-too-big packets
939 	 * are sent back when pmtu discovery is not active.
940 	 * There is a small race when the user changes this flag in the
941 	 * route, but I think that's acceptable.
942 	 */
943 	if ((dst = __sk_dst_check(sk, 0)) == NULL)
944 		return;
945 
946 	ip_rt_update_pmtu(dst, mtu);
947 
948 	/* Something is about to go wrong... Remember the soft error
949 	 * in case this connection is not able to recover.
950 	 */
951 	if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
952 		sk->err_soft = EMSGSIZE;
953 
954 	if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT &&
955 	    tp->pmtu_cookie > dst->pmtu) {
956 		tcp_sync_mss(sk, dst->pmtu);
957 
958 		/* Resend the TCP packet because it's
959 		 * clear that the old packet has been
960 		 * dropped. This is the new "fast" path mtu
961 		 * discovery.
962 		 */
963 		tcp_simple_retransmit(sk);
964 	} /* else let the usual retransmit timer handle it */
965 }
966 
967 /*
968  * This routine is called by the ICMP module when it gets some
969  * sort of error condition.  If err < 0 then the socket should
970  * be closed and the error returned to the user.  If err > 0
971  * it's just the icmp type << 8 | icmp code.  After adjustment
972  * header points to the first 8 bytes of the tcp header.  We need
973  * to find the appropriate port.
974  *
975  * The locking strategy used here is very "optimistic". When
976  * someone else accesses the socket the ICMP is just dropped
977  * and for some paths there is no check at all.
978  * A more general error queue to queue errors for later handling
979  * is probably better.
980  *
981  */
982 
983 void tcp_v4_err(struct sk_buff *skb, u32 info)
984 {
985 	struct iphdr *iph = (struct iphdr*)skb->data;
986 	struct tcphdr *th = (struct tcphdr*)(skb->data+(iph->ihl<<2));
987 	struct tcp_opt *tp;
988 	int type = skb->h.icmph->type;
989 	int code = skb->h.icmph->code;
990 	struct sock *sk;
991 	__u32 seq;
992 	int err;
993 
994 	if (skb->len < (iph->ihl << 2) + 8) {
995 		ICMP_INC_STATS_BH(IcmpInErrors);
996 		return;
997 	}
998 
999 	sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb));
1000 	if (sk == NULL) {
1001 		ICMP_INC_STATS_BH(IcmpInErrors);
1002 		return;
1003 	}
1004 	if (sk->state == TCP_TIME_WAIT) {
1005 		tcp_tw_put((struct tcp_tw_bucket*)sk);
1006 		return;
1007 	}
1008 
1009 	bh_lock_sock(sk);
1010 	/* If too many ICMPs get dropped on busy
1011 	 * servers this needs to be solved differently.
1012 	 */
1013 	if (sk->lock.users != 0)
1014 		NET_INC_STATS_BH(LockDroppedIcmps);
1015 
1016 	if (sk->state == TCP_CLOSE)
1017 		goto out;
1018 
1019 	tp = &sk->tp_pinfo.af_tcp;
1020 	seq = ntohl(th->seq);
1021 	if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
1022 		NET_INC_STATS(OutOfWindowIcmps);
1023 		goto out;
1024 	}
1025 
1026 	switch (type) {
1027 	case ICMP_SOURCE_QUENCH:
1028 		/* Just silently ignore these. */
1029 		goto out;
1030 	case ICMP_PARAMETERPROB:
1031 		err = EPROTO;
1032 		break;
1033 	case ICMP_DEST_UNREACH:
1034 		if (code > NR_ICMP_UNREACH)
1035 			goto out;
1036 
1037 		if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1038 			if (sk->lock.users == 0)
1039 				do_pmtu_discovery(sk, iph, info);
1040 			goto out;
1041 		}
1042 
1043 		err = icmp_err_convert[code].errno;
1044 		break;
1045 	case ICMP_TIME_EXCEEDED:
1046 		err = EHOSTUNREACH;
1047 		break;
1048 	default:
1049 		goto out;
1050 	}
1051 
1052 	switch (sk->state) {
1053 		struct open_request *req, **prev;
1054 	case TCP_LISTEN:
1055 		if (sk->lock.users != 0)
1056 			goto out;
1057 
1058 		req = tcp_v4_search_req(tp, &prev,
1059 					th->dest,
1060 					iph->daddr, iph->saddr);
1061 		if (!req)
1062 			goto out;
1063 
1064 		/* ICMPs are not backlogged, hence we cannot get
1065 		   an established socket here.
1066 		 */
1067 		BUG_TRAP(req->sk == NULL);
1068 
1069 		if (seq != req->snt_isn) {
1070 			NET_INC_STATS_BH(OutOfWindowIcmps);
1071 			goto out;
1072 		}
1073 
1074 		/*
1075 		 * Still in SYN_RECV, just remove it silently.
1076 		 * There is no good way to pass the error to the newly
1077 		 * created socket, and POSIX does not want network
1078 		 * errors returned from accept().
1079 		 */
1080 		tcp_synq_drop(sk, req, prev);
1081 		goto out;
1082 
1083 	case TCP_SYN_SENT:
1084 	case TCP_SYN_RECV:  /* Cannot happen.
1085 			       It can happen, for example, if SYNs crossed.
1086 			     */
1087 		if (sk->lock.users == 0) {
1088 			TCP_INC_STATS_BH(TcpAttemptFails);
1089 			sk->err = err;
1090 
1091 			sk->error_report(sk);
1092 
1093 			tcp_done(sk);
1094 		} else {
1095 			sk->err_soft = err;
1096 		}
1097 		goto out;
1098 	}
1099 
1100 	/* If we've already connected we will keep trying
1101 	 * until we time out, or the user gives up.
1102 	 *
1103 	 * rfc1122 4.2.3.9 allows us to consider only PROTO_UNREACH and
1104 	 * PORT_UNREACH as hard errors (well, FRAG_FAILED too,
1105 	 * but it is obsoleted by pmtu discovery).
1106 	 *
1107 	 * Note that in the modern internet, where routing is unreliable
1108 	 * and broken firewalls sit in every dark corner sending random
1109 	 * errors ordered by their masters, even these two messages finally lose
1110 	 * their original sense (even Linux sends invalid PORT_UNREACHs).
1111 	 *
1112 	 * Now we are in compliance with RFCs.
1113 	 *							--ANK (980905)
1114 	 */
1115 
1116 	if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) {
1117 		sk->err = err;
1118 		sk->error_report(sk);
1119 	} else	{ /* Only an error on timeout */
1120 		sk->err_soft = err;
1121 	}
1122 
1123 out:
1124 	bh_unlock_sock(sk);
1125 	sock_put(sk);
1126 }
1127 
1128 /* This routine computes an IPv4 TCP checksum. */
1129 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1130 		       struct sk_buff *skb)
1131 {
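	/* With hardware checksum offload, seed the checksum field with the
	 * complemented pseudo-header sum and record the offset of the
	 * checksum field in skb->csum so the device can finish the job;
	 * otherwise compute the complete checksum in software.
	 */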
1132 	if (skb->ip_summed == CHECKSUM_HW) {
1133 		th->check = ~tcp_v4_check(th, len, sk->saddr, sk->daddr, 0);
1134 		skb->csum = offsetof(struct tcphdr, check);
1135 	} else {
1136 		th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
1137 					 csum_partial((char *)th, th->doff<<2, skb->csum));
1138 	}
1139 }
1140 
1141 /*
1142  *	This routine will send an RST to the other tcp.
1143  *
1144  *	Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1145  *		      for the reset?
1146  *	Answer: if a packet caused the RST, it is not for a socket
1147  *		existing in our system; if it is matched to a socket,
1148  *		it is just a duplicate segment or a bug in the other side's TCP.
1149  *		So we build the reply based only on the parameters
1150  *		that arrived with the segment.
1151  *	Exception: precedence violation. We do not implement it in any case.
1152  */
1153 
1154 static void tcp_v4_send_reset(struct sk_buff *skb)
1155 {
1156 	struct tcphdr *th = skb->h.th;
1157 	struct tcphdr rth;
1158 	struct ip_reply_arg arg;
1159 
1160 	/* Never send a reset in response to a reset. */
1161 	if (th->rst)
1162 		return;
1163 
1164 	if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL)
1165 		return;
1166 
1167 	/* Swap the send and the receive. */
1168 	memset(&rth, 0, sizeof(struct tcphdr));
1169 	rth.dest = th->source;
1170 	rth.source = th->dest;
1171 	rth.doff = sizeof(struct tcphdr)/4;
1172 	rth.rst = 1;
1173 
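	/* If the offending segment carried an ACK, the RST takes that ACK
	 * value as its own sequence number and needs no ACK itself;
	 * otherwise the RST ACKs exactly the sequence space the segment
	 * occupied (data length plus SYN/FIN flags).
	 */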
1174 	if (th->ack) {
1175 		rth.seq = th->ack_seq;
1176 	} else {
1177 		rth.ack = 1;
1178 		rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin
1179 				    + skb->len - (th->doff<<2));
1180 	}
1181 
1182 	memset(&arg, 0, sizeof arg);
1183 	arg.iov[0].iov_base = (unsigned char *)&rth;
1184 	arg.iov[0].iov_len  = sizeof rth;
1185 	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1186 				      skb->nh.iph->saddr, /*XXX*/
1187 				      sizeof(struct tcphdr),
1188 				      IPPROTO_TCP,
1189 				      0);
1190 	arg.n_iov = 1;
1191 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1192 
1193 	tcp_socket->sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl;
1194 	ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1195 
1196 	TCP_INC_STATS_BH(TcpOutSegs);
1197 	TCP_INC_STATS_BH(TcpOutRsts);
1198 }
1199 
1200 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1201    outside of socket context, is certainly ugly. What can I do?
1202  */
1203 
1204 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts)
1205 {
1206 	struct tcphdr *th = skb->h.th;
1207 	struct {
1208 		struct tcphdr th;
1209 		u32 tsopt[3];
1210 	} rep;
1211 	struct ip_reply_arg arg;
1212 
1213 	memset(&rep.th, 0, sizeof(struct tcphdr));
1214 	memset(&arg, 0, sizeof arg);
1215 
1216 	arg.iov[0].iov_base = (unsigned char *)&rep;
1217 	arg.iov[0].iov_len  = sizeof(rep.th);
1218 	arg.n_iov = 1;
1219 	if (ts) {
1220 		rep.tsopt[0] = htonl((TCPOPT_NOP << 24) |
1221 				     (TCPOPT_NOP << 16) |
1222 				     (TCPOPT_TIMESTAMP << 8) |
1223 				     TCPOLEN_TIMESTAMP);
1224 		rep.tsopt[1] = htonl(tcp_time_stamp);
1225 		rep.tsopt[2] = htonl(ts);
1226 		arg.iov[0].iov_len = sizeof(rep);
1227 	}
1228 
1229 	/* Swap the send and the receive. */
1230 	rep.th.dest = th->source;
1231 	rep.th.source = th->dest;
1232 	rep.th.doff = arg.iov[0].iov_len/4;
1233 	rep.th.seq = htonl(seq);
1234 	rep.th.ack_seq = htonl(ack);
1235 	rep.th.ack = 1;
1236 	rep.th.window = htons(win);
1237 
1238 	arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1239 				      skb->nh.iph->saddr, /*XXX*/
1240 				      arg.iov[0].iov_len,
1241 				      IPPROTO_TCP,
1242 				      0);
1243 	arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1244 
1245 	ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1246 
1247 	TCP_INC_STATS_BH(TcpOutSegs);
1248 }
1249 
1250 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1251 {
1252 	struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1253 
1254 	tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt,
1255 			tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent);
1256 
1257 	tcp_tw_put(tw);
1258 }
1259 
1260 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1261 {
1262 	tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd,
1263 			req->ts_recent);
1264 }
1265 
1266 static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req)
1267 {
1268 	struct rtable *rt;
1269 	struct ip_options *opt;
1270 
1271 	opt = req->af.v4_req.opt;
1272 	if(ip_route_output(&rt, ((opt && opt->srr) ?
1273 				 opt->faddr :
1274 				 req->af.v4_req.rmt_addr),
1275 			   req->af.v4_req.loc_addr,
1276 			   RT_CONN_FLAGS(sk), sk->bound_dev_if)) {
1277 		IP_INC_STATS_BH(IpOutNoRoutes);
1278 		return NULL;
1279 	}
1280 	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1281 		ip_rt_put(rt);
1282 		IP_INC_STATS_BH(IpOutNoRoutes);
1283 		return NULL;
1284 	}
1285 	return &rt->u.dst;
1286 }
1287 
1288 /*
1289  *	Send a SYN-ACK after having received an ACK.
1290  *	This still operates on a open_request only, not on a big
1291  *	socket.
1292  */
1293 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1294 			      struct dst_entry *dst)
1295 {
1296 	int err = -1;
1297 	struct sk_buff * skb;
1298 
1299 	/* First, grab a route. */
1300 	if (dst == NULL &&
1301 	    (dst = tcp_v4_route_req(sk, req)) == NULL)
1302 		goto out;
1303 
1304 	skb = tcp_make_synack(sk, dst, req);
1305 
1306 	if (skb) {
1307 		struct tcphdr *th = skb->h.th;
1308 
1309 		th->check = tcp_v4_check(th, skb->len,
1310 					 req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
1311 					 csum_partial((char *)th, skb->len, skb->csum));
1312 
1313 		err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1314 					    req->af.v4_req.rmt_addr, req->af.v4_req.opt);
1315 		if (err == NET_XMIT_CN)
1316 			err = 0;
1317 	}
1318 
1319 out:
1320 	dst_release(dst);
1321 	return err;
1322 }
1323 
1324 /*
1325  *	IPv4 open_request destructor.
1326  */
1327 static void tcp_v4_or_free(struct open_request *req)
1328 {
1329 	if (req->af.v4_req.opt)
1330 		kfree(req->af.v4_req.opt);
1331 }
1332 
1333 static inline void syn_flood_warning(struct sk_buff *skb)
1334 {
1335 	static unsigned long warntime;
1336 
1337 	if (jiffies - warntime > HZ*60) {
1338 		warntime = jiffies;
1339 		printk(KERN_INFO
1340 		       "possible SYN flooding on port %d. Sending cookies.\n",
1341 		       ntohs(skb->h.th->dest));
1342 	}
1343 }
1344 
1345 /*
1346  * Save and compile IPv4 options into the open_request if needed.
1347  */
1348 static inline struct ip_options *
1349 tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
1350 {
1351 	struct ip_options *opt = &(IPCB(skb)->opt);
1352 	struct ip_options *dopt = NULL;
1353 
1354 	if (opt && opt->optlen) {
1355 		int opt_size = optlength(opt);
1356 		dopt = kmalloc(opt_size, GFP_ATOMIC);
1357 		if (dopt) {
1358 			if (ip_options_echo(dopt, skb)) {
1359 				kfree(dopt);
1360 				dopt = NULL;
1361 			}
1362 		}
1363 	}
1364 	return dopt;
1365 }
1366 
1367 /*
1368  * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1369  * One SYN_RECV socket costs about 80 bytes on a 32bit machine.
1370  * It would be better to replace it with a global counter for all sockets
1371  * but then some measure against one socket starving all other sockets
1372  * would be needed.
1373  *
1374  * It was 128 by default. Experiments with real servers show that
1375  * it is absolutely not enough even at 100 conn/sec. 256 cures most
1376  * of the problems. This value is adjusted to 128 for very small machines
1377  * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
1378  * Increasing it further requires changing the hash table size.
1379  */
1380 int sysctl_max_syn_backlog = 256;
1381 
1382 struct or_calltable or_ipv4 = {
1383 	PF_INET,
1384 	tcp_v4_send_synack,
1385 	tcp_v4_or_send_ack,
1386 	tcp_v4_or_free,
1387 	tcp_v4_send_reset
1388 };
1389 
1390 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1391 {
1392 	struct tcp_opt tp;
1393 	struct open_request *req;
1394 	__u32 saddr = skb->nh.iph->saddr;
1395 	__u32 daddr = skb->nh.iph->daddr;
1396 	__u32 isn = TCP_SKB_CB(skb)->when;
1397 	struct dst_entry *dst = NULL;
1398 #ifdef CONFIG_SYN_COOKIES
1399 	int want_cookie = 0;
1400 #else
1401 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1402 #endif
1403 
1404 	/* Never answer SYNs sent to broadcast or multicast */
1405 	if (((struct rtable *)skb->dst)->rt_flags &
1406 	    (RTCF_BROADCAST|RTCF_MULTICAST))
1407 		goto drop;
1408 
1409 	/* TW buckets are converted to open requests without
1410 	 * limitation: they conserve resources and the peer is
1411 	 * evidently a real one.
1412 	 */
1413 	if (tcp_synq_is_full(sk) && !isn) {
1414 #ifdef CONFIG_SYN_COOKIES
1415 		if (sysctl_tcp_syncookies) {
1416 			want_cookie = 1;
1417 		} else
1418 #endif
1419 		goto drop;
1420 	}
1421 
1422 	/* Accept backlog is full. If we have already queued enough
1423 	 * warm entries in the syn queue, drop the request. It is better than
1424 	 * clogging the syn queue with openreqs with exponentially increasing
1425 	 * timeout.
1426 	 */
1427 	if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1428 		goto drop;
1429 
1430 	req = tcp_openreq_alloc();
1431 	if (req == NULL)
1432 		goto drop;
1433 
1434 	tcp_clear_options(&tp);
1435 	tp.mss_clamp = 536;
1436 	tp.user_mss = sk->tp_pinfo.af_tcp.user_mss;
1437 
1438 	tcp_parse_options(skb, &tp, 0);
1439 
1440 	if (want_cookie) {
1441 		tcp_clear_options(&tp);
1442 		tp.saw_tstamp = 0;
1443 	}
1444 
1445 	if (tp.saw_tstamp && tp.rcv_tsval == 0) {
1446 		/* Some OSes (unknown ones, but I see them on a web server, which
1447 		 * contains information interesting only for Windows
1448 		 * users) do not send their timestamp in the SYN. It is an easy case.
1449 		 * We simply do not advertise TS support.
1450 		 */
1451 		tp.saw_tstamp = 0;
1452 		tp.tstamp_ok = 0;
1453 	}
1454 	tp.tstamp_ok = tp.saw_tstamp;
1455 
1456 	tcp_openreq_init(req, &tp, skb);
1457 
1458 	req->af.v4_req.loc_addr = daddr;
1459 	req->af.v4_req.rmt_addr = saddr;
1460 	req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1461 	req->class = &or_ipv4;
1462 	if (!want_cookie)
1463 		TCP_ECN_create_request(req, skb->h.th);
1464 
1465 	if (want_cookie) {
1466 #ifdef CONFIG_SYN_COOKIES
1467 		syn_flood_warning(skb);
1468 #endif
1469 		isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1470 	} else if (isn == 0) {
1471 		struct inet_peer *peer = NULL;
1472 
1473 		/* VJ's idea. We save last timestamp seen
1474 		 * from the destination in peer table, when entering
1475 		 * state TIME-WAIT, and check against it before
1476 		 * accepting new connection request.
1477 		 *
1478 		 * If "isn" is not zero, this request hit alive
1479 		 * timewait bucket, so that all the necessary checks
1480 		 * are made in the function processing timewait state.
1481 		 */
1482 		if (tp.saw_tstamp &&
1483 		    sysctl_tcp_tw_recycle &&
1484 		    (dst = tcp_v4_route_req(sk, req)) != NULL &&
1485 		    (peer = rt_get_peer((struct rtable*)dst)) != NULL &&
1486 		    peer->v4daddr == saddr) {
1487 			if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1488 			    (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) {
1489 				NET_INC_STATS_BH(PAWSPassiveRejected);
1490 				dst_release(dst);
1491 				goto drop_and_free;
1492 			}
1493 		}
1494 		/* Kill the following clause, if you dislike this way. */
1495 		else if (!sysctl_tcp_syncookies &&
1496 			 (sysctl_max_syn_backlog - tcp_synq_len(sk)
1497 			  < (sysctl_max_syn_backlog>>2)) &&
1498 			 (!peer || !peer->tcp_ts_stamp) &&
1499 			 (!dst || !dst->rtt)) {
1500 			/* Without syncookies the last quarter of the
1501 			 * backlog is filled only with destinations proven to be alive.
1502 			 * It means that we keep communicating with
1503 			 * destinations that were already remembered
1504 			 * before the moment of synflood.
1505 			 */
1506 			NETDEBUG(if (net_ratelimit()) \
1507 				printk(KERN_DEBUG "TCP: drop open request from %u.%u.%u.%u/%u\n", \
1508 					NIPQUAD(saddr), ntohs(skb->h.th->source)));
1509 			dst_release(dst);
1510 			goto drop_and_free;
1511 		}
1512 
1513 		isn = tcp_v4_init_sequence(sk, skb);
1514 	}
1515 	req->snt_isn = isn;
1516 
1517 	if (tcp_v4_send_synack(sk, req, dst))
1518 		goto drop_and_free;
1519 
1520 	if (want_cookie) {
1521 	   	tcp_openreq_free(req);
1522 	} else {
1523 		tcp_v4_synq_add(sk, req);
1524 	}
1525 	return 0;
1526 
1527 drop_and_free:
1528 	tcp_openreq_free(req);
1529 drop:
1530 	TCP_INC_STATS_BH(TcpAttemptFails);
1531 	return 0;
1532 }
1533 
1534 
1535 /*
1536  * The three way handshake has completed - we got a valid synack -
1537  * now create the new socket.
1538  */
1539 struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1540 				   struct open_request *req,
1541 				   struct dst_entry *dst)
1542 {
1543 	struct tcp_opt *newtp;
1544 	struct sock *newsk;
1545 
1546 	if (tcp_acceptq_is_full(sk))
1547 		goto exit_overflow;
1548 
1549 	if (dst == NULL &&
1550 	    (dst = tcp_v4_route_req(sk, req)) == NULL)
1551 		goto exit;
1552 
1553 	newsk = tcp_create_openreq_child(sk, req, skb);
1554 	if (!newsk)
1555 		goto exit;
1556 
1557 	newsk->dst_cache = dst;
1558 	newsk->route_caps = dst->dev->features;
1559 
1560 	newtp = &(newsk->tp_pinfo.af_tcp);
1561 	newsk->daddr = req->af.v4_req.rmt_addr;
1562 	newsk->saddr = req->af.v4_req.loc_addr;
1563 	newsk->rcv_saddr = req->af.v4_req.loc_addr;
1564 	newsk->protinfo.af_inet.opt = req->af.v4_req.opt;
1565 	req->af.v4_req.opt = NULL;
1566 	newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb);
1567 	newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl;
1568 	newtp->ext_header_len = 0;
1569 	if (newsk->protinfo.af_inet.opt)
1570 		newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen;
1571 	newsk->protinfo.af_inet.id = newtp->write_seq^jiffies;
1572 
1573 	tcp_sync_mss(newsk, dst->pmtu);
1574 	newtp->advmss = dst->advmss;
1575 	tcp_initialize_rcv_mss(newsk);
1576 
1577 	__tcp_v4_hash(newsk, 0);
1578 	__tcp_inherit_port(sk, newsk);
1579 
1580 	return newsk;
1581 
1582 exit_overflow:
1583 	NET_INC_STATS_BH(ListenOverflows);
1584 exit:
1585 	NET_INC_STATS_BH(ListenDrops);
1586 	dst_release(dst);
1587 	return NULL;
1588 }
1589 
1590 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1591 {
1592 	struct open_request *req, **prev;
1593 	struct tcphdr *th = skb->h.th;
1594 	struct iphdr *iph = skb->nh.iph;
1595 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1596 	struct sock *nsk;
1597 
1598 	/* Find possible connection requests. */
1599 	req = tcp_v4_search_req(tp, &prev,
1600 				th->source,
1601 				iph->saddr, iph->daddr);
1602 	if (req)
1603 		return tcp_check_req(sk, skb, req, prev);
1604 
1605 	nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1606 					  th->source,
1607 					  skb->nh.iph->daddr,
1608 					  ntohs(th->dest),
1609 					  tcp_v4_iif(skb));
1610 
1611 	if (nsk) {
1612 		if (nsk->state != TCP_TIME_WAIT) {
1613 			bh_lock_sock(nsk);
1614 			return nsk;
1615 		}
1616 		tcp_tw_put((struct tcp_tw_bucket*)nsk);
1617 		return NULL;
1618 	}
1619 
1620 #ifdef CONFIG_SYN_COOKIES
1621 	if (!th->rst && !th->syn && th->ack)
1622 		sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1623 #endif
1624 	return sk;
1625 }
1626 
1627 static int tcp_v4_checksum_init(struct sk_buff *skb)
1628 {
1629 	if (skb->ip_summed == CHECKSUM_HW) {
1630 		skb->ip_summed = CHECKSUM_UNNECESSARY;
1631 		if (!tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1632 				  skb->nh.iph->daddr,skb->csum))
1633 			return 0;
1634 
1635 		NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1636 		skb->ip_summed = CHECKSUM_NONE;
1637 	}
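	/* For short segments it is cheapest to verify the checksum right
	 * away; longer segments only get the complemented pseudo-header sum
	 * stored in skb->csum here and are fully verified later by
	 * tcp_checksum_complete(), once the data is actually needed.
	 */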
1638 	if (skb->len <= 76) {
1639 		if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1640 				 skb->nh.iph->daddr,
1641 				 skb_checksum(skb, 0, skb->len, 0)))
1642 			return -1;
1643 		skb->ip_summed = CHECKSUM_UNNECESSARY;
1644 	} else {
1645 		skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1646 					  skb->nh.iph->daddr,0);
1647 	}
1648 	return 0;
1649 }
1650 
1651 
1652 /* The socket must have its spinlock held when we get
1653  * here.
1654  *
1655  * We have a potential double-lock case here, so even when
1656  * doing backlog processing we use the BH locking scheme.
1657  * This is because we cannot sleep with the original spinlock
1658  * held.
1659  */
1660 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1661 {
1662   	IP_INC_STATS_BH(IpInDelivers);
1663 
1664 	if (sk->state == TCP_ESTABLISHED) { /* Fast path */
1665 		TCP_CHECK_TIMER(sk);
1666 		if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1667 			goto reset;
1668 		TCP_CHECK_TIMER(sk);
1669 		return 0;
1670 	}
1671 
1672 	if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb))
1673 		goto csum_err;
1674 
1675 	if (sk->state == TCP_LISTEN) {
1676 		struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1677 		if (!nsk)
1678 			goto discard;
1679 
1680 		if (nsk != sk) {
1681 			if (tcp_child_process(sk, nsk, skb))
1682 				goto reset;
1683 			return 0;
1684 		}
1685 	}
1686 
1687 	TCP_CHECK_TIMER(sk);
1688 	if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1689 		goto reset;
1690 	TCP_CHECK_TIMER(sk);
1691 	return 0;
1692 
1693 reset:
1694 	tcp_v4_send_reset(skb);
1695 discard:
1696 	kfree_skb(skb);
1697 	/* Be careful here. If this function gets more complicated and
1698 	 * gcc suffers from register pressure on the x86, sk (in %ebx)
1699 	 * might be destroyed here. This current version compiles correctly,
1700 	 * but you have been warned.
1701 	 */
1702 	return 0;
1703 
1704 csum_err:
1705 	TCP_INC_STATS_BH(TcpInErrs);
1706 	goto discard;
1707 }
1708 
1709 /*
1710  *	From tcp_input.c
1711  */
1712 
1713 int tcp_v4_rcv(struct sk_buff *skb)
1714 {
1715 	struct tcphdr *th;
1716 	struct sock *sk;
1717 	int ret;
1718 
1719 	if (skb->pkt_type!=PACKET_HOST)
1720 		goto discard_it;
1721 
1722 	/* Count it even if it's bad */
1723 	TCP_INC_STATS_BH(TcpInSegs);
1724 
1725 	if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1726 		goto discard_it;
1727 
1728 	th = skb->h.th;
1729 
1730 	if (th->doff < sizeof(struct tcphdr)/4)
1731 		goto bad_packet;
1732 	if (!pskb_may_pull(skb, th->doff*4))
1733 		goto discard_it;
1734 
1735 	/* An explanation is required here, I think.
1736 	 * Packet length and doff are validated by header prediction,
1737 	 * provided the case of th->doff==0 is eliminated.
1738 	 * So, we defer the checks. */
1739 	if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1740 	     tcp_v4_checksum_init(skb) < 0))
1741 		goto bad_packet;
1742 
1743 	th = skb->h.th;
1744 	TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1745 	TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1746 				    skb->len - th->doff*4);
1747 	TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1748 	TCP_SKB_CB(skb)->when = 0;
1749 	TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1750 	TCP_SKB_CB(skb)->sacked = 0;
1751 
1752 	sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1753 			     skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1754 
1755 	if (!sk)
1756 		goto no_tcp_socket;
1757 
1758 process:
1759 	if(!ipsec_sk_policy(sk,skb))
1760 		goto discard_and_relse;
1761 
1762 	if (sk->state == TCP_TIME_WAIT)
1763 		goto do_time_wait;
1764 
1765 	if (sk_filter(sk, skb, 0))
1766 		goto discard_and_relse;
1767 
1768 	skb->dev = NULL;
1769 
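	/* If no user context owns the socket, try to queue the segment on
	 * the prequeue for the receiving process to handle, falling back to
	 * full receive processing here; if the socket is locked by a user,
	 * park the segment on the backlog for release_sock() to replay.
	 */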
1770 	bh_lock_sock(sk);
1771 	ret = 0;
1772 	if (!sk->lock.users) {
1773 		if (!tcp_prequeue(sk, skb))
1774 			ret = tcp_v4_do_rcv(sk, skb);
1775 	} else
1776 		sk_add_backlog(sk, skb);
1777 	bh_unlock_sock(sk);
1778 
1779 	sock_put(sk);
1780 
1781 	return ret;
1782 
1783 no_tcp_socket:
1784 	if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1785 bad_packet:
1786 		TCP_INC_STATS_BH(TcpInErrs);
1787 	} else {
1788 		tcp_v4_send_reset(skb);
1789 	}
1790 
1791 discard_it:
1792 	/* Discard frame. */
1793 	kfree_skb(skb);
1794   	return 0;
1795 
1796 discard_and_relse:
1797 	sock_put(sk);
1798 	goto discard_it;
1799 
1800 do_time_wait:
1801 	if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1802 		TCP_INC_STATS_BH(TcpInErrs);
1803 		tcp_tw_put((struct tcp_tw_bucket *) sk);
1804 		goto discard_it;
1805 	}
1806 	switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1807 					  skb, th, skb->len)) {
1808 	case TCP_TW_SYN:
1809 	{
1810 		struct sock *sk2;
1811 
1812 		sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1813 		if (sk2 != NULL) {
1814 			tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1815 			tcp_timewait_kill((struct tcp_tw_bucket *)sk);
1816 			tcp_tw_put((struct tcp_tw_bucket *)sk);
1817 			sk = sk2;
1818 			goto process;
1819 		}
1820 		/* Fall through to ACK */
1821 	}
1822 	case TCP_TW_ACK:
1823 		tcp_v4_timewait_ack(sk, skb);
1824 		break;
1825 	case TCP_TW_RST:
1826 		goto no_tcp_socket;
1827 	case TCP_TW_SUCCESS:;
1828 	}
1829 	goto discard_it;
1830 }
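/* Illustrative sketch of the user-context side of the dispatch above
 * (not part of this file): when sk->lock.users is non-zero the segment
 * is parked on the backlog, and the usual lock_sock()/release_sock()
 * pairing drains it later, roughly:
 *
 *	lock_sock(sk);			// sk->lock.users becomes non-zero
 *	...				// e.g. tcp_recvmsg copies data out
 *	release_sock(sk);		// walks sk->backlog and feeds each
 *					// skb to sk->backlog_rcv, which is
 *					// tcp_v4_do_rcv for TCP (see tcp_prot)
 *
 * so every segment queued by sk_add_backlog() is eventually processed
 * in process context.
 */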
1831 
1832 /* With per-bucket locks this operation is not atomic, so this
1833  * version is no worse.
1834  */
1835 static void __tcp_v4_rehash(struct sock *sk)
1836 {
1837 	sk->prot->unhash(sk);
1838 	sk->prot->hash(sk);
1839 }
1840 
1841 static int tcp_v4_reselect_saddr(struct sock *sk)
1842 {
1843 	int err;
1844 	struct rtable *rt;
1845 	__u32 old_saddr = sk->saddr;
1846 	__u32 new_saddr;
1847 	__u32 daddr = sk->daddr;
1848 
1849 	if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1850 		daddr = sk->protinfo.af_inet.opt->faddr;
1851 
1852 	/* Query new route. */
1853 	err = ip_route_connect(&rt, daddr, 0,
1854 			       RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute,
1855 			       sk->bound_dev_if);
1856 	if (err)
1857 		return err;
1858 
1859 	__sk_dst_set(sk, &rt->u.dst);
1860 	sk->route_caps = rt->u.dst.dev->features;
1861 
1862 	new_saddr = rt->rt_src;
1863 
1864 	if (new_saddr == old_saddr)
1865 		return 0;
1866 
1867 	if (sysctl_ip_dynaddr > 1) {
1868 		printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
1869 		       "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1870 		       NIPQUAD(old_saddr),
1871 		       NIPQUAD(new_saddr));
1872 	}
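	/* Example of the message above, with hypothetical addresses; it is
	 * printed only when /proc/sys/net/ipv4/ip_dynaddr is set above 1:
	 *
	 *   tcp_v4_rebuild_header(): shifting sk->saddr from 10.0.0.1 to 192.168.1.5
	 */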
1873 
1874 	sk->saddr = new_saddr;
1875 	sk->rcv_saddr = new_saddr;
1876 
1877 	/* XXX The only ugly spot where we really need to
1878 	 * XXX change the socket's identity after it has
1879 	 * XXX entered the hashes. -DaveM
1880 	 *
1881 	 * Besides that, it does not check for connection
1882 	 * uniqueness. Expect trouble.
1883 	 */
1884 	__tcp_v4_rehash(sk);
1885 	return 0;
1886 }
1887 
1888 int tcp_v4_rebuild_header(struct sock *sk)
1889 {
1890 	struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1891 	u32 daddr;
1892 	int err;
1893 
1894 	/* Route is OK, nothing to do. */
1895 	if (rt != NULL)
1896 		return 0;
1897 
1898 	/* Reroute. */
1899 	daddr = sk->daddr;
1900 	if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1901 		daddr = sk->protinfo.af_inet.opt->faddr;
1902 
1903 	err = ip_route_output(&rt, daddr, sk->saddr,
1904 			      RT_CONN_FLAGS(sk), sk->bound_dev_if);
1905 	if (!err) {
1906 		__sk_dst_set(sk, &rt->u.dst);
1907 		sk->route_caps = rt->u.dst.dev->features;
1908 		return 0;
1909 	}
1910 
1911 	/* Routing failed... */
1912 	sk->route_caps = 0;
1913 
1914 	if (!sysctl_ip_dynaddr ||
1915 	    sk->state != TCP_SYN_SENT ||
1916 	    (sk->userlocks & SOCK_BINDADDR_LOCK) ||
1917 	    (err = tcp_v4_reselect_saddr(sk)) != 0)
1918 		sk->err_soft=-err;
1919 
1920 	return err;
1921 }
1922 
1923 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1924 {
1925 	struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1926 
1927 	sin->sin_family		= AF_INET;
1928 	sin->sin_addr.s_addr	= sk->daddr;
1929 	sin->sin_port		= sk->dport;
1930 }
1931 
1932 /* VJ's idea. Save the last timestamp seen from this destination
1933  * and hold it for at least the normal timewait interval, for use in
1934  * duplicate segment detection on subsequent connections before they
1935  * enter the synchronized state.
1936  */
1937 
1938 int tcp_v4_remember_stamp(struct sock *sk)
1939 {
1940 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1941 	struct rtable *rt = (struct rtable*)__sk_dst_get(sk);
1942 	struct inet_peer *peer = NULL;
1943 	int release_it = 0;
1944 
1945 	if (rt == NULL || rt->rt_dst != sk->daddr) {
1946 		peer = inet_getpeer(sk->daddr, 1);
1947 		release_it = 1;
1948 	} else {
1949 		if (rt->peer == NULL)
1950 			rt_bind_peer(rt, 1);
1951 		peer = rt->peer;
1952 	}
1953 
1954 	if (peer) {
1955 		if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
1956 		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1957 		     peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
1958 			peer->tcp_ts_stamp = tp->ts_recent_stamp;
1959 			peer->tcp_ts = tp->ts_recent;
1960 		}
1961 		if (release_it)
1962 			inet_putpeer(peer);
1963 		return 1;
1964 	}
1965 
1966 	return 0;
1967 }
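/* Illustrative example (hypothetical values): if the inet_peer entry holds
 * tcp_ts = 5000 with tcp_ts_stamp = 100, and this connection last saw
 * ts_recent = 5600 at ts_recent_stamp = 160, then
 * (s32)(5000 - 5600) <= 0 holds and the peer record is advanced to
 * tcp_ts = 5600, tcp_ts_stamp = 160.  A later connection from the same
 * peer can use that remembered value to reject stale duplicate segments
 * before it has exchanged timestamps of its own.
 */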
1968 
1969 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1970 {
1971 	struct inet_peer *peer = NULL;
1972 
1973 	peer = inet_getpeer(tw->daddr, 1);
1974 
1975 	if (peer) {
1976 		if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 ||
1977 		    (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1978 		     peer->tcp_ts_stamp <= tw->ts_recent_stamp)) {
1979 			peer->tcp_ts_stamp = tw->ts_recent_stamp;
1980 			peer->tcp_ts = tw->ts_recent;
1981 		}
1982 		inet_putpeer(peer);
1983 		return 1;
1984 	}
1985 
1986 	return 0;
1987 }
1988 
1989 struct tcp_func ipv4_specific = {
1990 	ip_queue_xmit,
1991 	tcp_v4_send_check,
1992 	tcp_v4_rebuild_header,
1993 	tcp_v4_conn_request,
1994 	tcp_v4_syn_recv_sock,
1995 	tcp_v4_remember_stamp,
1996 	sizeof(struct iphdr),
1997 
1998 	ip_setsockopt,
1999 	ip_getsockopt,
2000 	v4_addr2sockaddr,
2001 	sizeof(struct sockaddr_in)
2002 };
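/* For comparison only, a sketch of the same table in the labeled-field
 * style that tcp_prot below already uses; the field names are assumed to
 * match the struct tcp_func declaration in include/net/tcp.h:
 *
 *	struct tcp_func ipv4_specific = {
 *		queue_xmit:	ip_queue_xmit,
 *		send_check:	tcp_v4_send_check,
 *		rebuild_header:	tcp_v4_rebuild_header,
 *		conn_request:	tcp_v4_conn_request,
 *		syn_recv_sock:	tcp_v4_syn_recv_sock,
 *		remember_stamp:	tcp_v4_remember_stamp,
 *		net_header_len:	sizeof(struct iphdr),
 *		setsockopt:	ip_setsockopt,
 *		getsockopt:	ip_getsockopt,
 *		addr2sockaddr:	v4_addr2sockaddr,
 *		sockaddr_len:	sizeof(struct sockaddr_in),
 *	};
 */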
2003 
2004 /* NOTE: A lot of things are set to zero explicitly by the call to
2005  *       sk_alloc(), so they need not be done here.
2006  */
2007 static int tcp_v4_init_sock(struct sock *sk)
2008 {
2009 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2010 
2011 	skb_queue_head_init(&tp->out_of_order_queue);
2012 	tcp_init_xmit_timers(sk);
2013 	tcp_prequeue_init(tp);
2014 
2015 	tp->rto  = TCP_TIMEOUT_INIT;
2016 	tp->mdev = TCP_TIMEOUT_INIT;
2017 
2018 	/* So many TCP implementations out there (incorrectly) count the
2019 	 * initial SYN frame in their delayed-ACK and congestion control
2020 	 * algorithms that we must have the following bandaid to talk
2021 	 * efficiently to them.  -DaveM
2022 	 */
2023 	tp->snd_cwnd = 2;
2024 
2025 	/* See draft-stevens-tcpca-spec-01 for discussion of the
2026 	 * initialization of these values.
2027 	 */
2028 	tp->snd_ssthresh = 0x7fffffff;	/* Infinity */
2029 	tp->snd_cwnd_clamp = ~0;
2030 	tp->mss_cache = 536;
2031 
2032 	tp->reordering = sysctl_tcp_reordering;
2033 
2034 	sk->state = TCP_CLOSE;
2035 
2036 	sk->write_space = tcp_write_space;
2037 	sk->use_write_queue = 1;
2038 
2039 	sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
2040 
2041 	sk->sndbuf = sysctl_tcp_wmem[1];
2042 	sk->rcvbuf = sysctl_tcp_rmem[1];
2043 
2044 	atomic_inc(&tcp_sockets_allocated);
2045 
2046 	return 0;
2047 }
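/* Worked example of the defaults above (illustrative): until the real MSS
 * is learned from the route and from the peer's SYN options, mss_cache
 * stays at the conservative 536 bytes, so the initial congestion window
 * of 2 segments allows at most 2 * 536 = 1072 bytes in flight, and with
 * snd_ssthresh effectively infinite the connection starts in slow start.
 */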
2048 
2049 static int tcp_v4_destroy_sock(struct sock *sk)
2050 {
2051 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2052 
2053 	tcp_clear_xmit_timers(sk);
2054 
2055 	/* Clean up the write buffer. */
2056   	tcp_writequeue_purge(sk);
2057 
2058 	/* Cleans up our, hopefully empty, out_of_order_queue. */
2059   	__skb_queue_purge(&tp->out_of_order_queue);
2060 
2061 	/* Clean the prequeue; it should really be empty by now. */
2062 	__skb_queue_purge(&tp->ucopy.prequeue);
2063 
2064 	/* Clean up a referenced TCP bind bucket. */
2065 	if(sk->prev != NULL)
2066 		tcp_put_port(sk);
2067 
2068 	/* If sendmsg cached page exists, toss it. */
2069 	if (tp->sndmsg_page != NULL)
2070 		__free_page(tp->sndmsg_page);
2071 
2072 	atomic_dec(&tcp_sockets_allocated);
2073 
2074 	return 0;
2075 }
2076 
2077 /* Proc filesystem TCP sock list dumping. */
2078 static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i, int uid)
2079 {
2080 	int ttd = req->expires - jiffies;
2081 
2082 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2083 		" %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
2084 		i,
2085 		req->af.v4_req.loc_addr,
2086 		ntohs(sk->sport),
2087 		req->af.v4_req.rmt_addr,
2088 		ntohs(req->rmt_port),
2089 		TCP_SYN_RECV,
2090 		0,0, /* could print option size, but that is af dependent. */
2091 		1,   /* timers active (only the expire timer) */
2092 		ttd,
2093 		req->retrans,
2094 		uid,
2095 		0,  /* non standard timer */
2096 		0, /* open_requests have no inode */
2097 		atomic_read(&sk->refcnt),
2098 		req
2099 		);
2100 }
2101 
2102 static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
2103 {
2104 	unsigned int dest, src;
2105 	__u16 destp, srcp;
2106 	int timer_active;
2107 	unsigned long timer_expires;
2108 	struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;
2109 
2110 	dest  = sp->daddr;
2111 	src   = sp->rcv_saddr;
2112 	destp = ntohs(sp->dport);
2113 	srcp  = ntohs(sp->sport);
2114 	if (tp->pending == TCP_TIME_RETRANS) {
2115 		timer_active	= 1;
2116 		timer_expires	= tp->timeout;
2117 	} else if (tp->pending == TCP_TIME_PROBE0) {
2118 		timer_active	= 4;
2119 		timer_expires	= tp->timeout;
2120 	} else if (timer_pending(&sp->timer)) {
2121 		timer_active	= 2;
2122 		timer_expires	= sp->timer.expires;
2123 	} else {
2124 		timer_active	= 0;
2125 		timer_expires = jiffies;
2126 	}
2127 
2128 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2129 		" %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d",
2130 		i, src, srcp, dest, destp, sp->state,
2131 		tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
2132 		timer_active, timer_expires-jiffies,
2133 		tp->retransmits,
2134 		sock_i_uid(sp),
2135 		tp->probes_out,
2136 		sock_i_ino(sp),
2137 		atomic_read(&sp->refcnt), sp,
2138 		tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong,
2139 		tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh
2140 		);
2141 }
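/* A line produced by the format above looks roughly like this, here for a
 * hypothetical LISTEN socket bound to 127.0.0.1:22 (on a little-endian
 * machine 127.0.0.1 prints as 0100007F; port 22 prints as 0016):
 *
 *    0: 0100007F:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000     0        0 930 1 c1a2b3c4 300 40 0 2 -1
 */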
2142 
2143 static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2144 {
2145 	unsigned int dest, src;
2146 	__u16 destp, srcp;
2147 	int ttd = tw->ttd - jiffies;
2148 
2149 	if (ttd < 0)
2150 		ttd = 0;
2151 
2152 	dest  = tw->daddr;
2153 	src   = tw->rcv_saddr;
2154 	destp = ntohs(tw->dport);
2155 	srcp  = ntohs(tw->sport);
2156 
2157 	sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2158 		" %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2159 		i, src, srcp, dest, destp, tw->substate, 0, 0,
2160 		3, ttd, 0, 0, 0, 0,
2161 		atomic_read(&tw->refcnt), tw);
2162 }
2163 
2164 #define TMPSZ 150
2165 
2166 int tcp_get_info(char *buffer, char **start, off_t offset, int length)
2167 {
2168 	int len = 0, num = 0, i;
2169 	off_t begin, pos = 0;
2170 	char tmpbuf[TMPSZ+1];
2171 
2172 	if (offset < TMPSZ)
2173 		len += sprintf(buffer, "%-*s\n", TMPSZ-1,
2174 			       "  sl  local_address rem_address   st tx_queue "
2175 			       "rx_queue tr tm->when retrnsmt   uid  timeout inode");
2176 
2177 	pos = TMPSZ;
2178 
2179 	/* First, walk listening socket table. */
2180 	tcp_listen_lock();
2181 	for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
2182 		struct sock *sk;
2183 		struct tcp_listen_opt *lopt;
2184 		int k;
2185 
2186 		for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) {
2187 			struct open_request *req;
2188 			int uid;
2189 			struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2190 
2191 			if (!TCP_INET_FAMILY(sk->family))
2192 				goto skip_listen;
2193 
2194 			pos += TMPSZ;
2195 			if (pos >= offset) {
2196 				get_tcp_sock(sk, tmpbuf, num);
2197 				len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2198 				if (pos >= offset + length) {
2199 					tcp_listen_unlock();
2200 					goto out_no_bh;
2201 				}
2202 			}
2203 
2204 skip_listen:
2205 			uid = sock_i_uid(sk);
2206 			read_lock_bh(&tp->syn_wait_lock);
2207 			lopt = tp->listen_opt;
2208 			if (lopt && lopt->qlen != 0) {
2209 				for (k=0; k<TCP_SYNQ_HSIZE; k++) {
2210 					for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) {
2211 						if (!TCP_INET_FAMILY(req->class->family))
2212 							continue;
2213 
2214 						pos += TMPSZ;
2215 						if (pos <= offset)
2216 							continue;
2217 						get_openreq(sk, req, tmpbuf, num, uid);
2218 						len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2219 						if (pos >= offset + length) {
2220 							read_unlock_bh(&tp->syn_wait_lock);
2221 							tcp_listen_unlock();
2222 							goto out_no_bh;
2223 						}
2224 					}
2225 				}
2226 			}
2227 			read_unlock_bh(&tp->syn_wait_lock);
2228 
2229 			/* Completed requests are in the normal socket hash table. */
2230 		}
2231 	}
2232 	tcp_listen_unlock();
2233 
2234 	local_bh_disable();
2235 
2236 	/* Next, walk established hash chain. */
2237 	for (i = 0; i < tcp_ehash_size; i++) {
2238 		struct tcp_ehash_bucket *head = &tcp_ehash[i];
2239 		struct sock *sk;
2240 		struct tcp_tw_bucket *tw;
2241 
2242 		read_lock(&head->lock);
2243 		for(sk = head->chain; sk; sk = sk->next, num++) {
2244 			if (!TCP_INET_FAMILY(sk->family))
2245 				continue;
2246 			pos += TMPSZ;
2247 			if (pos <= offset)
2248 				continue;
2249 			get_tcp_sock(sk, tmpbuf, num);
2250 			len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2251 			if (pos >= offset + length) {
2252 				read_unlock(&head->lock);
2253 				goto out;
2254 			}
2255 		}
2256 		for (tw = (struct tcp_tw_bucket *)tcp_ehash[i+tcp_ehash_size].chain;
2257 		     tw != NULL;
2258 		     tw = (struct tcp_tw_bucket *)tw->next, num++) {
2259 			if (!TCP_INET_FAMILY(tw->family))
2260 				continue;
2261 			pos += TMPSZ;
2262 			if (pos <= offset)
2263 				continue;
2264 			get_timewait_sock(tw, tmpbuf, num);
2265 			len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2266 			if (pos >= offset + length) {
2267 				read_unlock(&head->lock);
2268 				goto out;
2269 			}
2270 		}
2271 		read_unlock(&head->lock);
2272 	}
2273 
2274 out:
2275 	local_bh_enable();
2276 out_no_bh:
2277 
2278 	begin = len - (pos - offset);
2279 	*start = buffer + begin;
2280 	len -= begin;
2281 	if (len > length)
2282 		len = length;
2283 	if (len < 0)
2284 		len = 0;
2285 	return len;
2286 }
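/* Worked example of the offset arithmetic above (illustrative numbers):
 * with TMPSZ = 150, a read at offset = 400, length = 1024 skips records
 * until pos first reaches 450, so formatting starts with the record that
 * covers file offsets 300-449.  When the loops stop, begin =
 * len - (pos - offset) points *start at the byte inside the local buffer
 * that corresponds to file offset 400, and len is clipped to the request,
 * which is what the procfs get_info interface expects.
 */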
2287 
2288 struct proto tcp_prot = {
2289 	name:		"TCP",
2290 	close:		tcp_close,
2291 	connect:	tcp_v4_connect,
2292 	disconnect:	tcp_disconnect,
2293 	accept:		tcp_accept,
2294 	ioctl:		tcp_ioctl,
2295 	init:		tcp_v4_init_sock,
2296 	destroy:	tcp_v4_destroy_sock,
2297 	shutdown:	tcp_shutdown,
2298 	setsockopt:	tcp_setsockopt,
2299 	getsockopt:	tcp_getsockopt,
2300 	sendmsg:	tcp_sendmsg,
2301 	recvmsg:	tcp_recvmsg,
2302 	backlog_rcv:	tcp_v4_do_rcv,
2303 	hash:		tcp_v4_hash,
2304 	unhash:		tcp_unhash,
2305 	get_port:	tcp_v4_get_port,
2306 };
2307 
2308 
2309 
2310 void __init tcp_v4_init(struct net_proto_family *ops)
2311 {
2312 	int err;
2313 
2314 	tcp_inode.i_mode = S_IFSOCK;
2315 	tcp_inode.i_sock = 1;
2316 	tcp_inode.i_uid = 0;
2317 	tcp_inode.i_gid = 0;
2318 	init_waitqueue_head(&tcp_inode.i_wait);
2319 	init_waitqueue_head(&tcp_inode.u.socket_i.wait);
2320 
2321 	tcp_socket->inode = &tcp_inode;
2322 	tcp_socket->state = SS_UNCONNECTED;
2323 	tcp_socket->type=SOCK_RAW;
2324 
2325 	if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
2326 		panic("Failed to create the TCP control socket.\n");
2327 	tcp_socket->sk->allocation=GFP_ATOMIC;
2328 	tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL;
2329 
2330 	/* Unhash it so that IP input processing does not even
2331 	 * see it; we do not wish this socket to see incoming
2332 	 * packets.
2333 	 */
2334 	tcp_socket->sk->prot->unhash(tcp_socket->sk);
2335 }
2336