1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol (TCP).
7 *
8 * Version: $Id: tcp_ipv4.c,v 1.237.2.1 2002/01/15 08:49:49 davem Exp $
9 *
10 * IPv4 specific functions
11 *
12 *
13 * code split from:
14 * linux/ipv4/tcp.c
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
17 *
18 * See tcp.c for author information
19 *
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
24 */
25
26 /*
27 * Changes:
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an ACK bit.
36 * Andi Kleen : Implemented fast path mtu discovery.
37 * Fixed many serious bugs in the
38 * open_request handling and moved
39 * most of it into the af independent code.
40 * Added tail drop and some other bugfixes.
41 * Added new listen semantics.
42 * Mike McLagan : Routing by source
43 * Juan Jose Ciarlante: ip_dynaddr bits
44 * Andi Kleen: various fixes.
45 * Vitaly E. Lavrov : Transparent proxy revived after a year-long coma.
46 * Andi Kleen : Fix new listen.
47 * Andi Kleen : Fix accept error reporting.
48 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
49 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
50 * a single port at the same time.
51 */
52
53 #include <linux/config.h>
54
55 #include <linux/types.h>
56 #include <linux/fcntl.h>
57 #include <linux/random.h>
58 #include <linux/cache.h>
59 #include <linux/jhash.h>
60 #include <linux/init.h>
61
62 #include <net/icmp.h>
63 #include <net/tcp.h>
64 #include <net/ipv6.h>
65 #include <net/inet_common.h>
66
67 #include <linux/inet.h>
68 #include <linux/stddef.h>
69 #include <linux/ipsec.h>
70
71 extern int sysctl_ip_dynaddr;
72 extern int sysctl_ip_default_ttl;
73 int sysctl_tcp_tw_reuse = 0;
74 int sysctl_tcp_low_latency = 0;
75
76 /* Check TCP sequence numbers in ICMP packets. */
77 #define ICMP_MIN_LENGTH 8
78
79 /* Socket used for sending RSTs */
80 static struct inode tcp_inode;
81 static struct socket *tcp_socket=&tcp_inode.u.socket_i;
82
83 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
84 struct sk_buff *skb);
85
86 /*
87 * ALL members must be initialised to prevent gcc-2.7.2.3 miscompilation
88 */
89 struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = {
90 __tcp_ehash: NULL,
91 __tcp_bhash: NULL,
92 __tcp_bhash_size: 0,
93 __tcp_ehash_size: 0,
94 __tcp_listening_hash: { NULL, },
95 __tcp_lhash_lock: RW_LOCK_UNLOCKED,
96 __tcp_lhash_users: ATOMIC_INIT(0),
97 __tcp_lhash_wait:
98 __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait),
99 __tcp_portalloc_lock: SPIN_LOCK_UNLOCKED
100 };
101
102 /*
103 * This array holds the first and last local port number.
104 * For high-usage systems, use sysctl to change this to
105 * 32768-61000
106 */
107 int sysctl_local_port_range[2] = { 1024, 4999 };
108 int tcp_port_rover = (1024 - 1);
109
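/* Hash the connection 4-tuple into the established hash table.
 * The XOR-and-fold keeps this cheap; the final mask assumes
 * tcp_ehash_size is a power of two.
 */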
110 static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport,
111 __u32 faddr, __u16 fport)
112 {
113 int h = ((laddr ^ lport) ^ (faddr ^ fport));
114 h ^= h>>16;
115 h ^= h>>8;
116 return h & (tcp_ehash_size - 1);
117 }
118
119 static __inline__ int tcp_sk_hashfn(struct sock *sk)
120 {
121 __u32 laddr = sk->rcv_saddr;
122 __u16 lport = sk->num;
123 __u32 faddr = sk->daddr;
124 __u16 fport = sk->dport;
125
126 return tcp_hashfn(laddr, lport, faddr, fport);
127 }
128
129 /* Allocate and initialize a new TCP local port bind bucket.
130 * The bindhash mutex for snum's hash chain must be held here.
131 */
132 struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head,
133 unsigned short snum)
134 {
135 struct tcp_bind_bucket *tb;
136
137 tb = kmem_cache_alloc(tcp_bucket_cachep, SLAB_ATOMIC);
138 if(tb != NULL) {
139 tb->port = snum;
140 tb->fastreuse = 0;
141 tb->owners = NULL;
142 if((tb->next = head->chain) != NULL)
143 tb->next->pprev = &tb->next;
144 head->chain = tb;
145 tb->pprev = &head->chain;
146 }
147 return tb;
148 }
149
150 /* Caller must disable local BH processing. */
151 static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child)
152 {
153 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(child->num)];
154 struct tcp_bind_bucket *tb;
155
156 spin_lock(&head->lock);
157 tb = (struct tcp_bind_bucket *)sk->prev;
158 if ((child->bind_next = tb->owners) != NULL)
159 tb->owners->bind_pprev = &child->bind_next;
160 tb->owners = child;
161 child->bind_pprev = &tb->owners;
162 child->prev = (struct sock *) tb;
163 spin_unlock(&head->lock);
164 }
165
166 inline void tcp_inherit_port(struct sock *sk, struct sock *child)
167 {
168 local_bh_disable();
169 __tcp_inherit_port(sk, child);
170 local_bh_enable();
171 }
172
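/* Link a socket onto a bind bucket's owner list and remember the
 * bucket in sk->prev so the local port can be released later.
 */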
173 static inline void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, unsigned short snum)
174 {
175 sk->num = snum;
176 if ((sk->bind_next = tb->owners) != NULL)
177 tb->owners->bind_pprev = &sk->bind_next;
178 tb->owners = sk;
179 sk->bind_pprev = &tb->owners;
180 sk->prev = (struct sock *) tb;
181 }
182
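/* Walk the bind bucket's owners looking for a socket that would clash
 * with sk on this local port, taking SO_REUSEADDR, the bound device
 * and the bound address into account. Returns nonzero on conflict.
 */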
183 static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb)
184 {
185 struct sock *sk2 = tb->owners;
186 int sk_reuse = sk->reuse;
187
188 for( ; sk2 != NULL; sk2 = sk2->bind_next) {
189 if (sk != sk2 &&
190 sk2->reuse <= 1 &&
191 !ipv6_only_sock(sk2) &&
192 (!sk->bound_dev_if ||
193 !sk2->bound_dev_if ||
194 sk->bound_dev_if == sk2->bound_dev_if)) {
195 if (!sk_reuse ||
196 !sk2->reuse ||
197 sk2->state == TCP_LISTEN) {
198 if (!sk2->rcv_saddr ||
199 !sk->rcv_saddr ||
200 (sk2->rcv_saddr == sk->rcv_saddr))
201 break;
202 }
203 }
204 }
205 return sk2 != NULL;
206 }
207
208 /* Obtain a reference to a local port for the given sock.
209 * If snum is zero it means select any available local port.
210 */
211 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
212 {
213 struct tcp_bind_hashbucket *head;
214 struct tcp_bind_bucket *tb;
215 int ret;
216
217 local_bh_disable();
218 if (snum == 0) {
219 int low = sysctl_local_port_range[0];
220 int high = sysctl_local_port_range[1];
221 int remaining = (high - low) + 1;
222 int rover;
223
224 spin_lock(&tcp_portalloc_lock);
225 rover = tcp_port_rover;
226 do { rover++;
227 if ((rover < low) || (rover > high))
228 rover = low;
229 head = &tcp_bhash[tcp_bhashfn(rover)];
230 spin_lock(&head->lock);
231 for (tb = head->chain; tb; tb = tb->next)
232 if (tb->port == rover)
233 goto next;
234 break;
235 next:
236 spin_unlock(&head->lock);
237 } while (--remaining > 0);
238 tcp_port_rover = rover;
239 spin_unlock(&tcp_portalloc_lock);
240
241 /* Exhausted local port range during search? */
242 ret = 1;
243 if (remaining <= 0)
244 goto fail;
245
246 /* OK, here is the one we will use. HEAD is
247 * non-NULL and we hold its lock.
248 */
249 snum = rover;
250 tb = NULL;
251 } else {
252 head = &tcp_bhash[tcp_bhashfn(snum)];
253 spin_lock(&head->lock);
254 for (tb = head->chain; tb != NULL; tb = tb->next)
255 if (tb->port == snum)
256 break;
257 }
258 if (tb != NULL && tb->owners != NULL) {
259 if (sk->reuse > 1)
260 goto success;
261 if (tb->fastreuse > 0 && sk->reuse != 0 && sk->state != TCP_LISTEN) {
262 goto success;
263 } else {
264 ret = 1;
265 if (tcp_bind_conflict(sk, tb))
266 goto fail_unlock;
267 }
268 }
269 ret = 1;
270 if (tb == NULL &&
271 (tb = tcp_bucket_create(head, snum)) == NULL)
272 goto fail_unlock;
273 if (tb->owners == NULL) {
274 if (sk->reuse && sk->state != TCP_LISTEN)
275 tb->fastreuse = 1;
276 else
277 tb->fastreuse = 0;
278 } else if (tb->fastreuse &&
279 ((sk->reuse == 0) || (sk->state == TCP_LISTEN)))
280 tb->fastreuse = 0;
281 success:
282 if (sk->prev == NULL)
283 tcp_bind_hash(sk, tb, snum);
284 BUG_TRAP(sk->prev == (struct sock *) tb);
285 ret = 0;
286
287 fail_unlock:
288 spin_unlock(&head->lock);
289 fail:
290 local_bh_enable();
291 return ret;
292 }
293
294 /* Get rid of any references to a local port held by the
295 * given sock.
296 */
297 inline void __tcp_put_port(struct sock *sk)
298 {
299 struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(sk->num)];
300 struct tcp_bind_bucket *tb;
301
302 spin_lock(&head->lock);
303 tb = (struct tcp_bind_bucket *) sk->prev;
304 if (sk->bind_next)
305 sk->bind_next->bind_pprev = sk->bind_pprev;
306 *(sk->bind_pprev) = sk->bind_next;
307 sk->prev = NULL;
308 sk->num = 0;
309 if (tb->owners == NULL) {
310 if (tb->next)
311 tb->next->pprev = tb->pprev;
312 *(tb->pprev) = tb->next;
313 kmem_cache_free(tcp_bucket_cachep, tb);
314 }
315 spin_unlock(&head->lock);
316 }
317
318 void tcp_put_port(struct sock *sk)
319 {
320 local_bh_disable();
321 __tcp_put_port(sk);
322 local_bh_enable();
323 }
324
325 /* This lock without WQ_FLAG_EXCLUSIVE is good on UP, but it can be very bad on SMP.
326 * Look: when several writers sleep and the reader wakes them up, all but one
327 * immediately hit the write lock and grab all the CPUs. Exclusive sleep solves
328 * this, _but_ remember, it adds useless work on UP machines (a wakeup on each
329 * exclusive lock release). It should really be ifdefed.
330 */
331
332 void tcp_listen_wlock(void)
333 {
334 write_lock(&tcp_lhash_lock);
335
336 if (atomic_read(&tcp_lhash_users)) {
337 DECLARE_WAITQUEUE(wait, current);
338
339 add_wait_queue_exclusive(&tcp_lhash_wait, &wait);
340 for (;;) {
341 set_current_state(TASK_UNINTERRUPTIBLE);
342 if (atomic_read(&tcp_lhash_users) == 0)
343 break;
344 write_unlock_bh(&tcp_lhash_lock);
345 schedule();
346 write_lock_bh(&tcp_lhash_lock);
347 }
348
349 __set_current_state(TASK_RUNNING);
350 remove_wait_queue(&tcp_lhash_wait, &wait);
351 }
352 }
353
354 static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible)
355 {
356 struct sock **skp;
357 rwlock_t *lock;
358
359 BUG_TRAP(sk->pprev==NULL);
360 if(listen_possible && sk->state == TCP_LISTEN) {
361 skp = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)];
362 lock = &tcp_lhash_lock;
363 tcp_listen_wlock();
364 } else {
365 skp = &tcp_ehash[(sk->hashent = tcp_sk_hashfn(sk))].chain;
366 lock = &tcp_ehash[sk->hashent].lock;
367 write_lock(lock);
368 }
369 if((sk->next = *skp) != NULL)
370 (*skp)->pprev = &sk->next;
371 *skp = sk;
372 sk->pprev = skp;
373 sock_prot_inc_use(sk->prot);
374 write_unlock(lock);
375 if (listen_possible && sk->state == TCP_LISTEN)
376 wake_up(&tcp_lhash_wait);
377 }
378
379 static void tcp_v4_hash(struct sock *sk)
380 {
381 if (sk->state != TCP_CLOSE) {
382 local_bh_disable();
383 __tcp_v4_hash(sk, 1);
384 local_bh_enable();
385 }
386 }
387
388 void tcp_unhash(struct sock *sk)
389 {
390 rwlock_t *lock;
391
392 if (!sk->pprev)
393 goto ende;
394
395 if (sk->state == TCP_LISTEN) {
396 local_bh_disable();
397 tcp_listen_wlock();
398 lock = &tcp_lhash_lock;
399 } else {
400 struct tcp_ehash_bucket *head = &tcp_ehash[sk->hashent];
401 lock = &head->lock;
402 write_lock_bh(&head->lock);
403 }
404
405 if(sk->pprev) {
406 if(sk->next)
407 sk->next->pprev = sk->pprev;
408 *sk->pprev = sk->next;
409 sk->pprev = NULL;
410 sock_prot_dec_use(sk->prot);
411 }
412 write_unlock_bh(lock);
413
414 ende:
415 if (sk->state == TCP_LISTEN)
416 wake_up(&tcp_lhash_wait);
417 }
418
419 /* Don't inline this cruft. There are some nice properties to
420 * exploit here. The BSD API does not allow a listening TCP
421 * to specify the remote port nor the remote address for the
422 * connection. So always assume those are both wildcarded
423 * during the search since they can never be otherwise.
424 */
425 static struct sock *__tcp_v4_lookup_listener(struct sock *sk, u32 daddr, unsigned short hnum, int dif)
426 {
427 struct sock *result = NULL;
428 int score, hiscore;
429
430 hiscore=-1;
431 for(; sk; sk = sk->next) {
432 if(sk->num == hnum && !ipv6_only_sock(sk)) {
433 __u32 rcv_saddr = sk->rcv_saddr;
434
435 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
436 score = sk->family == PF_INET ? 1 : 0;
437 #else
438 score = 1;
439 #endif
440 if(rcv_saddr) {
441 if (rcv_saddr != daddr)
442 continue;
443 score+=2;
444 }
445 if (sk->bound_dev_if) {
446 if (sk->bound_dev_if != dif)
447 continue;
448 score+=2;
449 }
450 if (score == 5)
451 return sk;
452 if (score > hiscore) {
453 hiscore = score;
454 result = sk;
455 }
456 }
457 }
458 return result;
459 }
460
461 /* Optimize the common listener case. */
462 inline struct sock *tcp_v4_lookup_listener(u32 daddr, unsigned short hnum, int dif)
463 {
464 struct sock *sk;
465
466 read_lock(&tcp_lhash_lock);
467 sk = tcp_listening_hash[tcp_lhashfn(hnum)];
468 if (sk) {
469 if (sk->num == hnum &&
470 sk->next == NULL &&
471 (!sk->rcv_saddr || sk->rcv_saddr == daddr) &&
472 (sk->family == PF_INET || !ipv6_only_sock(sk)) &&
473 !sk->bound_dev_if)
474 goto sherry_cache;
475 sk = __tcp_v4_lookup_listener(sk, daddr, hnum, dif);
476 }
477 if (sk) {
478 sherry_cache:
479 sock_hold(sk);
480 }
481 read_unlock(&tcp_lhash_lock);
482 return sk;
483 }
484
485 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
486 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
487 *
488 * Local BH must be disabled here.
489 */
490
491 static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport,
492 u32 daddr, u16 hnum, int dif)
493 {
494 struct tcp_ehash_bucket *head;
495 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
496 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
497 struct sock *sk;
498 int hash;
499
500 /* Optimize here for direct hit, only listening connections can
501 * have wildcards anyway.
502 */
503 hash = tcp_hashfn(daddr, hnum, saddr, sport);
504 head = &tcp_ehash[hash];
505 read_lock(&head->lock);
506 for(sk = head->chain; sk; sk = sk->next) {
507 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
508 goto hit; /* You sunk my battleship! */
509 }
510
511 /* Must check for a TIME_WAIT'er before going to listener hash. */
512 for(sk = (head + tcp_ehash_size)->chain; sk; sk = sk->next)
513 if(TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
514 goto hit;
515 read_unlock(&head->lock);
516
517 return NULL;
518
519 hit:
520 sock_hold(sk);
521 read_unlock(&head->lock);
522 return sk;
523 }
524
525 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
526 u32 daddr, u16 hnum, int dif)
527 {
528 struct sock *sk;
529
530 sk = __tcp_v4_lookup_established(saddr, sport, daddr, hnum, dif);
531
532 if (sk)
533 return sk;
534
535 return tcp_v4_lookup_listener(daddr, hnum, dif);
536 }
537
538 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif)
539 {
540 struct sock *sk;
541
542 local_bh_disable();
543 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
544 local_bh_enable();
545
546 return sk;
547 }
548
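/* Derive the initial sequence number for a connection from the
 * addresses and ports of the incoming segment.
 */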
549 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
550 {
551 return secure_tcp_sequence_number(skb->nh.iph->daddr,
552 skb->nh.iph->saddr,
553 skb->h.th->dest,
554 skb->h.th->source);
555 }
556
557 /* called with local bh disabled */
558 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
559 struct tcp_tw_bucket **twp)
560 {
561 u32 daddr = sk->rcv_saddr;
562 u32 saddr = sk->daddr;
563 int dif = sk->bound_dev_if;
564 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
565 __u32 ports = TCP_COMBINED_PORTS(sk->dport, lport);
566 int hash = tcp_hashfn(daddr, lport, saddr, sk->dport);
567 struct tcp_ehash_bucket *head = &tcp_ehash[hash];
568 struct sock *sk2, **skp;
569 struct tcp_tw_bucket *tw;
570
571 write_lock(&head->lock);
572
573 /* Check TIME-WAIT sockets first. */
574 for(skp = &(head + tcp_ehash_size)->chain; (sk2=*skp) != NULL;
575 skp = &sk2->next) {
576 tw = (struct tcp_tw_bucket*)sk2;
577
578 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
579 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
580
581 /* With PAWS, it is safe from the viewpoint
582 of data integrity. Even without PAWS it
583 is safe provided sequence spaces do not
584 overlap i.e. at data rates <= 80Mbit/sec.
585
586 Actually, the idea is close to VJ's:
587 only the timestamp cache is held not per host,
588 but per port pair, and the TW bucket is used
589 as the state holder.
590 
591 If the TW bucket has already been destroyed we
592 fall back to VJ's scheme and use the initial
593 timestamp retrieved from peer table.
594 */
595 if (tw->ts_recent_stamp &&
596 (!twp || (sysctl_tcp_tw_reuse &&
597 xtime.tv_sec - tw->ts_recent_stamp > 1))) {
598 if ((tp->write_seq = tw->snd_nxt+65535+2) == 0)
599 tp->write_seq = 1;
600 tp->ts_recent = tw->ts_recent;
601 tp->ts_recent_stamp = tw->ts_recent_stamp;
602 sock_hold(sk2);
603 skp = &head->chain;
604 goto unique;
605 } else
606 goto not_unique;
607 }
608 }
609 tw = NULL;
610
611 /* And established part... */
612 for(skp = &head->chain; (sk2=*skp)!=NULL; skp = &sk2->next) {
613 if(TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
614 goto not_unique;
615 }
616
617 unique:
618 /* Must record num and sport now. Otherwise we would see
619 * a socket with a funny identity in the hash table. */
620 sk->num = lport;
621 sk->sport = htons(lport);
622 BUG_TRAP(sk->pprev==NULL);
623 if ((sk->next = *skp) != NULL)
624 (*skp)->pprev = &sk->next;
625
626 *skp = sk;
627 sk->pprev = skp;
628 sk->hashent = hash;
629 sock_prot_inc_use(sk->prot);
630 write_unlock(&head->lock);
631
632 if (twp) {
633 *twp = tw;
634 NET_INC_STATS_BH(TimeWaitRecycled);
635 } else if (tw) {
636 /* Silly. Should hash-dance instead... */
637 tcp_tw_deschedule(tw);
638 tcp_timewait_kill(tw);
639 NET_INC_STATS_BH(TimeWaitRecycled);
640
641 tcp_tw_put(tw);
642 }
643
644 return 0;
645
646 not_unique:
647 write_unlock(&head->lock);
648 return -EADDRNOTAVAIL;
649 }
650
651 /*
652 * Bind a port for a connect operation and hash it.
653 */
654 static int tcp_v4_hash_connect(struct sock *sk)
655 {
656 unsigned short snum = sk->num;
657 struct tcp_bind_hashbucket *head;
658 struct tcp_bind_bucket *tb;
659
660 if (snum == 0) {
661 int rover;
662 int low = sysctl_local_port_range[0];
663 int high = sysctl_local_port_range[1];
664 int remaining = (high - low) + 1;
665 struct tcp_tw_bucket *tw = NULL;
666
667 local_bh_disable();
668
669 /* TODO. Actually it is not such a bad idea to remove
670 * tcp_portalloc_lock before the next submission to Linus.
671 * As soon as we touch this place at all it is time to think.
672 *
673 * Right now it protects a single _advisory_ variable, tcp_port_rover,
674 * hence it is mostly useless.
675 * The code will work nicely if we just delete it, but
676 * I am afraid that in the contended case it will work no better or
677 * even worse: another cpu will just hit the same bucket
678 * and spin there.
679 * So some per-cpu salt could remove both the contention and the
680 * memory pingpong. Any ideas how to do this in a nice way?
681 */
682 spin_lock(&tcp_portalloc_lock);
683 rover = tcp_port_rover;
684
685 do {
686 rover++;
687 if ((rover < low) || (rover > high))
688 rover = low;
689 head = &tcp_bhash[tcp_bhashfn(rover)];
690 spin_lock(&head->lock);
691
692 /* Does not bother with rcv_saddr checks,
693 * because the established check is already
694 * unique enough.
695 */
696 for (tb = head->chain; tb; tb = tb->next) {
697 if (tb->port == rover) {
698 BUG_TRAP(tb->owners != NULL);
699 if (tb->fastreuse >= 0)
700 goto next_port;
701 if (!__tcp_v4_check_established(sk, rover, &tw))
702 goto ok;
703 goto next_port;
704 }
705 }
706
707 tb = tcp_bucket_create(head, rover);
708 if (!tb) {
709 spin_unlock(&head->lock);
710 break;
711 }
712 tb->fastreuse = -1;
713 goto ok;
714
715 next_port:
716 spin_unlock(&head->lock);
717 } while (--remaining > 0);
718 tcp_port_rover = rover;
719 spin_unlock(&tcp_portalloc_lock);
720
721 local_bh_enable();
722
723 return -EADDRNOTAVAIL;
724
725 ok:
726 /* All locks still held and bhs disabled */
727 tcp_port_rover = rover;
728 spin_unlock(&tcp_portalloc_lock);
729
730 tcp_bind_hash(sk, tb, rover);
731 if (!sk->pprev) {
732 sk->sport = htons(rover);
733 __tcp_v4_hash(sk, 0);
734 }
735 spin_unlock(&head->lock);
736
737 if (tw) {
738 tcp_tw_deschedule(tw);
739 tcp_timewait_kill(tw);
740 tcp_tw_put(tw);
741 }
742
743 local_bh_enable();
744 return 0;
745 }
746
747 head = &tcp_bhash[tcp_bhashfn(snum)];
748 tb = (struct tcp_bind_bucket *)sk->prev;
749 spin_lock_bh(&head->lock);
750 if (tb->owners == sk && sk->bind_next == NULL) {
751 __tcp_v4_hash(sk, 0);
752 spin_unlock_bh(&head->lock);
753 return 0;
754 } else {
755 int ret;
756 spin_unlock(&head->lock);
757 /* No definite answer... Walk the established hash table */
758 ret = __tcp_v4_check_established(sk, snum, NULL);
759 local_bh_enable();
760 return ret;
761 }
762 }
763
764 /* This will initiate an outgoing connection. */
765 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
766 {
767 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
768 struct sockaddr_in *usin = (struct sockaddr_in *) uaddr;
769 struct rtable *rt;
770 u32 daddr, nexthop;
771 int tmp;
772 int err;
773
774 if (addr_len < sizeof(struct sockaddr_in))
775 return(-EINVAL);
776
777 if (usin->sin_family != AF_INET)
778 return(-EAFNOSUPPORT);
779
780 nexthop = daddr = usin->sin_addr.s_addr;
781 if (sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) {
782 if (daddr == 0)
783 return -EINVAL;
784 nexthop = sk->protinfo.af_inet.opt->faddr;
785 }
786
787 tmp = ip_route_connect(&rt, nexthop, sk->saddr,
788 RT_CONN_FLAGS(sk), sk->bound_dev_if);
789 if (tmp < 0)
790 return tmp;
791
792 if (rt->rt_flags&(RTCF_MULTICAST|RTCF_BROADCAST)) {
793 ip_rt_put(rt);
794 return -ENETUNREACH;
795 }
796
797 __sk_dst_set(sk, &rt->u.dst);
798 sk->route_caps = rt->u.dst.dev->features;
799
800 if (!sk->protinfo.af_inet.opt || !sk->protinfo.af_inet.opt->srr)
801 daddr = rt->rt_dst;
802
803 if (!sk->saddr)
804 sk->saddr = rt->rt_src;
805 sk->rcv_saddr = sk->saddr;
806
807 if (tp->ts_recent_stamp && sk->daddr != daddr) {
808 /* Reset inherited state */
809 tp->ts_recent = 0;
810 tp->ts_recent_stamp = 0;
811 tp->write_seq = 0;
812 }
813
814 if (sysctl_tcp_tw_recycle &&
815 !tp->ts_recent_stamp &&
816 rt->rt_dst == daddr) {
817 struct inet_peer *peer = rt_get_peer(rt);
818
819 /* VJ's idea. We save the last timestamp seen from
820 * the destination in the peer table when entering state TIME-WAIT,
821 * and initialize ts_recent from it when trying a new connection.
822 */
823
824 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
825 tp->ts_recent_stamp = peer->tcp_ts_stamp;
826 tp->ts_recent = peer->tcp_ts;
827 }
828 }
829
830 sk->dport = usin->sin_port;
831 sk->daddr = daddr;
832
833 tp->ext_header_len = 0;
834 if (sk->protinfo.af_inet.opt)
835 tp->ext_header_len = sk->protinfo.af_inet.opt->optlen;
836
837 tp->mss_clamp = 536;
838
839 /* Socket identity is still unknown (sport may be zero).
840 * However we set the state to SYN-SENT and, without releasing the socket
841 * lock, select a source port, enter ourselves into the hash tables and
842 * complete initialization after this.
843 */
844 tcp_set_state(sk, TCP_SYN_SENT);
845 err = tcp_v4_hash_connect(sk);
846 if (err)
847 goto failure;
848
849 if (!tp->write_seq)
850 tp->write_seq = secure_tcp_sequence_number(sk->saddr, sk->daddr,
851 sk->sport, usin->sin_port);
852
853 sk->protinfo.af_inet.id = tp->write_seq^jiffies;
854
855 err = tcp_connect(sk);
856 if (err)
857 goto failure;
858
859 return 0;
860
861 failure:
862 tcp_set_state(sk, TCP_CLOSE);
863 __sk_dst_reset(sk);
864 sk->route_caps = 0;
865 sk->dport = 0;
866 return err;
867 }
868
869 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
870 {
871 return ((struct rtable*)skb->dst)->rt_iif;
872 }
873
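/* Hash a remote address/port pair into the listener's SYN queue,
 * salted with the per-listener hash_rnd.
 */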
874 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
875 {
876 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
877 }
878
879 static struct open_request *tcp_v4_search_req(struct tcp_opt *tp,
880 struct open_request ***prevp,
881 __u16 rport,
882 __u32 raddr, __u32 laddr)
883 {
884 struct tcp_listen_opt *lopt = tp->listen_opt;
885 struct open_request *req, **prev;
886
887 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
888 (req = *prev) != NULL;
889 prev = &req->dl_next) {
890 if (req->rmt_port == rport &&
891 req->af.v4_req.rmt_addr == raddr &&
892 req->af.v4_req.loc_addr == laddr &&
893 TCP_INET_FAMILY(req->class->family)) {
894 BUG_TRAP(req->sk == NULL);
895 *prevp = prev;
896 return req;
897 }
898 }
899
900 return NULL;
901 }
902
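/* Insert a fresh open_request at the head of its SYN-queue chain
 * under syn_wait_lock and account for it via tcp_synq_added().
 */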
903 static void tcp_v4_synq_add(struct sock *sk, struct open_request *req)
904 {
905 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
906 struct tcp_listen_opt *lopt = tp->listen_opt;
907 u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd);
908
909 req->expires = jiffies + TCP_TIMEOUT_INIT;
910 req->retrans = 0;
911 req->sk = NULL;
912 req->dl_next = lopt->syn_table[h];
913
914 write_lock(&tp->syn_wait_lock);
915 lopt->syn_table[h] = req;
916 write_unlock(&tp->syn_wait_lock);
917
918 tcp_synq_added(sk);
919 }
920
921
922 /*
923 * This routine does path mtu discovery as defined in RFC1191.
924 */
925 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *ip, unsigned mtu)
926 {
927 struct dst_entry *dst;
928 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
929
930 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
931 * sent out by Linux are always < 576 bytes so they should go through
932 * unfragmented).
933 */
934 if (sk->state == TCP_LISTEN)
935 return;
936
937 /* We don't check in the dst entry whether pmtu discovery is forbidden
938 * on this route. We just assume that no packet-too-big packets
939 * are sent back when pmtu discovery is not active.
940 * There is a small race when the user changes this flag in the
941 * route, but I think that's acceptable.
942 */
943 if ((dst = __sk_dst_check(sk, 0)) == NULL)
944 return;
945
946 ip_rt_update_pmtu(dst, mtu);
947
948 /* Something is about to go wrong... Remember the soft error
949 * in case this connection is not able to recover.
950 */
951 if (mtu < dst->pmtu && ip_dont_fragment(sk, dst))
952 sk->err_soft = EMSGSIZE;
953
954 if (sk->protinfo.af_inet.pmtudisc != IP_PMTUDISC_DONT &&
955 tp->pmtu_cookie > dst->pmtu) {
956 tcp_sync_mss(sk, dst->pmtu);
957
958 /* Resend the TCP packet because it's
959 * clear that the old packet has been
960 * dropped. This is the new "fast" path mtu
961 * discovery.
962 */
963 tcp_simple_retransmit(sk);
964 } /* else let the usual retransmit timer handle it */
965 }
966
967 /*
968 * This routine is called by the ICMP module when it gets some
969 * sort of error condition. If err < 0 then the socket should
970 * be closed and the error returned to the user. If err > 0
971 * it's just the icmp type << 8 | icmp code. After adjustment
972 * header points to the first 8 bytes of the tcp header. We need
973 * to find the appropriate port.
974 *
975 * The locking strategy used here is very "optimistic". When
976 * someone else accesses the socket the ICMP is just dropped
977 * and for some paths there is no check at all.
978 * A more general error queue to queue errors for later handling
979 * is probably better.
980 *
981 */
982
983 void tcp_v4_err(struct sk_buff *skb, u32 info)
984 {
985 struct iphdr *iph = (struct iphdr*)skb->data;
986 struct tcphdr *th = (struct tcphdr*)(skb->data+(iph->ihl<<2));
987 struct tcp_opt *tp;
988 int type = skb->h.icmph->type;
989 int code = skb->h.icmph->code;
990 struct sock *sk;
991 __u32 seq;
992 int err;
993
994 if (skb->len < (iph->ihl << 2) + 8) {
995 ICMP_INC_STATS_BH(IcmpInErrors);
996 return;
997 }
998
999 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, th->source, tcp_v4_iif(skb));
1000 if (sk == NULL) {
1001 ICMP_INC_STATS_BH(IcmpInErrors);
1002 return;
1003 }
1004 if (sk->state == TCP_TIME_WAIT) {
1005 tcp_tw_put((struct tcp_tw_bucket*)sk);
1006 return;
1007 }
1008
1009 bh_lock_sock(sk);
1010 /* If too many ICMPs get dropped on busy
1011 * servers this needs to be solved differently.
1012 */
1013 if (sk->lock.users != 0)
1014 NET_INC_STATS_BH(LockDroppedIcmps);
1015
1016 if (sk->state == TCP_CLOSE)
1017 goto out;
1018
1019 tp = &sk->tp_pinfo.af_tcp;
1020 seq = ntohl(th->seq);
1021 if (sk->state != TCP_LISTEN && !between(seq, tp->snd_una, tp->snd_nxt)) {
1022 NET_INC_STATS(OutOfWindowIcmps);
1023 goto out;
1024 }
1025
1026 switch (type) {
1027 case ICMP_SOURCE_QUENCH:
1028 /* Just silently ignore these. */
1029 goto out;
1030 case ICMP_PARAMETERPROB:
1031 err = EPROTO;
1032 break;
1033 case ICMP_DEST_UNREACH:
1034 if (code > NR_ICMP_UNREACH)
1035 goto out;
1036
1037 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
1038 if (sk->lock.users == 0)
1039 do_pmtu_discovery(sk, iph, info);
1040 goto out;
1041 }
1042
1043 err = icmp_err_convert[code].errno;
1044 break;
1045 case ICMP_TIME_EXCEEDED:
1046 err = EHOSTUNREACH;
1047 break;
1048 default:
1049 goto out;
1050 }
1051
1052 switch (sk->state) {
1053 struct open_request *req, **prev;
1054 case TCP_LISTEN:
1055 if (sk->lock.users != 0)
1056 goto out;
1057
1058 req = tcp_v4_search_req(tp, &prev,
1059 th->dest,
1060 iph->daddr, iph->saddr);
1061 if (!req)
1062 goto out;
1063
1064 /* ICMPs are not backlogged, hence we cannot get
1065 an established socket here.
1066 */
1067 BUG_TRAP(req->sk == NULL);
1068
1069 if (seq != req->snt_isn) {
1070 NET_INC_STATS_BH(OutOfWindowIcmps);
1071 goto out;
1072 }
1073
1074 /*
1075 * Still in SYN_RECV, just remove it silently.
1076 * There is no good way to pass the error to the newly
1077 * created socket, and POSIX does not want network
1078 * errors returned from accept().
1079 */
1080 tcp_synq_drop(sk, req, prev);
1081 goto out;
1082
1083 case TCP_SYN_SENT:
1084 case TCP_SYN_RECV: /* Cannot happen normally.
1085 It can, f.e., if SYNs crossed.
1086 */
1087 if (sk->lock.users == 0) {
1088 TCP_INC_STATS_BH(TcpAttemptFails);
1089 sk->err = err;
1090
1091 sk->error_report(sk);
1092
1093 tcp_done(sk);
1094 } else {
1095 sk->err_soft = err;
1096 }
1097 goto out;
1098 }
1099
1100 /* If we've already connected we will keep trying
1101 * until we time out, or the user gives up.
1102 *
1103 * rfc1122 4.2.3.9 allows us to consider as hard errors
1104 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
1105 * but it is obsoleted by pmtu discovery).
1106 *
1107 * Note that in the modern internet, where routing is unreliable
1108 * and broken firewalls sit in every dark corner sending random
1109 * errors ordered by their masters, even these two messages finally lose
1110 * their original sense (even Linux sends invalid PORT_UNREACHs)
1111 *
1112 * Now we are in compliance with RFCs.
1113 * --ANK (980905)
1114 */
1115
1116 if (sk->lock.users == 0 && sk->protinfo.af_inet.recverr) {
1117 sk->err = err;
1118 sk->error_report(sk);
1119 } else { /* Only an error on timeout */
1120 sk->err_soft = err;
1121 }
1122
1123 out:
1124 bh_unlock_sock(sk);
1125 sock_put(sk);
1126 }
1127
1128 /* This routine computes an IPv4 TCP checksum. */
1129 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
1130 struct sk_buff *skb)
1131 {
1132 if (skb->ip_summed == CHECKSUM_HW) {
1133 th->check = ~tcp_v4_check(th, len, sk->saddr, sk->daddr, 0);
1134 skb->csum = offsetof(struct tcphdr, check);
1135 } else {
1136 th->check = tcp_v4_check(th, len, sk->saddr, sk->daddr,
1137 csum_partial((char *)th, th->doff<<2, skb->csum));
1138 }
1139 }
1140
1141 /*
1142 * This routine will send an RST to the other tcp.
1143 *
1144 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)
1145 * for the reset?
1146 * Answer: if a packet caused a RST, it was not for a socket
1147 * existing in our system; if it matched a socket,
1148 * it was just a duplicate segment or a bug in the other side's TCP.
1149 * So we build the reply based only on the parameters
1150 * that arrived with the segment.
1151 * Exception: precedence violation. We do not implement it in any case.
1152 */
1153
1154 static void tcp_v4_send_reset(struct sk_buff *skb)
1155 {
1156 struct tcphdr *th = skb->h.th;
1157 struct tcphdr rth;
1158 struct ip_reply_arg arg;
1159
1160 /* Never send a reset in response to a reset. */
1161 if (th->rst)
1162 return;
1163
1164 if (((struct rtable*)skb->dst)->rt_type != RTN_LOCAL)
1165 return;
1166
1167 /* Swap the send and the receive. */
1168 memset(&rth, 0, sizeof(struct tcphdr));
1169 rth.dest = th->source;
1170 rth.source = th->dest;
1171 rth.doff = sizeof(struct tcphdr)/4;
1172 rth.rst = 1;
1173
1174 if (th->ack) {
1175 rth.seq = th->ack_seq;
1176 } else {
1177 rth.ack = 1;
1178 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin
1179 + skb->len - (th->doff<<2));
1180 }
1181
1182 memset(&arg, 0, sizeof arg);
1183 arg.iov[0].iov_base = (unsigned char *)&rth;
1184 arg.iov[0].iov_len = sizeof rth;
1185 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1186 skb->nh.iph->saddr, /*XXX*/
1187 sizeof(struct tcphdr),
1188 IPPROTO_TCP,
1189 0);
1190 arg.n_iov = 1;
1191 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1192
1193 tcp_socket->sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl;
1194 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
1195
1196 TCP_INC_STATS_BH(TcpOutSegs);
1197 TCP_INC_STATS_BH(TcpOutRsts);
1198 }
1199
1200 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
1201 outside of socket context, is certainly ugly. What can I do?
1202 */
1203
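/* Build and send a bare ACK (optionally carrying a timestamp option)
 * in reply to skb, using the private tcp_socket rather than any
 * connection's own socket.
 */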
1204 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts)
1205 {
1206 struct tcphdr *th = skb->h.th;
1207 struct {
1208 struct tcphdr th;
1209 u32 tsopt[3];
1210 } rep;
1211 struct ip_reply_arg arg;
1212
1213 memset(&rep.th, 0, sizeof(struct tcphdr));
1214 memset(&arg, 0, sizeof arg);
1215
1216 arg.iov[0].iov_base = (unsigned char *)&rep;
1217 arg.iov[0].iov_len = sizeof(rep.th);
1218 arg.n_iov = 1;
1219 if (ts) {
1220 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) |
1221 (TCPOPT_NOP << 16) |
1222 (TCPOPT_TIMESTAMP << 8) |
1223 TCPOLEN_TIMESTAMP);
1224 rep.tsopt[1] = htonl(tcp_time_stamp);
1225 rep.tsopt[2] = htonl(ts);
1226 arg.iov[0].iov_len = sizeof(rep);
1227 }
1228
1229 /* Swap the send and the receive. */
1230 rep.th.dest = th->source;
1231 rep.th.source = th->dest;
1232 rep.th.doff = arg.iov[0].iov_len/4;
1233 rep.th.seq = htonl(seq);
1234 rep.th.ack_seq = htonl(ack);
1235 rep.th.ack = 1;
1236 rep.th.window = htons(win);
1237
1238 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
1239 skb->nh.iph->saddr, /*XXX*/
1240 arg.iov[0].iov_len,
1241 IPPROTO_TCP,
1242 0);
1243 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
1244
1245 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1246
1247 TCP_INC_STATS_BH(TcpOutSegs);
1248 }
1249
1250 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1251 {
1252 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1253
1254 tcp_v4_send_ack(skb, tw->snd_nxt, tw->rcv_nxt,
1255 tw->rcv_wnd>>tw->rcv_wscale, tw->ts_recent);
1256
1257 tcp_tw_put(tw);
1258 }
1259
1260 static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req)
1261 {
1262 tcp_v4_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd,
1263 req->ts_recent);
1264 }
1265
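/* Resolve a route for replying to an open_request, honouring any
 * source-route option saved from the SYN. Returns NULL (and bumps
 * IpOutNoRoutes) when no usable route exists.
 */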
1266 static struct dst_entry* tcp_v4_route_req(struct sock *sk, struct open_request *req)
1267 {
1268 struct rtable *rt;
1269 struct ip_options *opt;
1270
1271 opt = req->af.v4_req.opt;
1272 if(ip_route_output(&rt, ((opt && opt->srr) ?
1273 opt->faddr :
1274 req->af.v4_req.rmt_addr),
1275 req->af.v4_req.loc_addr,
1276 RT_CONN_FLAGS(sk), sk->bound_dev_if)) {
1277 IP_INC_STATS_BH(IpOutNoRoutes);
1278 return NULL;
1279 }
1280 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1281 ip_rt_put(rt);
1282 IP_INC_STATS_BH(IpOutNoRoutes);
1283 return NULL;
1284 }
1285 return &rt->u.dst;
1286 }
1287
1288 /*
1289 * Send a SYN-ACK after having received a SYN.
1290 * This still operates on an open_request only, not on a big
1291 * socket.
1292 */
1293 static int tcp_v4_send_synack(struct sock *sk, struct open_request *req,
1294 struct dst_entry *dst)
1295 {
1296 int err = -1;
1297 struct sk_buff * skb;
1298
1299 /* First, grab a route. */
1300 if (dst == NULL &&
1301 (dst = tcp_v4_route_req(sk, req)) == NULL)
1302 goto out;
1303
1304 skb = tcp_make_synack(sk, dst, req);
1305
1306 if (skb) {
1307 struct tcphdr *th = skb->h.th;
1308
1309 th->check = tcp_v4_check(th, skb->len,
1310 req->af.v4_req.loc_addr, req->af.v4_req.rmt_addr,
1311 csum_partial((char *)th, skb->len, skb->csum));
1312
1313 err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr,
1314 req->af.v4_req.rmt_addr, req->af.v4_req.opt);
1315 if (err == NET_XMIT_CN)
1316 err = 0;
1317 }
1318
1319 out:
1320 dst_release(dst);
1321 return err;
1322 }
1323
1324 /*
1325 * IPv4 open_request destructor.
1326 */
1327 static void tcp_v4_or_free(struct open_request *req)
1328 {
1329 if (req->af.v4_req.opt)
1330 kfree(req->af.v4_req.opt);
1331 }
1332
1333 static inline void syn_flood_warning(struct sk_buff *skb)
1334 {
1335 static unsigned long warntime;
1336
1337 if (jiffies - warntime > HZ*60) {
1338 warntime = jiffies;
1339 printk(KERN_INFO
1340 "possible SYN flooding on port %d. Sending cookies.\n",
1341 ntohs(skb->h.th->dest));
1342 }
1343 }
1344
1345 /*
1346 * Save and compile IPv4 options into the open_request if needed.
1347 */
1348 static inline struct ip_options *
1349 tcp_v4_save_options(struct sock *sk, struct sk_buff *skb)
1350 {
1351 struct ip_options *opt = &(IPCB(skb)->opt);
1352 struct ip_options *dopt = NULL;
1353
1354 if (opt && opt->optlen) {
1355 int opt_size = optlength(opt);
1356 dopt = kmalloc(opt_size, GFP_ATOMIC);
1357 if (dopt) {
1358 if (ip_options_echo(dopt, skb)) {
1359 kfree(dopt);
1360 dopt = NULL;
1361 }
1362 }
1363 }
1364 return dopt;
1365 }
1366
1367 /*
1368 * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
1369 * One SYN_RECV socket costs about 80 bytes on a 32-bit machine.
1370 * It would be better to replace it with a global counter for all sockets
1371 * but then some measure against one socket starving all other sockets
1372 * would be needed.
1373 *
1374 * It was 128 by default. Experiments with real servers show that
1375 * it is absolutely not enough even at 100 conn/sec. 256 cures most
1376 * of the problems. This value is adjusted to 128 for very small machines
1377 * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
1378 * Increasing it further requires changing the hash table size.
1379 */
1380 int sysctl_max_syn_backlog = 256;
1381
1382 struct or_calltable or_ipv4 = {
1383 PF_INET,
1384 tcp_v4_send_synack,
1385 tcp_v4_or_send_ack,
1386 tcp_v4_or_free,
1387 tcp_v4_send_reset
1388 };
1389
1390 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1391 {
1392 struct tcp_opt tp;
1393 struct open_request *req;
1394 __u32 saddr = skb->nh.iph->saddr;
1395 __u32 daddr = skb->nh.iph->daddr;
1396 __u32 isn = TCP_SKB_CB(skb)->when;
1397 struct dst_entry *dst = NULL;
1398 #ifdef CONFIG_SYN_COOKIES
1399 int want_cookie = 0;
1400 #else
1401 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1402 #endif
1403
1404 /* Never answer SYNs sent to broadcast or multicast */
1405 if (((struct rtable *)skb->dst)->rt_flags &
1406 (RTCF_BROADCAST|RTCF_MULTICAST))
1407 goto drop;
1408
1409 /* TW buckets are converted to open requests without
1410 * limitation, since they conserve resources and the peer is
1411 * evidently a real one.
1412 */
1413 if (tcp_synq_is_full(sk) && !isn) {
1414 #ifdef CONFIG_SYN_COOKIES
1415 if (sysctl_tcp_syncookies) {
1416 want_cookie = 1;
1417 } else
1418 #endif
1419 goto drop;
1420 }
1421
1422 /* The accept backlog is full. If we have already queued enough
1423 * warm entries in the syn queue, drop the request. It is better than
1424 * clogging syn queue with openreqs with exponentially increasing
1425 * timeout.
1426 */
1427 if (tcp_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1428 goto drop;
1429
1430 req = tcp_openreq_alloc();
1431 if (req == NULL)
1432 goto drop;
1433
1434 tcp_clear_options(&tp);
1435 tp.mss_clamp = 536;
1436 tp.user_mss = sk->tp_pinfo.af_tcp.user_mss;
1437
1438 tcp_parse_options(skb, &tp, 0);
1439
1440 if (want_cookie) {
1441 tcp_clear_options(&tp);
1442 tp.saw_tstamp = 0;
1443 }
1444
1445 if (tp.saw_tstamp && tp.rcv_tsval == 0) {
1446 /* Some OSes (unknown ones, but I see them on a web server which
1447 * contains information interesting only for windows
1448 * users) do not send their stamp in the SYN. It is an easy case:
1449 * we simply do not advertise TS support.
1450 */
1451 tp.saw_tstamp = 0;
1452 tp.tstamp_ok = 0;
1453 }
1454 tp.tstamp_ok = tp.saw_tstamp;
1455
1456 tcp_openreq_init(req, &tp, skb);
1457
1458 req->af.v4_req.loc_addr = daddr;
1459 req->af.v4_req.rmt_addr = saddr;
1460 req->af.v4_req.opt = tcp_v4_save_options(sk, skb);
1461 req->class = &or_ipv4;
1462 if (!want_cookie)
1463 TCP_ECN_create_request(req, skb->h.th);
1464
1465 if (want_cookie) {
1466 #ifdef CONFIG_SYN_COOKIES
1467 syn_flood_warning(skb);
1468 #endif
1469 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1470 } else if (isn == 0) {
1471 struct inet_peer *peer = NULL;
1472
1473 /* VJ's idea. We save last timestamp seen
1474 * from the destination in peer table, when entering
1475 * state TIME-WAIT, and check against it before
1476 * accepting a new connection request.
1477 *
1478 * If "isn" is not zero, this request hit an alive
1479 * timewait bucket, so that all the necessary checks
1480 * are made in the function processing timewait state.
1481 */
1482 if (tp.saw_tstamp &&
1483 sysctl_tcp_tw_recycle &&
1484 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1485 (peer = rt_get_peer((struct rtable*)dst)) != NULL &&
1486 peer->v4daddr == saddr) {
1487 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1488 (s32)(peer->tcp_ts - req->ts_recent) > TCP_PAWS_WINDOW) {
1489 NET_INC_STATS_BH(PAWSPassiveRejected);
1490 dst_release(dst);
1491 goto drop_and_free;
1492 }
1493 }
1494 /* Kill the following clause, if you dislike this way. */
1495 else if (!sysctl_tcp_syncookies &&
1496 (sysctl_max_syn_backlog - tcp_synq_len(sk)
1497 < (sysctl_max_syn_backlog>>2)) &&
1498 (!peer || !peer->tcp_ts_stamp) &&
1499 (!dst || !dst->rtt)) {
1500 /* Without syncookies the last quarter of the
1501 * backlog is filled with destinations proven to be alive.
1502 * It means that we continue to communicate
1503 * with destinations already remembered
1504 * by the moment of the synflood.
1505 */
1506 NETDEBUG(if (net_ratelimit()) \
1507 printk(KERN_DEBUG "TCP: drop open request from %u.%u.%u.%u/%u\n", \
1508 NIPQUAD(saddr), ntohs(skb->h.th->source)));
1509 dst_release(dst);
1510 goto drop_and_free;
1511 }
1512
1513 isn = tcp_v4_init_sequence(sk, skb);
1514 }
1515 req->snt_isn = isn;
1516
1517 if (tcp_v4_send_synack(sk, req, dst))
1518 goto drop_and_free;
1519
1520 if (want_cookie) {
1521 tcp_openreq_free(req);
1522 } else {
1523 tcp_v4_synq_add(sk, req);
1524 }
1525 return 0;
1526
1527 drop_and_free:
1528 tcp_openreq_free(req);
1529 drop:
1530 TCP_INC_STATS_BH(TcpAttemptFails);
1531 return 0;
1532 }
1533
1534
1535 /*
1536 * The three way handshake has completed - we got a valid ACK -
1537 * now create the new socket.
1538 */
1539 struct sock * tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1540 struct open_request *req,
1541 struct dst_entry *dst)
1542 {
1543 struct tcp_opt *newtp;
1544 struct sock *newsk;
1545
1546 if (tcp_acceptq_is_full(sk))
1547 goto exit_overflow;
1548
1549 if (dst == NULL &&
1550 (dst = tcp_v4_route_req(sk, req)) == NULL)
1551 goto exit;
1552
1553 newsk = tcp_create_openreq_child(sk, req, skb);
1554 if (!newsk)
1555 goto exit;
1556
1557 newsk->dst_cache = dst;
1558 newsk->route_caps = dst->dev->features;
1559
1560 newtp = &(newsk->tp_pinfo.af_tcp);
1561 newsk->daddr = req->af.v4_req.rmt_addr;
1562 newsk->saddr = req->af.v4_req.loc_addr;
1563 newsk->rcv_saddr = req->af.v4_req.loc_addr;
1564 newsk->protinfo.af_inet.opt = req->af.v4_req.opt;
1565 req->af.v4_req.opt = NULL;
1566 newsk->protinfo.af_inet.mc_index = tcp_v4_iif(skb);
1567 newsk->protinfo.af_inet.mc_ttl = skb->nh.iph->ttl;
1568 newtp->ext_header_len = 0;
1569 if (newsk->protinfo.af_inet.opt)
1570 newtp->ext_header_len = newsk->protinfo.af_inet.opt->optlen;
1571 newsk->protinfo.af_inet.id = newtp->write_seq^jiffies;
1572
1573 tcp_sync_mss(newsk, dst->pmtu);
1574 newtp->advmss = dst->advmss;
1575 tcp_initialize_rcv_mss(newsk);
1576
1577 __tcp_v4_hash(newsk, 0);
1578 __tcp_inherit_port(sk, newsk);
1579
1580 return newsk;
1581
1582 exit_overflow:
1583 NET_INC_STATS_BH(ListenOverflows);
1584 exit:
1585 NET_INC_STATS_BH(ListenDrops);
1586 dst_release(dst);
1587 return NULL;
1588 }
1589
1590 static struct sock *tcp_v4_hnd_req(struct sock *sk,struct sk_buff *skb)
1591 {
1592 struct open_request *req, **prev;
1593 struct tcphdr *th = skb->h.th;
1594 struct iphdr *iph = skb->nh.iph;
1595 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1596 struct sock *nsk;
1597
1598 /* Find possible connection requests. */
1599 req = tcp_v4_search_req(tp, &prev,
1600 th->source,
1601 iph->saddr, iph->daddr);
1602 if (req)
1603 return tcp_check_req(sk, skb, req, prev);
1604
1605 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1606 th->source,
1607 skb->nh.iph->daddr,
1608 ntohs(th->dest),
1609 tcp_v4_iif(skb));
1610
1611 if (nsk) {
1612 if (nsk->state != TCP_TIME_WAIT) {
1613 bh_lock_sock(nsk);
1614 return nsk;
1615 }
1616 tcp_tw_put((struct tcp_tw_bucket*)nsk);
1617 return NULL;
1618 }
1619
1620 #ifdef CONFIG_SYN_COOKIES
1621 if (!th->rst && !th->syn && th->ack)
1622 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1623 #endif
1624 return sk;
1625 }
1626
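/* Verify or defer the TCP checksum of an incoming segment, depending
 * on whether the hardware already summed it and on the segment size.
 */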
1627 static int tcp_v4_checksum_init(struct sk_buff *skb)
1628 {
1629 if (skb->ip_summed == CHECKSUM_HW) {
1630 skb->ip_summed = CHECKSUM_UNNECESSARY;
1631 if (!tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1632 skb->nh.iph->daddr,skb->csum))
1633 return 0;
1634
1635 NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1636 skb->ip_summed = CHECKSUM_NONE;
1637 }
1638 if (skb->len <= 76) {
1639 if (tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1640 skb->nh.iph->daddr,
1641 skb_checksum(skb, 0, skb->len, 0)))
1642 return -1;
1643 skb->ip_summed = CHECKSUM_UNNECESSARY;
1644 } else {
1645 skb->csum = ~tcp_v4_check(skb->h.th,skb->len,skb->nh.iph->saddr,
1646 skb->nh.iph->daddr,0);
1647 }
1648 return 0;
1649 }
1650
1651
1652 /* The socket must have its spinlock held when we get
1653 * here.
1654 *
1655 * We have a potential double-lock case here, so even when
1656 * doing backlog processing we use the BH locking scheme.
1657 * This is because we cannot sleep with the original spinlock
1658 * held.
1659 */
1660 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1661 {
1662 IP_INC_STATS_BH(IpInDelivers);
1663
1664 if (sk->state == TCP_ESTABLISHED) { /* Fast path */
1665 TCP_CHECK_TIMER(sk);
1666 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1667 goto reset;
1668 TCP_CHECK_TIMER(sk);
1669 return 0;
1670 }
1671
1672 if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb))
1673 goto csum_err;
1674
1675 if (sk->state == TCP_LISTEN) {
1676 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1677 if (!nsk)
1678 goto discard;
1679
1680 if (nsk != sk) {
1681 if (tcp_child_process(sk, nsk, skb))
1682 goto reset;
1683 return 0;
1684 }
1685 }
1686
1687 TCP_CHECK_TIMER(sk);
1688 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1689 goto reset;
1690 TCP_CHECK_TIMER(sk);
1691 return 0;
1692
1693 reset:
1694 tcp_v4_send_reset(skb);
1695 discard:
1696 kfree_skb(skb);
1697 /* Be careful here. If this function gets more complicated and
1698 * gcc suffers from register pressure on the x86, sk (in %ebx)
1699 * might be destroyed here. This current version compiles correctly,
1700 * but you have been warned.
1701 */
1702 return 0;
1703
1704 csum_err:
1705 TCP_INC_STATS_BH(TcpInErrs);
1706 goto discard;
1707 }
1708
1709 /*
1710 * From tcp_input.c
1711 */
1712
1713 int tcp_v4_rcv(struct sk_buff *skb)
1714 {
1715 struct tcphdr *th;
1716 struct sock *sk;
1717 int ret;
1718
1719 if (skb->pkt_type!=PACKET_HOST)
1720 goto discard_it;
1721
1722 /* Count it even if it's bad */
1723 TCP_INC_STATS_BH(TcpInSegs);
1724
1725 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1726 goto discard_it;
1727
1728 th = skb->h.th;
1729
1730 if (th->doff < sizeof(struct tcphdr)/4)
1731 goto bad_packet;
1732 if (!pskb_may_pull(skb, th->doff*4))
1733 goto discard_it;
1734
1735 /* An explanation is required here, I think.
1736 * Packet length and doff are validated by header prediction,
1737 * provided the case of th->doff==0 is eliminated.
1738 * So, we defer the checks. */
1739 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1740 tcp_v4_checksum_init(skb) < 0))
1741 goto bad_packet;
1742
1743 th = skb->h.th;
1744 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1745 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1746 skb->len - th->doff*4);
1747 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1748 TCP_SKB_CB(skb)->when = 0;
1749 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1750 TCP_SKB_CB(skb)->sacked = 0;
1751
1752 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1753 skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1754
1755 if (!sk)
1756 goto no_tcp_socket;
1757
1758 process:
1759 if(!ipsec_sk_policy(sk,skb))
1760 goto discard_and_relse;
1761
1762 if (sk->state == TCP_TIME_WAIT)
1763 goto do_time_wait;
1764
1765 if (sk_filter(sk, skb, 0))
1766 goto discard_and_relse;
1767
1768 skb->dev = NULL;
1769
1770 bh_lock_sock(sk);
1771 ret = 0;
1772 if (!sk->lock.users) {
1773 if (!tcp_prequeue(sk, skb))
1774 ret = tcp_v4_do_rcv(sk, skb);
1775 } else
1776 sk_add_backlog(sk, skb);
1777 bh_unlock_sock(sk);
1778
1779 sock_put(sk);
1780
1781 return ret;
1782
1783 no_tcp_socket:
1784 if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1785 bad_packet:
1786 TCP_INC_STATS_BH(TcpInErrs);
1787 } else {
1788 tcp_v4_send_reset(skb);
1789 }
1790
1791 discard_it:
1792 /* Discard frame. */
1793 kfree_skb(skb);
1794 return 0;
1795
1796 discard_and_relse:
1797 sock_put(sk);
1798 goto discard_it;
1799
1800 do_time_wait:
1801 if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) {
1802 TCP_INC_STATS_BH(TcpInErrs);
1803 tcp_tw_put((struct tcp_tw_bucket *) sk);
1804 goto discard_it;
1805 }
1806 switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1807 skb, th, skb->len)) {
1808 case TCP_TW_SYN:
1809 {
1810 struct sock *sk2;
1811
1812 sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), tcp_v4_iif(skb));
1813 if (sk2 != NULL) {
1814 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1815 tcp_timewait_kill((struct tcp_tw_bucket *)sk);
1816 tcp_tw_put((struct tcp_tw_bucket *)sk);
1817 sk = sk2;
1818 goto process;
1819 }
1820 /* Fall through to ACK */
1821 }
1822 case TCP_TW_ACK:
1823 tcp_v4_timewait_ack(sk, skb);
1824 break;
1825 case TCP_TW_RST:
1826 goto no_tcp_socket;
1827 case TCP_TW_SUCCESS:;
1828 }
1829 goto discard_it;
1830 }
1831
1832 /* With per-bucket locks this operation is not atomic, so
1833 * this version is no worse.
1834 */
1835 static void __tcp_v4_rehash(struct sock *sk)
1836 {
1837 sk->prot->unhash(sk);
1838 sk->prot->hash(sk);
1839 }
1840
1841 static int tcp_v4_reselect_saddr(struct sock *sk)
1842 {
1843 int err;
1844 struct rtable *rt;
1845 __u32 old_saddr = sk->saddr;
1846 __u32 new_saddr;
1847 __u32 daddr = sk->daddr;
1848
1849 if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1850 daddr = sk->protinfo.af_inet.opt->faddr;
1851
1852 /* Query new route. */
1853 err = ip_route_connect(&rt, daddr, 0,
1854 RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute,
1855 sk->bound_dev_if);
1856 if (err)
1857 return err;
1858
1859 __sk_dst_set(sk, &rt->u.dst);
1860 sk->route_caps = rt->u.dst.dev->features;
1861
1862 new_saddr = rt->rt_src;
1863
1864 if (new_saddr == old_saddr)
1865 return 0;
1866
1867 if (sysctl_ip_dynaddr > 1) {
1868 printk(KERN_INFO "tcp_v4_rebuild_header(): shifting sk->saddr "
1869 "from %d.%d.%d.%d to %d.%d.%d.%d\n",
1870 NIPQUAD(old_saddr),
1871 NIPQUAD(new_saddr));
1872 }
1873
1874 sk->saddr = new_saddr;
1875 sk->rcv_saddr = new_saddr;
1876
1877 /* XXX The only ugly spot where we need to
1878 * XXX really change the sockets identity after
1879 * XXX it has entered the hashes. -DaveM
1880 *
1881 * Besides that, it does not check for connection
1882 * uniqueness. Wait for troubles.
1883 */
1884 __tcp_v4_rehash(sk);
1885 return 0;
1886 }
1887
1888 int tcp_v4_rebuild_header(struct sock *sk)
1889 {
1890 struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0);
1891 u32 daddr;
1892 int err;
1893
1894 /* Route is OK, nothing to do. */
1895 if (rt != NULL)
1896 return 0;
1897
1898 /* Reroute. */
1899 daddr = sk->daddr;
1900 if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr)
1901 daddr = sk->protinfo.af_inet.opt->faddr;
1902
1903 err = ip_route_output(&rt, daddr, sk->saddr,
1904 RT_CONN_FLAGS(sk), sk->bound_dev_if);
1905 if (!err) {
1906 __sk_dst_set(sk, &rt->u.dst);
1907 sk->route_caps = rt->u.dst.dev->features;
1908 return 0;
1909 }
1910
1911 /* Routing failed... */
1912 sk->route_caps = 0;
1913
1914 if (!sysctl_ip_dynaddr ||
1915 sk->state != TCP_SYN_SENT ||
1916 (sk->userlocks & SOCK_BINDADDR_LOCK) ||
1917 (err = tcp_v4_reselect_saddr(sk)) != 0)
1918 sk->err_soft=-err;
1919
1920 return err;
1921 }
1922
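/* Report the peer's address and port through a sockaddr_in. */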
1923 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1924 {
1925 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1926
1927 sin->sin_family = AF_INET;
1928 sin->sin_addr.s_addr = sk->daddr;
1929 sin->sin_port = sk->dport;
1930 }
1931
1932 /* VJ's idea. Save last timestamp seen from this destination
1933 * and hold it at least for the normal timewait interval to use for duplicate
1934 * segment detection in subsequent connections, before they enter synchronized
1935 * state.
1936 */
1937
1938 int tcp_v4_remember_stamp(struct sock *sk)
1939 {
1940 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
1941 struct rtable *rt = (struct rtable*)__sk_dst_get(sk);
1942 struct inet_peer *peer = NULL;
1943 int release_it = 0;
1944
1945 if (rt == NULL || rt->rt_dst != sk->daddr) {
1946 peer = inet_getpeer(sk->daddr, 1);
1947 release_it = 1;
1948 } else {
1949 if (rt->peer == NULL)
1950 rt_bind_peer(rt, 1);
1951 peer = rt->peer;
1952 }
1953
1954 if (peer) {
1955 if ((s32)(peer->tcp_ts - tp->ts_recent) <= 0 ||
1956 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1957 peer->tcp_ts_stamp <= tp->ts_recent_stamp)) {
1958 peer->tcp_ts_stamp = tp->ts_recent_stamp;
1959 peer->tcp_ts = tp->ts_recent;
1960 }
1961 if (release_it)
1962 inet_putpeer(peer);
1963 return 1;
1964 }
1965
1966 return 0;
1967 }
1968
1969 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1970 {
1971 struct inet_peer *peer = NULL;
1972
1973 peer = inet_getpeer(tw->daddr, 1);
1974
1975 if (peer) {
1976 if ((s32)(peer->tcp_ts - tw->ts_recent) <= 0 ||
1977 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1978 peer->tcp_ts_stamp <= tw->ts_recent_stamp)) {
1979 peer->tcp_ts_stamp = tw->ts_recent_stamp;
1980 peer->tcp_ts = tw->ts_recent;
1981 }
1982 inet_putpeer(peer);
1983 return 1;
1984 }
1985
1986 return 0;
1987 }
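/*
 * Illustrative sketch (not from the original source): the values cached by
 * tcp_v4_remember_stamp()/tcp_v4_tw_remember_stamp() are intended to be read
 * back when a later connection to the same peer is set up, so that PAWS can
 * reject old duplicate segments before the new connection has learned a
 * timestamp of its own.  The helper below is hypothetical; the real consumer
 * lives in the connect-side code and may differ in detail.
 */
#if 0
static void example_seed_ts_from_peer(struct tcp_opt *tp, struct inet_peer *peer)
{
	/* Adopt the cached timestamp only if we have none of our own yet. */
	if (tp->ts_recent_stamp == 0 && peer->tcp_ts_stamp != 0) {
		tp->ts_recent_stamp = peer->tcp_ts_stamp;
		tp->ts_recent = peer->tcp_ts;
	}
}
#endif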
1988
1989 struct tcp_func ipv4_specific = {
1990 ip_queue_xmit,
1991 tcp_v4_send_check,
1992 tcp_v4_rebuild_header,
1993 tcp_v4_conn_request,
1994 tcp_v4_syn_recv_sock,
1995 tcp_v4_remember_stamp,
1996 sizeof(struct iphdr),
1997
1998 ip_setsockopt,
1999 ip_getsockopt,
2000 v4_addr2sockaddr,
2001 sizeof(struct sockaddr_in)
2002 };
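/*
 * Editorial sketch: the positional initializer above is easier to audit when
 * written with gcc's labeled-field syntax, as tcp_prot is further down.  The
 * field names below are assumed from struct tcp_func in include/net/tcp.h and
 * should be double-checked against that header; this block is illustrative
 * only and is not compiled.
 */
#if 0
struct tcp_func ipv4_specific_labeled = {
	queue_xmit:	ip_queue_xmit,
	send_check:	tcp_v4_send_check,
	rebuild_header:	tcp_v4_rebuild_header,
	conn_request:	tcp_v4_conn_request,
	syn_recv_sock:	tcp_v4_syn_recv_sock,
	remember_stamp:	tcp_v4_remember_stamp,
	net_header_len:	sizeof(struct iphdr),
	setsockopt:	ip_setsockopt,
	getsockopt:	ip_getsockopt,
	addr2sockaddr:	v4_addr2sockaddr,
	sockaddr_len:	sizeof(struct sockaddr_in)
};
#endif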
2003
2004 /* NOTE: A lot of fields are set to zero explicitly by the call to
2005  * sk_alloc(), so they need not be initialised here.
2006  */
2007 static int tcp_v4_init_sock(struct sock *sk)
2008 {
2009 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2010
2011 skb_queue_head_init(&tp->out_of_order_queue);
2012 tcp_init_xmit_timers(sk);
2013 tcp_prequeue_init(tp);
2014
2015 tp->rto = TCP_TIMEOUT_INIT;
2016 tp->mdev = TCP_TIMEOUT_INIT;
2017
2018 /* So many TCP implementations out there (incorrectly) count the
2019 * initial SYN frame in their delayed-ACK and congestion control
2020 * algorithms that we must have the following bandaid to talk
2021 * efficiently to them. -DaveM
2022 */
2023 tp->snd_cwnd = 2;
2024
2025 /* See draft-stevens-tcpca-spec-01 for discussion of the
2026 * initialization of these values.
2027 */
2028 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
2029 tp->snd_cwnd_clamp = ~0;
2030 tp->mss_cache = 536;
2031
2032 tp->reordering = sysctl_tcp_reordering;
2033
2034 sk->state = TCP_CLOSE;
2035
2036 sk->write_space = tcp_write_space;
2037 sk->use_write_queue = 1;
2038
2039 sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
2040
2041 sk->sndbuf = sysctl_tcp_wmem[1];
2042 sk->rcvbuf = sysctl_tcp_rmem[1];
2043
2044 atomic_inc(&tcp_sockets_allocated);
2045
2046 return 0;
2047 }
2048
2049 static int tcp_v4_destroy_sock(struct sock *sk)
2050 {
2051 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2052
2053 tcp_clear_xmit_timers(sk);
2054
2055 /* Clean up the write buffer. */
2056 tcp_writequeue_purge(sk);
2057
2058 /* Clean up our (hopefully empty) out_of_order_queue. */
2059 __skb_queue_purge(&tp->out_of_order_queue);
2060
2061 /* Clean the prequeue; it really should be empty already. */
2062 __skb_queue_purge(&tp->ucopy.prequeue);
2063
2064 /* Clean up a referenced TCP bind bucket. */
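	/* (sk->prev doubles as the pointer to the tcp_bind_bucket while the
	 *  socket owns a local port; see the bind-hash helpers earlier in
	 *  this file.)
	 */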
2065 if(sk->prev != NULL)
2066 tcp_put_port(sk);
2067
2068 /* If sendmsg cached page exists, toss it. */
2069 if (tp->sndmsg_page != NULL)
2070 __free_page(tp->sndmsg_page);
2071
2072 atomic_dec(&tcp_sockets_allocated);
2073
2074 return 0;
2075 }
2076
2077 /* Proc filesystem TCP sock list dumping. */
2078 static void get_openreq(struct sock *sk, struct open_request *req, char *tmpbuf, int i, int uid)
2079 {
2080 int ttd = req->expires - jiffies;
2081
2082 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2083 " %02X %08X:%08X %02X:%08X %08X %5d %8d %u %d %p",
2084 i,
2085 req->af.v4_req.loc_addr,
2086 ntohs(sk->sport),
2087 req->af.v4_req.rmt_addr,
2088 ntohs(req->rmt_port),
2089 TCP_SYN_RECV,
2090 0,0, /* could print option size, but that is af dependent. */
2091 1, /* timers active (only the expire timer) */
2092 ttd,
2093 req->retrans,
2094 uid,
2095 0, /* non standard timer */
2096 0, /* open_requests have no inode */
2097 atomic_read(&sk->refcnt),
2098 req
2099 );
2100 }
2101
2102 static void get_tcp_sock(struct sock *sp, char *tmpbuf, int i)
2103 {
2104 unsigned int dest, src;
2105 __u16 destp, srcp;
2106 int timer_active;
2107 unsigned long timer_expires;
2108 struct tcp_opt *tp = &sp->tp_pinfo.af_tcp;
2109
2110 dest = sp->daddr;
2111 src = sp->rcv_saddr;
2112 destp = ntohs(sp->dport);
2113 srcp = ntohs(sp->sport);
2114 if (tp->pending == TCP_TIME_RETRANS) {
2115 timer_active = 1;
2116 timer_expires = tp->timeout;
2117 } else if (tp->pending == TCP_TIME_PROBE0) {
2118 timer_active = 4;
2119 timer_expires = tp->timeout;
2120 } else if (timer_pending(&sp->timer)) {
2121 timer_active = 2;
2122 timer_expires = sp->timer.expires;
2123 } else {
2124 timer_active = 0;
2125 timer_expires = jiffies;
2126 }
2127
2128 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2129 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d",
2130 i, src, srcp, dest, destp, sp->state,
2131 tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq,
2132 timer_active, timer_expires-jiffies,
2133 tp->retransmits,
2134 sock_i_uid(sp),
2135 tp->probes_out,
2136 sock_i_ino(sp),
2137 atomic_read(&sp->refcnt), sp,
2138 tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong,
2139 tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh
2140 );
2141 }
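/*
 * The row built above lines up with the header emitted by tcp_get_info():
 * sl, local_address, rem_address, st, tx_queue:rx_queue, tr:tm->when,
 * retrnsmt, uid, timeout (here: probes_out), inode, followed by fields with
 * no header column: refcount, socket pointer, rto, ack.ato, quick/pingpong,
 * snd_cwnd and snd_ssthresh.  The tr code is 1 for a pending retransmit
 * timer, 4 for a zero-window probe, 2 for another pending timer (e.g.
 * keepalive) and 0 for none; get_timewait_sock() below reports 3.
 */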
2142
2143 static void get_timewait_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2144 {
2145 unsigned int dest, src;
2146 __u16 destp, srcp;
2147 int ttd = tw->ttd - jiffies;
2148
2149 if (ttd < 0)
2150 ttd = 0;
2151
2152 dest = tw->daddr;
2153 src = tw->rcv_saddr;
2154 destp = ntohs(tw->dport);
2155 srcp = ntohs(tw->sport);
2156
2157 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2158 " %02X %08X:%08X %02X:%08X %08X %5d %8d %d %d %p",
2159 i, src, srcp, dest, destp, tw->substate, 0, 0,
2160 3, ttd, 0, 0, 0, 0,
2161 atomic_read(&tw->refcnt), tw);
2162 }
2163
2164 #define TMPSZ 150
2165
2166 int tcp_get_info(char *buffer, char **start, off_t offset, int length)
2167 {
2168 int len = 0, num = 0, i;
2169 off_t begin, pos = 0;
2170 char tmpbuf[TMPSZ+1];
2171
2172 if (offset < TMPSZ)
2173 len += sprintf(buffer, "%-*s\n", TMPSZ-1,
2174 " sl local_address rem_address st tx_queue "
2175 "rx_queue tr tm->when retrnsmt uid timeout inode");
2176
2177 pos = TMPSZ;
2178
2179 /* First, walk listening socket table. */
2180 tcp_listen_lock();
2181 for(i = 0; i < TCP_LHTABLE_SIZE; i++) {
2182 struct sock *sk;
2183 struct tcp_listen_opt *lopt;
2184 int k;
2185
2186 for (sk = tcp_listening_hash[i]; sk; sk = sk->next, num++) {
2187 struct open_request *req;
2188 int uid;
2189 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2190
2191 if (!TCP_INET_FAMILY(sk->family))
2192 goto skip_listen;
2193
2194 pos += TMPSZ;
2195 if (pos >= offset) {
2196 get_tcp_sock(sk, tmpbuf, num);
2197 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2198 if (pos >= offset + length) {
2199 tcp_listen_unlock();
2200 goto out_no_bh;
2201 }
2202 }
2203
2204 skip_listen:
2205 uid = sock_i_uid(sk);
2206 read_lock_bh(&tp->syn_wait_lock);
2207 lopt = tp->listen_opt;
2208 if (lopt && lopt->qlen != 0) {
2209 for (k=0; k<TCP_SYNQ_HSIZE; k++) {
2210 for (req = lopt->syn_table[k]; req; req = req->dl_next, num++) {
2211 if (!TCP_INET_FAMILY(req->class->family))
2212 continue;
2213
2214 pos += TMPSZ;
2215 if (pos <= offset)
2216 continue;
2217 get_openreq(sk, req, tmpbuf, num, uid);
2218 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2219 if (pos >= offset + length) {
2220 read_unlock_bh(&tp->syn_wait_lock);
2221 tcp_listen_unlock();
2222 goto out_no_bh;
2223 }
2224 }
2225 }
2226 }
2227 read_unlock_bh(&tp->syn_wait_lock);
2228
2229 /* Completed requests are in the normal socket hash table. */
2230 }
2231 }
2232 tcp_listen_unlock();
2233
2234 local_bh_disable();
2235
2236 /* Next, walk established hash chain. */
2237 for (i = 0; i < tcp_ehash_size; i++) {
2238 struct tcp_ehash_bucket *head = &tcp_ehash[i];
2239 struct sock *sk;
2240 struct tcp_tw_bucket *tw;
2241
2242 read_lock(&head->lock);
2243 for(sk = head->chain; sk; sk = sk->next, num++) {
2244 if (!TCP_INET_FAMILY(sk->family))
2245 continue;
2246 pos += TMPSZ;
2247 if (pos <= offset)
2248 continue;
2249 get_tcp_sock(sk, tmpbuf, num);
2250 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2251 if (pos >= offset + length) {
2252 read_unlock(&head->lock);
2253 goto out;
2254 }
2255 }
2256 for (tw = (struct tcp_tw_bucket *)tcp_ehash[i+tcp_ehash_size].chain;
2257 tw != NULL;
2258 tw = (struct tcp_tw_bucket *)tw->next, num++) {
2259 if (!TCP_INET_FAMILY(tw->family))
2260 continue;
2261 pos += TMPSZ;
2262 if (pos <= offset)
2263 continue;
2264 get_timewait_sock(tw, tmpbuf, num);
2265 len += sprintf(buffer+len, "%-*s\n", TMPSZ-1, tmpbuf);
2266 if (pos >= offset + length) {
2267 read_unlock(&head->lock);
2268 goto out;
2269 }
2270 }
2271 read_unlock(&head->lock);
2272 }
2273
2274 out:
2275 local_bh_enable();
2276 out_no_bh:
2277
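	/* The caller wants 'length' bytes starting at file offset 'offset',
	 * but rows were only formatted into the buffer from the first one
	 * that reached that offset.  'pos' is the file offset just past the
	 * last formatted row, so the requested offset lies (pos - offset)
	 * bytes before the end of the formatted data, i.e. at buffer index
	 * begin = len - (pos - offset).  Example: with offset = 200 and a
	 * window that ends inside the first data row, the header (file
	 * offsets 0-149) is skipped, that first data row (offsets 150-299)
	 * is the only one formatted (len = 150, pos = 300), and
	 * begin = 150 - (300 - 200) = 50, placing *start 50 bytes into the
	 * row, exactly at file offset 200.
	 */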
2278 begin = len - (pos - offset);
2279 *start = buffer + begin;
2280 len -= begin;
2281 if (len > length)
2282 len = length;
2283 if (len < 0)
2284 len = 0;
2285 return len;
2286 }
2287
2288 struct proto tcp_prot = {
2289 name: "TCP",
2290 close: tcp_close,
2291 connect: tcp_v4_connect,
2292 disconnect: tcp_disconnect,
2293 accept: tcp_accept,
2294 ioctl: tcp_ioctl,
2295 init: tcp_v4_init_sock,
2296 destroy: tcp_v4_destroy_sock,
2297 shutdown: tcp_shutdown,
2298 setsockopt: tcp_setsockopt,
2299 getsockopt: tcp_getsockopt,
2300 sendmsg: tcp_sendmsg,
2301 recvmsg: tcp_recvmsg,
2302 backlog_rcv: tcp_v4_do_rcv,
2303 hash: tcp_v4_hash,
2304 unhash: tcp_unhash,
2305 get_port: tcp_v4_get_port,
2306 };
2307
2308
2309
2310 void __init tcp_v4_init(struct net_proto_family *ops)
2311 {
2312 int err;
2313
2314 tcp_inode.i_mode = S_IFSOCK;
2315 tcp_inode.i_sock = 1;
2316 tcp_inode.i_uid = 0;
2317 tcp_inode.i_gid = 0;
2318 init_waitqueue_head(&tcp_inode.i_wait);
2319 init_waitqueue_head(&tcp_inode.u.socket_i.wait);
2320
2321 tcp_socket->inode = &tcp_inode;
2322 tcp_socket->state = SS_UNCONNECTED;
2323 tcp_socket->type=SOCK_RAW;
2324
2325 if ((err=ops->create(tcp_socket, IPPROTO_TCP))<0)
2326 panic("Failed to create the TCP control socket.\n");
2327 tcp_socket->sk->allocation=GFP_ATOMIC;
2328 tcp_socket->sk->protinfo.af_inet.ttl = MAXTTL;
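	/* GFP_ATOMIC, since this control socket is used for replies generated
	 * while received segments are being processed in bottom-half context
	 * (such as the timewait ACKs above), where allocations must not
	 * sleep.
	 */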
2329
2330 /* Unhash it so that IP input processing does not even
2331  * see it; we do not want this socket to receive
2332  * incoming packets.
2333  */
2334 tcp_socket->sk->prot->unhash(tcp_socket->sk);
2335 }
2336