1 /*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * Implementation of the Transmission Control Protocol(TCP).
7 *
8 * Version: $Id: tcp.c,v 1.215 2001/10/31 08:17:58 davem Exp $
9 *
10 * Authors: Ross Biro, <bir7@leland.Stanford.Edu>
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 * Corey Minyard <wf-rch!minyard@relay.EU.net>
14 * Florian La Roche, <flla@stud.uni-sb.de>
15 * Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16 * Linus Torvalds, <torvalds@cs.helsinki.fi>
17 * Alan Cox, <gw4pts@gw4pts.ampr.org>
18 * Matthew Dillon, <dillon@apollo.west.oic.com>
19 * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20 * Jorge Cwik, <jorge@laser.satlink.net>
21 *
22 * Fixes:
23 * Alan Cox : Numerous verify_area() calls
24 * Alan Cox : Set the ACK bit on a reset
25 * Alan Cox : Stopped it crashing if it closed while
26 * sk->inuse=1 and was trying to connect
27 * (tcp_err()).
28 * Alan Cox : All icmp error handling was broken
29 * pointers passed were wrong and the
30 * socket was looked up backwards. Nobody
31 * tested any icmp error code obviously.
32 * Alan Cox : tcp_err() now handled properly. It
33 * wakes people on errors. poll
34 * behaves and the icmp error race
35 * has gone by moving it into sock.c
36 * Alan Cox : tcp_send_reset() fixed to work for
37 * everything not just packets for
38 * unknown sockets.
39 * Alan Cox : tcp option processing.
40 * Alan Cox : Reset tweaked (still not 100%) [Had
41 * syn rule wrong]
42 * Herp Rosmanith : More reset fixes
43 * Alan Cox : No longer acks invalid rst frames.
44 * Acking any kind of RST is right out.
45 * Alan Cox : Sets an ignore me flag on an rst
46 * receive otherwise odd bits of prattle
47 * escape still
48 * Alan Cox : Fixed another acking RST frame bug.
49 * Should stop LAN workplace lockups.
50 * Alan Cox : Some tidyups using the new skb list
51 * facilities
52 * Alan Cox : sk->keepopen now seems to work
53 * Alan Cox : Pulls options out correctly on accepts
54 * Alan Cox : Fixed assorted sk->rqueue->next errors
55 * Alan Cox : PSH doesn't end a TCP read. Switched a
56 * bit to skb ops.
57 * Alan Cox : Tidied tcp_data to avoid a potential
58 * nasty.
59 * Alan Cox : Added some better commenting, as the
60 * tcp is hard to follow
61 * Alan Cox : Removed incorrect check for 20 * psh
62 * Michael O'Reilly : ack < copied bug fix.
63 * Johannes Stille : Misc tcp fixes (not all in yet).
64 * Alan Cox : FIN with no memory -> CRASH
65 * Alan Cox : Added socket option proto entries.
66 * Also added awareness of them to accept.
67 * Alan Cox : Added TCP options (SOL_TCP)
68 * Alan Cox : Switched wakeup calls to callbacks,
69 * so the kernel can layer network
70 * sockets.
71 * Alan Cox : Use ip_tos/ip_ttl settings.
72 * Alan Cox : Handle FIN (more) properly (we hope).
73 * Alan Cox : RST frames sent on unsynchronised
74 * state ack error.
75 * Alan Cox : Put in missing check for SYN bit.
76 * Alan Cox : Added tcp_select_window() aka NET2E
77 * window non shrink trick.
78 * Alan Cox : Added a couple of small NET2E timer
79 * fixes
80 * Charles Hedrick : TCP fixes
81 * Toomas Tamm : TCP window fixes
82 * Alan Cox : Small URG fix to rlogin ^C ack fight
83 * Charles Hedrick : Rewrote most of it to actually work
84 * Linus : Rewrote tcp_read() and URG handling
85 * completely
86 * Gerhard Koerting: Fixed some missing timer handling
87 * Matthew Dillon : Reworked TCP machine states as per RFC
88 * Gerhard Koerting: PC/TCP workarounds
89 * Adam Caldwell : Assorted timer/timing errors
90 * Matthew Dillon : Fixed another RST bug
91 * Alan Cox : Move to kernel side addressing changes.
92 * Alan Cox : Beginning work on TCP fastpathing
93 * (not yet usable)
94 * Arnt Gulbrandsen: Turbocharged tcp_check() routine.
95 * Alan Cox : TCP fast path debugging
96 * Alan Cox : Window clamping
97 * Michael Riepe : Bug in tcp_check()
98 * Matt Dillon : More TCP improvements and RST bug fixes
99 * Matt Dillon : Yet more small nasties removed from the
100 * TCP code (Be very nice to this man if
101 * tcp finally works 100%) 8)
102 * Alan Cox : BSD accept semantics.
103 * Alan Cox : Reset on closedown bug.
104 * Peter De Schrijver : ENOTCONN check missing in tcp_sendto().
105 * Michael Pall : Handle poll() after URG properly in
106 * all cases.
107 * Michael Pall : Undo the last fix in tcp_read_urg()
108 * (multi URG PUSH broke rlogin).
109 * Michael Pall : Fix the multi URG PUSH problem in
110 * tcp_readable(), poll() after URG
111 * works now.
112 * Michael Pall : recv(...,MSG_OOB) never blocks in the
113 * BSD api.
114 * Alan Cox : Changed the semantics of sk->socket to
115 * fix a race and a signal problem with
116 * accept() and async I/O.
117 * Alan Cox : Relaxed the rules on tcp_sendto().
118 * Yury Shevchuk : Really fixed accept() blocking problem.
119 * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for
120 * clients/servers which listen in on
121 * fixed ports.
122 * Alan Cox : Cleaned the above up and shrank it to
123 * a sensible code size.
124 * Alan Cox : Self connect lockup fix.
125 * Alan Cox : No connect to multicast.
126 * Ross Biro : Close unaccepted children on master
127 * socket close.
128 * Alan Cox : Reset tracing code.
129 * Alan Cox : Spurious resets on shutdown.
130 * Alan Cox : Giant 15 minute/60 second timer error
131 * Alan Cox : Small whoops in polling before an
132 * accept.
133 * Alan Cox : Kept the state trace facility since
134 * it's handy for debugging.
135 * Alan Cox : More reset handler fixes.
136 * Alan Cox : Started rewriting the code based on
137 * the RFC's for other useful protocol
138 * references see: Comer, KA9Q NOS, and
139 * for a reference on the difference
140 * between specifications and how BSD
141 * works see the 4.4lite source.
142 * A.N.Kuznetsov : Don't time wait on completion of tidy
143 * close.
144 * Linus Torvalds : Fin/Shutdown & copied_seq changes.
145 * Linus Torvalds : Fixed BSD port reuse to work first syn
146 * Alan Cox : Reimplemented timers as per the RFC
147 * and using multiple timers for sanity.
148 * Alan Cox : Small bug fixes, and a lot of new
149 * comments.
150 * Alan Cox : Fixed dual reader crash by locking
151 * the buffers (much like datagram.c)
152 * Alan Cox : Fixed stuck sockets in probe. A probe
153 * now gets fed up of retrying without
154 * (even a no space) answer.
155 * Alan Cox : Extracted closing code better
156 * Alan Cox : Fixed the closing state machine to
157 * resemble the RFC.
158 * Alan Cox : More 'per spec' fixes.
159 * Jorge Cwik : Even faster checksumming.
160 * Alan Cox : tcp_data() doesn't ack illegal PSH
161 * only frames. At least one pc tcp stack
162 * generates them.
163 * Alan Cox : Cache last socket.
164 * Alan Cox : Per route irtt.
165 * Matt Day : poll()->select() match BSD precisely on error
166 * Alan Cox : New buffers
167 * Marc Tamsky : Various sk->prot->retransmits and
168 * sk->retransmits misupdating fixed.
169 * Fixed tcp_write_timeout: stuck close,
170 * and TCP syn retries gets used now.
171 * Mark Yarvis : In tcp_read_wakeup(), don't send an
172 * ack if state is TCP_CLOSED.
173 * Alan Cox : Look up device on a retransmit - routes may
174 * change. Doesn't yet cope with MSS shrink right
175 * but it's a start!
176 * Marc Tamsky : Closing in closing fixes.
177 * Mike Shaver : RFC1122 verifications.
178 * Alan Cox : rcv_saddr errors.
179 * Alan Cox : Block double connect().
180 * Alan Cox : Small hooks for enSKIP.
181 * Alexey Kuznetsov: Path MTU discovery.
182 * Alan Cox : Support soft errors.
183 * Alan Cox : Fix MTU discovery pathological case
184 * when the remote claims no mtu!
185 * Marc Tamsky : TCP_CLOSE fix.
186 * Colin (G3TNE) : Send a reset on syn ack replies in
187 * window but wrong (fixes NT lpd problems)
188 * Pedro Roque : Better TCP window handling, delayed ack.
189 * Joerg Reuter : No modification of locked buffers in
190 * tcp_do_retransmit()
191 * Eric Schenk : Changed receiver side silly window
192 * avoidance algorithm to BSD style
193 * algorithm. This doubles throughput
194 * against machines running Solaris,
195 * and seems to result in general
196 * improvement.
197 * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD
198 * Willy Konynenberg : Transparent proxying support.
199 * Mike McLagan : Routing by source
200 * Keith Owens : Do proper merging with partial SKB's in
201 * tcp_do_sendmsg to avoid burstiness.
202 * Eric Schenk : Fix fast close down bug with
203 * shutdown() followed by close().
204 * Andi Kleen : Make poll agree with SIGIO
205 * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and
206 * lingertime == 0 (RFC 793 ABORT Call)
207 *
208 * This program is free software; you can redistribute it and/or
209 * modify it under the terms of the GNU General Public License
210 * as published by the Free Software Foundation; either version
211 * 2 of the License, or(at your option) any later version.
212 *
213 * Description of States:
214 *
215 * TCP_SYN_SENT sent a connection request, waiting for ack
216 *
217 * TCP_SYN_RECV received a connection request, sent ack,
218 * waiting for final ack in three-way handshake.
219 *
220 * TCP_ESTABLISHED connection established
221 *
222 * TCP_FIN_WAIT1 our side has shutdown, waiting to complete
223 * transmission of remaining buffered data
224 *
225 * TCP_FIN_WAIT2 all buffered data sent, waiting for remote
226 * to shutdown
227 *
228 * TCP_CLOSING both sides have shutdown but we still have
229 * data we have to finish sending
230 *
231 * TCP_TIME_WAIT timeout to catch resent junk before entering
232 * closed, can only be entered from FIN_WAIT2
233 * or CLOSING. Required because the other end
234 * may not have gotten our last ACK causing it
235 * to retransmit the data packet (which we ignore)
236 *
237 * TCP_CLOSE_WAIT remote side has shutdown and is waiting for
238 * us to finish writing our data and to shutdown
239 * (we have to close() to move on to LAST_ACK)
240 *
241 * TCP_LAST_ACK our side has shutdown after remote has
242 * shutdown. There may still be data in our
243 * buffer that we have to finish sending
244 *
245 * TCP_CLOSE socket is finished
246 */
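/*
 *	For reference, the two common paths through these states:
 *
 *	active close (we call close()/shutdown() first):
 *		ESTABLISHED -> FIN_WAIT1 -> FIN_WAIT2 -> TIME_WAIT -> CLOSE
 *
 *	passive close (the peer closes first):
 *		ESTABLISHED -> CLOSE_WAIT -> LAST_ACK -> CLOSE
 *
 *	CLOSING covers the simultaneous-close case, where both ends send
 *	a FIN before either has seen the other's FIN.
 */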
247
248 #include <linux/config.h>
249 #include <linux/types.h>
250 #include <linux/fcntl.h>
251 #include <linux/poll.h>
252 #include <linux/init.h>
253 #include <linux/smp_lock.h>
254 #include <linux/fs.h>
255 #include <linux/random.h>
256
257 #include <net/icmp.h>
258 #include <net/tcp.h>
259
260 #include <asm/uaccess.h>
261 #include <asm/ioctls.h>
262
263 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
264
265 struct tcp_mib tcp_statistics[NR_CPUS*2];
266
267 kmem_cache_t *tcp_openreq_cachep;
268 kmem_cache_t *tcp_bucket_cachep;
269 kmem_cache_t *tcp_timewait_cachep;
270
271 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
272
273 int sysctl_tcp_default_win_scale = 0;
274
275 int sysctl_tcp_mem[3];
276 int sysctl_tcp_wmem[3] = { 4*1024, 16*1024, 128*1024 };
277 int sysctl_tcp_rmem[3] = { 4*1024, 87380, 87380*2 };
278
279 atomic_t tcp_memory_allocated; /* Current allocated memory. */
280 atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */
281
282 /* Pressure flag: try to collapse.
283 * Technical note: it is used by multiple contexts non-atomically.
284 * All of tcp_mem_schedule() is of this nature: accounting
285 * is strict, actions are advisory and have some latency. */
286 int tcp_memory_pressure;
287
288 #define TCP_PAGES(amt) (((amt)+TCP_MEM_QUANTUM-1)/TCP_MEM_QUANTUM)
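/* TCP_PAGES() rounds a byte count up to whole accounting quanta, e.g.
 * if TCP_MEM_QUANTUM is 4096 then TCP_PAGES(6000) == 2 and
 * TCP_PAGES(4096) == 1.
 */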
289
290 int tcp_mem_schedule(struct sock *sk, int size, int kind)
291 {
292 int amt = TCP_PAGES(size);
293
294 sk->forward_alloc += amt*TCP_MEM_QUANTUM;
295 atomic_add(amt, &tcp_memory_allocated);
296
297 /* Under limit. */
298 if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
299 if (tcp_memory_pressure)
300 tcp_memory_pressure = 0;
301 return 1;
302 }
303
304 /* Over hard limit. */
305 if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
306 tcp_enter_memory_pressure();
307 goto suppress_allocation;
308 }
309
310 /* Under pressure. */
311 if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
312 tcp_enter_memory_pressure();
313
314 if (kind) {
315 if (atomic_read(&sk->rmem_alloc) < sysctl_tcp_rmem[0])
316 return 1;
317 } else {
318 if (sk->wmem_queued < sysctl_tcp_wmem[0])
319 return 1;
320 }
321
322 if (!tcp_memory_pressure ||
323 sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated)
324 * TCP_PAGES(sk->wmem_queued+atomic_read(&sk->rmem_alloc)+
325 sk->forward_alloc))
326 return 1;
327
328 suppress_allocation:
329
330 if (kind == 0) {
331 tcp_moderate_sndbuf(sk);
332
333 /* Fail only if socket is _under_ its sndbuf.
334 * In this case we cannot block, so that we have to fail.
335 */
336 if (sk->wmem_queued+size >= sk->sndbuf)
337 return 1;
338 }
339
340 /* Alas. Undo changes. */
341 sk->forward_alloc -= amt*TCP_MEM_QUANTUM;
342 atomic_sub(amt, &tcp_memory_allocated);
343 return 0;
344 }
345
346 void __tcp_mem_reclaim(struct sock *sk)
347 {
348 if (sk->forward_alloc >= TCP_MEM_QUANTUM) {
349 atomic_sub(sk->forward_alloc/TCP_MEM_QUANTUM, &tcp_memory_allocated);
350 sk->forward_alloc &= (TCP_MEM_QUANTUM-1);
351 if (tcp_memory_pressure &&
352 atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
353 tcp_memory_pressure = 0;
354 }
355 }
356
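/* Uncharge a receive-queue skb: drop its truesize from sk->rmem_alloc
 * and hand the space back to the socket's forward allocation, so it can
 * be reused without another trip through tcp_mem_schedule().
 */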
357 void tcp_rfree(struct sk_buff *skb)
358 {
359 struct sock *sk = skb->sk;
360
361 atomic_sub(skb->truesize, &sk->rmem_alloc);
362 sk->forward_alloc += skb->truesize;
363 }
364
365 /*
366 * LISTEN is a special case for poll..
367 */
368 static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
369 {
370 return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0;
371 }
372
373 /*
374 * Wait for a TCP event.
375 *
376 * Note that we don't need to lock the socket, as the upper poll layers
377 * take care of normal races (between the test and the event) and we don't
378 * go look at any of the socket buffers directly.
379 */
380 unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
381 {
382 unsigned int mask;
383 struct sock *sk = sock->sk;
384 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
385
386 poll_wait(file, sk->sleep, wait);
387 if (sk->state == TCP_LISTEN)
388 return tcp_listen_poll(sk, wait);
389
390 /* Socket is not locked. We are protected from async events
391 by poll logic and correct handling of state changes
392 made by other threads is impossible in any case.
393 */
394
395 mask = 0;
396 if (sk->err)
397 mask = POLLERR;
398
399 /*
400 * POLLHUP is certainly not done right. But poll() doesn't
401 * have a notion of HUP in just one direction, and for a
402 * socket the read side is more interesting.
403 *
404 * Some poll() documentation says that POLLHUP is incompatible
405 * with the POLLOUT/POLLWRNORM flags, so somebody should check this
406 * all. But careful, it tends to be safer to return too many
407 * bits than too few, and you can easily break real applications
408 * if you don't tell them that something has hung up!
409 *
410 * Check-me.
411 *
412 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
413 * our fs/select.c). It means that after we received EOF,
414 * poll always returns immediately, making it impossible to poll() for write
415 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
416 * if and only if shutdown has been made in both directions.
417 * Actually, it is interesting to look how Solaris and DUX
418 * solve this dilemma. I would prefer, if POLLHUP were maskable,
419 * then we could set it on SND_SHUTDOWN. BTW examples given
420 * in Stevens' books assume exactly this behaviour, it explains
421 * why POLLHUP is incompatible with POLLOUT. --ANK
422 *
423 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
424 * blocking on fresh not-connected or disconnected socket. --ANK
425 */
426 if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE)
427 mask |= POLLHUP;
428 if (sk->shutdown & RCV_SHUTDOWN)
429 mask |= POLLIN | POLLRDNORM;
430
431 /* Connected? */
432 if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
433 /* Potential race condition. If the read of tp below is
434 * reordered above the read of sk->state, we can be woken
435 * spuriously in SYN_* states. */
436 if ((tp->rcv_nxt != tp->copied_seq) &&
437 (tp->urg_seq != tp->copied_seq ||
438 tp->rcv_nxt != tp->copied_seq+1 ||
439 sk->urginline || !tp->urg_data))
440 mask |= POLLIN | POLLRDNORM;
441
442 if (!(sk->shutdown & SEND_SHUTDOWN)) {
443 if (tcp_wspace(sk) >= tcp_min_write_space(sk)) {
444 mask |= POLLOUT | POLLWRNORM;
445 } else { /* send SIGIO later */
446 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
447 set_bit(SOCK_NOSPACE, &sk->socket->flags);
448
449 /* Race breaker. If space is freed after
450 * wspace test but before the flags are set,
451 * IO signal will be lost.
452 */
453 if (tcp_wspace(sk) >= tcp_min_write_space(sk))
454 mask |= POLLOUT | POLLWRNORM;
455 }
456 }
457
458 if (tp->urg_data & TCP_URG_VALID)
459 mask |= POLLPRI;
460 }
461 return mask;
462 }
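/* A minimal user-space sketch of the poll() semantics implemented above
 * (kept under #if 0: it is ordinary socket code, not part of this file,
 * and error handling is elided).  POLLIN covers both data and EOF;
 * POLLHUP is only reported once both directions are shut down.
 */
#if 0
#include <poll.h>

static int wait_for_tcp_event(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT | POLLPRI };

	if (poll(&pfd, 1, -1) < 0)	/* block until something happens */
		return -1;
	if (pfd.revents & POLLHUP)	/* both directions are shut down */
		return 0;
	/* POLLIN: data or EOF readable, POLLOUT: send buffer has room,
	 * POLLPRI: urgent data pending (read it with recv(..., MSG_OOB)).
	 */
	return pfd.revents;
}
#endif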
463
464 /*
465 * TCP socket write_space callback.
466 */
467 void tcp_write_space(struct sock *sk)
468 {
469 struct socket *sock = sk->socket;
470
471 if (tcp_wspace(sk) >= tcp_min_write_space(sk) && sock) {
472 clear_bit(SOCK_NOSPACE, &sock->flags);
473
474 if (sk->sleep && waitqueue_active(sk->sleep))
475 wake_up_interruptible(sk->sleep);
476
477 if (sock->fasync_list && !(sk->shutdown&SEND_SHUTDOWN))
478 sock_wake_async(sock, 2, POLL_OUT);
479 }
480 }
481
482 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
483 {
484 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
485 int answ;
486
487 switch(cmd) {
488 case SIOCINQ:
489 if (sk->state == TCP_LISTEN)
490 return(-EINVAL);
491
492 lock_sock(sk);
493 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
494 answ = 0;
495 else if (sk->urginline || !tp->urg_data ||
496 before(tp->urg_seq,tp->copied_seq) ||
497 !before(tp->urg_seq,tp->rcv_nxt)) {
498 answ = tp->rcv_nxt - tp->copied_seq;
499
500 /* Subtract 1, if FIN is in queue. */
501 if (answ && !skb_queue_empty(&sk->receive_queue))
502 answ -= ((struct sk_buff*)sk->receive_queue.prev)->h.th->fin;
503 } else
504 answ = tp->urg_seq - tp->copied_seq;
505 release_sock(sk);
506 break;
507 case SIOCATMARK:
508 {
509 answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
510 break;
511 }
512 case SIOCOUTQ:
513 if (sk->state == TCP_LISTEN)
514 return(-EINVAL);
515
516 if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
517 answ = 0;
518 else
519 answ = tp->write_seq - tp->snd_una;
520 break;
521 default:
522 return(-ENOIOCTLCMD);
523 };
524
525 return put_user(answ, (int *)arg);
526 }
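/* How the ioctls above are typically used from user space -- a minimal
 * sketch kept under #if 0 (not part of this file, error handling
 * elided).  SIOCINQ reports bytes readable without blocking, SIOCOUTQ
 * bytes written but not yet acknowledged by the peer, and SIOCATMARK
 * whether the read pointer sits at the urgent mark.
 */
#if 0
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sockios.h>	/* SIOCINQ, SIOCOUTQ */

static void show_queue_sizes(int fd)
{
	int inq = 0, outq = 0, atmark = 0;

	ioctl(fd, SIOCINQ, &inq);	/* tp->rcv_nxt - tp->copied_seq */
	ioctl(fd, SIOCOUTQ, &outq);	/* tp->write_seq - tp->snd_una */
	ioctl(fd, SIOCATMARK, &atmark);
	printf("readable %d, unacked %d, at OOB mark %d\n", inq, outq, atmark);
}
#endif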
527
528
529 int tcp_listen_start(struct sock *sk)
530 {
531 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
532 struct tcp_listen_opt *lopt;
533
534 sk->max_ack_backlog = 0;
535 sk->ack_backlog = 0;
536 tp->accept_queue = tp->accept_queue_tail = NULL;
537 tp->syn_wait_lock = RW_LOCK_UNLOCKED;
538 tcp_delack_init(tp);
539
540 lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
541 if (!lopt)
542 return -ENOMEM;
543
544 memset(lopt, 0, sizeof(struct tcp_listen_opt));
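/* Size the SYN queue hash as the smallest power of two (but at least
 * 2^6 = 64) that is >= sysctl_max_syn_backlog; e.g. a limit of 1024
 * yields max_qlen_log = 10.
 */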
545 for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
546 if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog)
547 break;
548 get_random_bytes(&lopt->hash_rnd, 4);
549
550 write_lock_bh(&tp->syn_wait_lock);
551 tp->listen_opt = lopt;
552 write_unlock_bh(&tp->syn_wait_lock);
553
554 /* There is a race window here: we announce ourselves listening,
555 * but this transition is still not validated by get_port().
556 * It is OK, because this socket enters the hash table only
557 * after validation is complete.
558 */
559 sk->state = TCP_LISTEN;
560 if (sk->prot->get_port(sk, sk->num) == 0) {
561 sk->sport = htons(sk->num);
562
563 sk_dst_reset(sk);
564 sk->prot->hash(sk);
565
566 return 0;
567 }
568
569 sk->state = TCP_CLOSE;
570 write_lock_bh(&tp->syn_wait_lock);
571 tp->listen_opt = NULL;
572 write_unlock_bh(&tp->syn_wait_lock);
573 kfree(lopt);
574 return -EADDRINUSE;
575 }
576
577 /*
578 * This routine closes sockets which have been at least partially
579 * opened, but not yet accepted.
580 */
581
582 static void tcp_listen_stop (struct sock *sk)
583 {
584 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
585 struct tcp_listen_opt *lopt = tp->listen_opt;
586 struct open_request *acc_req = tp->accept_queue;
587 struct open_request *req;
588 int i;
589
590 tcp_delete_keepalive_timer(sk);
591
592 /* make all the listen_opt local to us */
593 write_lock_bh(&tp->syn_wait_lock);
594 tp->listen_opt =NULL;
595 write_unlock_bh(&tp->syn_wait_lock);
596 tp->accept_queue = tp->accept_queue_tail = NULL;
597
598 if (lopt->qlen) {
599 for (i=0; i<TCP_SYNQ_HSIZE; i++) {
600 while ((req = lopt->syn_table[i]) != NULL) {
601 lopt->syn_table[i] = req->dl_next;
602 lopt->qlen--;
603 tcp_openreq_free(req);
604
605 /* Following specs, it would be better either to send FIN
606 * (and enter FIN-WAIT-1, it is normal close)
607 * or to send active reset (abort).
608 * Certainly, it is pretty dangerous under SYN flood, but that is
609 * a bad justification for our negligence 8)
610 * To be honest, we are not able to make either
611 * of the variants now. --ANK
612 */
613 }
614 }
615 }
616 BUG_TRAP(lopt->qlen == 0);
617
618 kfree(lopt);
619
620 while ((req=acc_req) != NULL) {
621 struct sock *child = req->sk;
622
623 acc_req = req->dl_next;
624
625 local_bh_disable();
626 bh_lock_sock(child);
627 BUG_TRAP(child->lock.users==0);
628 sock_hold(child);
629
630 tcp_disconnect(child, O_NONBLOCK);
631
632 sock_orphan(child);
633
634 atomic_inc(&tcp_orphan_count);
635
636 tcp_destroy_sock(child);
637
638 bh_unlock_sock(child);
639 local_bh_enable();
640 sock_put(child);
641
642 tcp_acceptq_removed(sk);
643 tcp_openreq_fastfree(req);
644 }
645 BUG_TRAP(sk->ack_backlog == 0);
646 }
647
648 /*
649 * Wait for a socket to get into the connected state
650 *
651 * Note: Must be called with the socket locked.
652 */
653 static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p)
654 {
655 struct task_struct *tsk = current;
656 DECLARE_WAITQUEUE(wait, tsk);
657
658 while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
659 if(sk->err)
660 return sock_error(sk);
661 if((1 << sk->state) &
662 ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
663 return -EPIPE;
664 if(!*timeo_p)
665 return -EAGAIN;
666 if(signal_pending(tsk))
667 return sock_intr_errno(*timeo_p);
668
669 __set_task_state(tsk, TASK_INTERRUPTIBLE);
670 add_wait_queue(sk->sleep, &wait);
671 sk->tp_pinfo.af_tcp.write_pending++;
672
673 release_sock(sk);
674 *timeo_p = schedule_timeout(*timeo_p);
675 lock_sock(sk);
676
677 __set_task_state(tsk, TASK_RUNNING);
678 remove_wait_queue(sk->sleep, &wait);
679 sk->tp_pinfo.af_tcp.write_pending--;
680 }
681 return 0;
682 }
683
684 static inline int tcp_memory_free(struct sock *sk)
685 {
686 return sk->wmem_queued < sk->sndbuf;
687 }
688
689 /*
690 * Wait for more memory for a socket
691 */
692 static int wait_for_tcp_memory(struct sock * sk, long *timeo)
693 {
694 int err = 0;
695 long vm_wait = 0;
696 long current_timeo = *timeo;
697 DECLARE_WAITQUEUE(wait, current);
698
699 if (tcp_memory_free(sk))
700 current_timeo = vm_wait = (net_random()%(HZ/5))+2;
701
702 add_wait_queue(sk->sleep, &wait);
703 for (;;) {
704 set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
705
706 set_current_state(TASK_INTERRUPTIBLE);
707
708 if (sk->err || (sk->shutdown & SEND_SHUTDOWN))
709 goto do_error;
710 if (!*timeo)
711 goto do_nonblock;
712 if (signal_pending(current))
713 goto do_interrupted;
714 clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
715 if (tcp_memory_free(sk) && !vm_wait)
716 break;
717
718 set_bit(SOCK_NOSPACE, &sk->socket->flags);
719 sk->tp_pinfo.af_tcp.write_pending++;
720 release_sock(sk);
721 if (!tcp_memory_free(sk) || vm_wait)
722 current_timeo = schedule_timeout(current_timeo);
723 lock_sock(sk);
724 sk->tp_pinfo.af_tcp.write_pending--;
725
726 if (vm_wait) {
727 vm_wait -= current_timeo;
728 current_timeo = *timeo;
729 if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
730 (current_timeo -= vm_wait) < 0)
731 current_timeo = 0;
732 vm_wait = 0;
733 }
734 *timeo = current_timeo;
735 }
736 out:
737 current->state = TASK_RUNNING;
738 remove_wait_queue(sk->sleep, &wait);
739 return err;
740
741 do_error:
742 err = -EPIPE;
743 goto out;
744 do_nonblock:
745 err = -EAGAIN;
746 goto out;
747 do_interrupted:
748 err = sock_intr_errno(*timeo);
749 goto out;
750 }
751
752 ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
753
754 static inline int
755 can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
756 {
757 if (i) {
758 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
759 return page == frag->page &&
760 off == frag->page_offset+frag->size;
761 }
762 return 0;
763 }
764
765 static inline void
766 fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size)
767 {
768 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
769 frag->page = page;
770 frag->page_offset = off;
771 frag->size = size;
772 skb_shinfo(skb)->nr_frags = i+1;
773 }
774
775 static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
776 {
777 TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
778 tp->pushed_seq = tp->write_seq;
779 }
780
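/* Force a push once more than half of the largest window the peer has
 * ever advertised has been queued since the last PSH was marked, so
 * data cannot sit unpushed indefinitely.
 */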
781 static inline int forced_push(struct tcp_opt *tp)
782 {
783 return after(tp->write_seq, tp->pushed_seq + (tp->max_window>>1));
784 }
785
786 static inline void
787 skb_entail(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
788 {
789 skb->csum = 0;
790 TCP_SKB_CB(skb)->seq = tp->write_seq;
791 TCP_SKB_CB(skb)->end_seq = tp->write_seq;
792 TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
793 TCP_SKB_CB(skb)->sacked = 0;
794 __skb_queue_tail(&sk->write_queue, skb);
795 tcp_charge_skb(sk, skb);
796 if (tp->send_head == NULL)
797 tp->send_head = skb;
798 }
799
800 static inline void
801 tcp_mark_urg(struct tcp_opt *tp, int flags, struct sk_buff *skb)
802 {
803 if (flags & MSG_OOB) {
804 tp->urg_mode = 1;
805 tp->snd_up = tp->write_seq;
806 TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
807 }
808 }
809
810 static inline void
811 tcp_push(struct sock *sk, struct tcp_opt *tp, int flags, int mss_now, int nonagle)
812 {
813 if (tp->send_head) {
814 struct sk_buff *skb = sk->write_queue.prev;
815 if (!(flags&MSG_MORE) || forced_push(tp))
816 tcp_mark_push(tp, skb);
817 tcp_mark_urg(tp, flags, skb);
818 __tcp_push_pending_frames(sk, tp, mss_now, (flags&MSG_MORE) ? 2 : nonagle);
819 }
820 }
821
822 static int tcp_error(struct sock *sk, int flags, int err)
823 {
824 if (err == -EPIPE)
825 err = sock_error(sk) ? : -EPIPE;
826 if (err == -EPIPE && !(flags&MSG_NOSIGNAL))
827 send_sig(SIGPIPE, current, 0);
828 return err;
829 }
830
831 ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
832 {
833 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
834 int mss_now;
835 int err;
836 ssize_t copied;
837 long timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
838
839 /* Wait for a connection to finish. */
840 if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
841 if((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0)
842 goto out_err;
843
844 clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
845
846 mss_now = tcp_current_mss(sk);
847 copied = 0;
848
849 err = -EPIPE;
850 if (sk->err || (sk->shutdown & SEND_SHUTDOWN))
851 goto do_error;
852
853 while (psize > 0) {
854 struct sk_buff *skb = sk->write_queue.prev;
855 int offset, size, copy, i;
856 struct page *page;
857
858 page = pages[poffset/PAGE_SIZE];
859 offset = poffset % PAGE_SIZE;
860 size = min_t(size_t, psize, PAGE_SIZE-offset);
861
862 if (tp->send_head==NULL || (copy = mss_now - skb->len) <= 0) {
863 new_segment:
864 if (!tcp_memory_free(sk))
865 goto wait_for_sndbuf;
866
867 skb = tcp_alloc_pskb(sk, 0, tp->mss_cache, sk->allocation);
868 if (skb == NULL)
869 goto wait_for_memory;
870
871 skb_entail(sk, tp, skb);
872 copy = mss_now;
873 }
874
875 if (copy > size)
876 copy = size;
877
878 i = skb_shinfo(skb)->nr_frags;
879 if (can_coalesce(skb, i, page, offset)) {
880 skb_shinfo(skb)->frags[i-1].size += copy;
881 } else if (i < MAX_SKB_FRAGS) {
882 get_page(page);
883 fill_page_desc(skb, i, page, offset, copy);
884 } else {
885 tcp_mark_push(tp, skb);
886 goto new_segment;
887 }
888
889 skb->len += copy;
890 skb->data_len += copy;
891 skb->ip_summed = CHECKSUM_HW;
892 tp->write_seq += copy;
893 TCP_SKB_CB(skb)->end_seq += copy;
894
895 if (!copied)
896 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
897
898 copied += copy;
899 poffset += copy;
900 if (!(psize -= copy))
901 goto out;
902
903 if (skb->len != mss_now || (flags&MSG_OOB))
904 continue;
905
906 if (forced_push(tp)) {
907 tcp_mark_push(tp, skb);
908 __tcp_push_pending_frames(sk, tp, mss_now, 1);
909 } else if (skb == tp->send_head)
910 tcp_push_one(sk, mss_now);
911 continue;
912
913 wait_for_sndbuf:
914 set_bit(SOCK_NOSPACE, &sk->socket->flags);
915 wait_for_memory:
916 if (copied)
917 tcp_push(sk, tp, flags&~MSG_MORE, mss_now, 1);
918
919 if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
920 goto do_error;
921
922 mss_now = tcp_current_mss(sk);
923 }
924
925 out:
926 if (copied)
927 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
928 return copied;
929
930 do_error:
931 if (copied)
932 goto out;
933 out_err:
934 return tcp_error(sk, flags, err);
935 }
936
937 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
938 {
939 ssize_t res;
940 struct sock *sk = sock->sk;
941
942 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
943
944 if (!(sk->route_caps & NETIF_F_SG) ||
945 !(sk->route_caps & TCP_ZC_CSUM_FLAGS))
946 return sock_no_sendpage(sock, page, offset, size, flags);
947
948 #undef TCP_ZC_CSUM_FLAGS
949
950 lock_sock(sk);
951 TCP_CHECK_TIMER(sk);
952 res = do_tcp_sendpages(sk, &page, offset, size, flags);
953 TCP_CHECK_TIMER(sk);
954 release_sock(sk);
955 return res;
956 }
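/* tcp_sendpage() is the zero-copy transmit path: page references are
 * attached to skb frags instead of copying the data, which is only safe
 * when the route's device can do scatter-gather and checksum the packet
 * itself (hence the NETIF_F_SG/checksum test above, with
 * sock_no_sendpage() as the copying fallback).  A minimal user-space
 * sketch of how sendfile(2) typically reaches this path, under #if 0:
 */
#if 0
#include <sys/types.h>
#include <sys/sendfile.h>

static ssize_t send_file_over_tcp(int sock_fd, int file_fd, size_t len)
{
	off_t off = 0;

	/* Ends up in do_tcp_sendpages() when the device supports SG plus
	 * hardware checksumming; otherwise the data is copied as usual.
	 */
	return sendfile(sock_fd, file_fd, &off, len);
}
#endif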
957
958 #define TCP_PAGE(sk) (sk->tp_pinfo.af_tcp.sndmsg_page)
959 #define TCP_OFF(sk) (sk->tp_pinfo.af_tcp.sndmsg_off)
960
961 static inline int
962 tcp_copy_to_page(struct sock *sk, char *from, struct sk_buff *skb,
963 struct page *page, int off, int copy)
964 {
965 int err = 0;
966 unsigned int csum;
967
968 csum = csum_and_copy_from_user(from, page_address(page)+off,
969 copy, 0, &err);
970 if (!err) {
971 if (skb->ip_summed == CHECKSUM_NONE)
972 skb->csum = csum_block_add(skb->csum, csum, skb->len);
973 skb->len += copy;
974 skb->data_len += copy;
975 skb->truesize += copy;
976 sk->wmem_queued += copy;
977 sk->forward_alloc -= copy;
978 }
979 return err;
980 }
981
982 static inline int
983 skb_add_data(struct sk_buff *skb, char *from, int copy)
984 {
985 int err = 0;
986 unsigned int csum;
987 int off = skb->len;
988
989 csum = csum_and_copy_from_user(from, skb_put(skb, copy),
990 copy, 0, &err);
991 if (!err) {
992 skb->csum = csum_block_add(skb->csum, csum, off);
993 return 0;
994 }
995
996 __skb_trim(skb, off);
997 return -EFAULT;
998 }
999
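/* Pick how much linear (non-paged) space to reserve in a fresh skb:
 * a full MSS normally, but on scatter-gather devices only what fits in
 * the skb head next to MAX_TCP_HEADER, so the rest of the segment can
 * go into page fragments instead.
 */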
1000 static inline int select_size(struct sock *sk, struct tcp_opt *tp)
1001 {
1002 int tmp = tp->mss_cache;
1003
1004 if (sk->route_caps&NETIF_F_SG) {
1005 int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1006
1007 if (tmp >= pgbreak && tmp <= pgbreak + (MAX_SKB_FRAGS-1)*PAGE_SIZE)
1008 tmp = pgbreak;
1009 }
1010 return tmp;
1011 }
1012
1013 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
1014 {
1015 struct iovec *iov;
1016 struct tcp_opt *tp;
1017 struct sk_buff *skb;
1018 int iovlen, flags;
1019 int mss_now;
1020 int err, copied;
1021 long timeo;
1022
1023 tp = &(sk->tp_pinfo.af_tcp);
1024
1025 lock_sock(sk);
1026 TCP_CHECK_TIMER(sk);
1027
1028 flags = msg->msg_flags;
1029 timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
1030
1031 /* Wait for a connection to finish. */
1032 if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1033 if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
1034 goto out_err;
1035
1036 /* This should be in poll */
1037 clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
1038
1039 mss_now = tcp_current_mss(sk);
1040
1041 /* Ok commence sending. */
1042 iovlen = msg->msg_iovlen;
1043 iov = msg->msg_iov;
1044 copied = 0;
1045
1046 err = -EPIPE;
1047 if (sk->err || (sk->shutdown&SEND_SHUTDOWN))
1048 goto do_error;
1049
1050 while (--iovlen >= 0) {
1051 int seglen=iov->iov_len;
1052 unsigned char * from=iov->iov_base;
1053
1054 iov++;
1055
1056 while (seglen > 0) {
1057 int copy;
1058
1059 skb = sk->write_queue.prev;
1060
1061 if (tp->send_head == NULL ||
1062 (copy = mss_now - skb->len) <= 0) {
1063
1064 new_segment:
1065 /* Allocate new segment. If the interface is SG,
1066 * allocate skb fitting to single page.
1067 */
1068 if (!tcp_memory_free(sk))
1069 goto wait_for_sndbuf;
1070
1071 skb = tcp_alloc_pskb(sk, select_size(sk, tp), 0, sk->allocation);
1072 if (skb == NULL)
1073 goto wait_for_memory;
1074
1075 skb_entail(sk, tp, skb);
1076 copy = mss_now;
1077 }
1078
1079 /* Try to append data to the end of skb. */
1080 if (copy > seglen)
1081 copy = seglen;
1082
1083 /* Where to copy to? */
1084 if (skb_tailroom(skb) > 0) {
1085 /* We have some space in skb head. Superb! */
1086 if (copy > skb_tailroom(skb))
1087 copy = skb_tailroom(skb);
1088 if ((err = skb_add_data(skb, from, copy)) != 0)
1089 goto do_fault;
1090 } else {
1091 int merge = 0;
1092 int i = skb_shinfo(skb)->nr_frags;
1093 struct page *page = TCP_PAGE(sk);
1094 int off = TCP_OFF(sk);
1095
1096 if (can_coalesce(skb, i, page, off) && off != PAGE_SIZE) {
1097 /* We can extend the last page fragment. */
1098 merge = 1;
1099 } else if (i == MAX_SKB_FRAGS ||
1100 (i == 0 && !(sk->route_caps&NETIF_F_SG))) {
1101 /* Need to add new fragment and cannot
1102 * do this because interface is non-SG,
1103 * or because all the page slots are busy.
1104 */
1105 tcp_mark_push(tp, skb);
1106 goto new_segment;
1107 } else if (page) {
1108 /* If page is cached, align
1109 * offset to L1 cache boundary
1110 */
1111 off = (off+L1_CACHE_BYTES-1)&~(L1_CACHE_BYTES-1);
1112 if (off == PAGE_SIZE) {
1113 put_page(page);
1114 TCP_PAGE(sk) = page = NULL;
1115 }
1116 }
1117
1118 if (!page) {
1119 /* Allocate new cache page. */
1120 if (!(page=tcp_alloc_page(sk)))
1121 goto wait_for_memory;
1122 off = 0;
1123 }
1124
1125 if (copy > PAGE_SIZE-off)
1126 copy = PAGE_SIZE-off;
1127
1128 /* Time to copy data. We are close to the end! */
1129 err = tcp_copy_to_page(sk, from, skb, page, off, copy);
1130 if (err) {
1131 /* If this page was new, give it to the
1132 * socket so it does not get leaked.
1133 */
1134 if (TCP_PAGE(sk) == NULL) {
1135 TCP_PAGE(sk) = page;
1136 TCP_OFF(sk) = 0;
1137 }
1138 goto do_error;
1139 }
1140
1141 /* Update the skb. */
1142 if (merge) {
1143 skb_shinfo(skb)->frags[i-1].size += copy;
1144 } else {
1145 fill_page_desc(skb, i, page, off, copy);
1146 if (TCP_PAGE(sk)) {
1147 get_page(page);
1148 } else if (off + copy < PAGE_SIZE) {
1149 get_page(page);
1150 TCP_PAGE(sk) = page;
1151 }
1152 }
1153
1154 TCP_OFF(sk) = off+copy;
1155 }
1156
1157 if (!copied)
1158 TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
1159
1160 tp->write_seq += copy;
1161 TCP_SKB_CB(skb)->end_seq += copy;
1162
1163 from += copy;
1164 copied += copy;
1165 if ((seglen -= copy) == 0 && iovlen == 0)
1166 goto out;
1167
1168 if (skb->len != mss_now || (flags&MSG_OOB))
1169 continue;
1170
1171 if (forced_push(tp)) {
1172 tcp_mark_push(tp, skb);
1173 __tcp_push_pending_frames(sk, tp, mss_now, 1);
1174 } else if (skb == tp->send_head)
1175 tcp_push_one(sk, mss_now);
1176 continue;
1177
1178 wait_for_sndbuf:
1179 set_bit(SOCK_NOSPACE, &sk->socket->flags);
1180 wait_for_memory:
1181 if (copied)
1182 tcp_push(sk, tp, flags&~MSG_MORE, mss_now, 1);
1183
1184 if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
1185 goto do_error;
1186
1187 mss_now = tcp_current_mss(sk);
1188 }
1189 }
1190
1191 out:
1192 if (copied)
1193 tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1194 TCP_CHECK_TIMER(sk);
1195 release_sock(sk);
1196 return copied;
1197
1198 do_fault:
1199 if (skb->len == 0) {
1200 if (tp->send_head == skb)
1201 tp->send_head = NULL;
1202 __skb_unlink(skb, skb->list);
1203 tcp_free_skb(sk, skb);
1204 }
1205
1206 do_error:
1207 if (copied)
1208 goto out;
1209 out_err:
1210 err = tcp_error(sk, flags, err);
1211 TCP_CHECK_TIMER(sk);
1212 release_sock(sk);
1213 return err;
1214 }
1215
1216 /*
1217 * Handle reading urgent data. BSD has very simple semantics for
1218 * this, no blocking and very strange errors 8)
1219 */
1220
1221 static int tcp_recv_urg(struct sock * sk, long timeo,
1222 struct msghdr *msg, int len, int flags,
1223 int *addr_len)
1224 {
1225 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1226
1227 /* No URG data to read. */
1228 if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ)
1229 return -EINVAL; /* Yes this is right ! */
1230
1231 if (sk->state==TCP_CLOSE && !sk->done)
1232 return -ENOTCONN;
1233
1234 if (tp->urg_data & TCP_URG_VALID) {
1235 int err = 0;
1236 char c = tp->urg_data;
1237
1238 if (!(flags & MSG_PEEK))
1239 tp->urg_data = TCP_URG_READ;
1240
1241 /* Read urgent data. */
1242 msg->msg_flags|=MSG_OOB;
1243
1244 if(len>0) {
1245 if (!(flags & MSG_TRUNC))
1246 err = memcpy_toiovec(msg->msg_iov, &c, 1);
1247 len = 1;
1248 } else
1249 msg->msg_flags|=MSG_TRUNC;
1250
1251 return err ? -EFAULT : len;
1252 }
1253
1254 if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN))
1255 return 0;
1256
1257 /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and
1258 * the available implementations agree in this case:
1259 * this call should never block, independent of the
1260 * blocking state of the socket.
1261 * Mike <pall@rz.uni-karlsruhe.de>
1262 */
1263 return -EAGAIN;
1264 }
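/* The BSD-style urgent data interface above, seen from user space --
 * a minimal sketch under #if 0 (not part of this file, error handling
 * elided).  With SO_OOBINLINE off, at most one urgent byte is available
 * via MSG_OOB and the call never blocks: it typically fails with EINVAL
 * when there is nothing urgent to read and EAGAIN when the mark is
 * known but the byte has not arrived yet.
 */
#if 0
#include <sys/socket.h>
#include <errno.h>

static int read_oob_byte(int fd, char *byte)
{
	ssize_t n = recv(fd, byte, 1, MSG_OOB);

	if (n == 1)
		return 1;	/* got the urgent byte */
	if (n < 0 && (errno == EINVAL || errno == EAGAIN))
		return 0;	/* nothing urgent to read right now */
	return -1;		/* EOF or real error */
}
#endif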
1265
1266 /*
1267 * Release a skb if it is no longer needed. This routine
1268 * must be called with interrupts disabled or with the
1269 * socket locked so that the sk_buff queue operation is ok.
1270 */
1271
1272 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
1273 {
1274 __skb_unlink(skb, &sk->receive_queue);
1275 __kfree_skb(skb);
1276 }
1277
1278 /* Clean up the receive buffer for full frames taken by the user,
1279 * then send an ACK if necessary. COPIED is the number of bytes
1280 * tcp_recvmsg has given to the user so far, it speeds up the
1281 * calculation of whether or not we must ACK for the sake of
1282 * a window update.
1283 */
1284 static void cleanup_rbuf(struct sock *sk, int copied)
1285 {
1286 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1287 int time_to_ack = 0;
1288
1289 #if TCP_DEBUG
1290 struct sk_buff *skb = skb_peek(&sk->receive_queue);
1291
1292 BUG_TRAP(skb==NULL || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1293 #endif
1294
1295 if (tcp_ack_scheduled(tp)) {
1296 /* Delayed ACKs frequently hit locked sockets during bulk receive. */
1297 if (tp->ack.blocked
1298 /* Once-per-two-segments ACK was not sent by tcp_input.c */
1299 || tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss
1300 /*
1301 * If this read emptied read buffer, we send ACK, if
1302 * connection is not bidirectional, user drained
1303 * receive buffer and there was a small segment
1304 * in queue.
1305 */
1306 || (copied > 0 &&
1307 (tp->ack.pending&TCP_ACK_PUSHED) &&
1308 !tp->ack.pingpong &&
1309 atomic_read(&sk->rmem_alloc) == 0)) {
1310 time_to_ack = 1;
1311 }
1312 }
1313
1314 /* We send an ACK if we can now advertise a non-zero window
1315 * which has been raised "significantly".
1316 *
1317 * Even if window raised up to infinity, do not send window open ACK
1318 * in states, where we will not receive more. It is useless.
1319 */
1320 if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) {
1321 __u32 rcv_window_now = tcp_receive_window(tp);
1322
1323 /* Optimize, __tcp_select_window() is not cheap. */
1324 if (2*rcv_window_now <= tp->window_clamp) {
1325 __u32 new_window = __tcp_select_window(sk);
1326
1327 /* Send ACK now, if this read freed lots of space
1328 * in our buffer. Certainly, new_window is new window.
1329 * We can advertise it now, if it is not less than current one.
1330 * "Lots" means "at least twice" here.
1331 */
1332 if(new_window && new_window >= 2*rcv_window_now)
1333 time_to_ack = 1;
1334 }
1335 }
1336 if (time_to_ack)
1337 tcp_send_ack(sk);
1338 }
1339
1340 /* Now socket state including sk->err is changed only under lock,
1341 * hence we may omit checks after joining wait queue.
1342 * We check receive queue before schedule() only as optimization;
1343 * it is very likely that release_sock() added new data.
1344 */
1345
1346 static long tcp_data_wait(struct sock *sk, long timeo)
1347 {
1348 DECLARE_WAITQUEUE(wait, current);
1349
1350 add_wait_queue(sk->sleep, &wait);
1351
1352 __set_current_state(TASK_INTERRUPTIBLE);
1353
1354 set_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1355 release_sock(sk);
1356
1357 if (skb_queue_empty(&sk->receive_queue))
1358 timeo = schedule_timeout(timeo);
1359
1360 lock_sock(sk);
1361 clear_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1362
1363 remove_wait_queue(sk->sleep, &wait);
1364 __set_current_state(TASK_RUNNING);
1365 return timeo;
1366 }
1367
1368 static void tcp_prequeue_process(struct sock *sk)
1369 {
1370 struct sk_buff *skb;
1371 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1372
1373 net_statistics[smp_processor_id()*2+1].TCPPrequeued += skb_queue_len(&tp->ucopy.prequeue);
1374
1375 /* RX process wants to run with disabled BHs, though it is not necessary */
1376 local_bh_disable();
1377 while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1378 sk->backlog_rcv(sk, skb);
1379 local_bh_enable();
1380
1381 /* Clear memory counter. */
1382 tp->ucopy.memory = 0;
1383 }
1384
1385 static inline
1386 struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1387 {
1388 struct sk_buff *skb;
1389 u32 offset;
1390
1391 skb_queue_walk(&sk->receive_queue, skb) {
1392 offset = seq - TCP_SKB_CB(skb)->seq;
1393 if (skb->h.th->syn)
1394 offset--;
1395 if (offset < skb->len || skb->h.th->fin) {
1396 *off = offset;
1397 return skb;
1398 }
1399 }
1400 return NULL;
1401 }
1402
1403 /*
1404 * This routine provides an alternative to tcp_recvmsg() for routines
1405 * that would like to handle copying from skbuffs directly in 'sendfile'
1406 * fashion.
1407 * Note:
1408 * - It is assumed that the socket was locked by the caller.
1409 * - The routine does not block.
1410 * - At present, there is no support for reading OOB data
1411 * or for 'peeking' the socket using this routine
1412 * (although both would be easy to implement).
1413 */
1414 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1415 sk_read_actor_t recv_actor)
1416 {
1417 struct sk_buff *skb;
1418 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1419 u32 seq = tp->copied_seq;
1420 u32 offset;
1421 int copied = 0;
1422
1423 if (sk->state == TCP_LISTEN)
1424 return -ENOTCONN;
1425 while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1426 if (offset < skb->len) {
1427 size_t used, len;
1428
1429 len = skb->len - offset;
1430 /* Stop reading if we hit a patch of urgent data */
1431 if (tp->urg_data) {
1432 u32 urg_offset = tp->urg_seq - seq;
1433 if (urg_offset < len)
1434 len = urg_offset;
1435 if (!len)
1436 break;
1437 }
1438 used = recv_actor(desc, skb, offset, len);
1439 if (used <= len) {
1440 seq += used;
1441 copied += used;
1442 offset += used;
1443 }
1444 if (offset != skb->len)
1445 break;
1446 }
1447 if (skb->h.th->fin) {
1448 tcp_eat_skb(sk, skb);
1449 ++seq;
1450 break;
1451 }
1452 tcp_eat_skb(sk, skb);
1453 if (!desc->count)
1454 break;
1455 }
1456 tp->copied_seq = seq;
1457
1458 tcp_rcv_space_adjust(sk);
1459
1460 /* Clean up data we have read: This will do ACK frames. */
1461 if (copied)
1462 cleanup_rbuf(sk, copied);
1463 return copied;
1464 }
1465
1466 /*
1467 * This routine copies from a sock struct into the user buffer.
1468 *
1469 * Technical note: in 2.3 we work on _locked_ socket, so that
1470 * tricks with *seq access order and skb->users are not required.
1471 * Probably, code can be easily improved even more.
1472 */
1473
1474 int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
1475 int len, int nonblock, int flags, int *addr_len)
1476 {
1477 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1478 int copied = 0;
1479 u32 peek_seq;
1480 u32 *seq;
1481 unsigned long used;
1482 int err;
1483 int target; /* Read at least this many bytes */
1484 long timeo;
1485 struct task_struct *user_recv = NULL;
1486
1487 lock_sock(sk);
1488
1489 TCP_CHECK_TIMER(sk);
1490
1491 err = -ENOTCONN;
1492 if (sk->state == TCP_LISTEN)
1493 goto out;
1494
1495 timeo = sock_rcvtimeo(sk, nonblock);
1496
1497 /* Urgent data needs to be handled specially. */
1498 if (flags & MSG_OOB)
1499 goto recv_urg;
1500
1501 seq = &tp->copied_seq;
1502 if (flags & MSG_PEEK) {
1503 peek_seq = tp->copied_seq;
1504 seq = &peek_seq;
1505 }
1506
1507 target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1508
1509 do {
1510 struct sk_buff * skb;
1511 u32 offset;
1512
1513 /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1514 if (tp->urg_data && tp->urg_seq == *seq) {
1515 if (copied)
1516 break;
1517 if (signal_pending(current)) {
1518 copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1519 break;
1520 }
1521 }
1522
1523 /* Next get a buffer. */
1524
1525 skb = skb_peek(&sk->receive_queue);
1526 do {
1527 if (!skb)
1528 break;
1529
1530 /* Now that we have two receive queues this
1531 * shouldn't happen.
1532 */
1533 if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1534 printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
1535 *seq, TCP_SKB_CB(skb)->seq);
1536 break;
1537 }
1538 offset = *seq - TCP_SKB_CB(skb)->seq;
1539 if (skb->h.th->syn)
1540 offset--;
1541 if (offset < skb->len)
1542 goto found_ok_skb;
1543 if (skb->h.th->fin)
1544 goto found_fin_ok;
1545 BUG_TRAP(flags&MSG_PEEK);
1546 skb = skb->next;
1547 } while (skb != (struct sk_buff *)&sk->receive_queue);
1548
1549 /* Well, if we have backlog, try to process it now. */
1550
1551 if (copied >= target && sk->backlog.tail == NULL)
1552 break;
1553
1554 if (copied) {
1555 if (sk->err ||
1556 sk->state == TCP_CLOSE ||
1557 (sk->shutdown & RCV_SHUTDOWN) ||
1558 !timeo ||
1559 signal_pending(current) ||
1560 (flags & MSG_PEEK))
1561 break;
1562 } else {
1563 if (sk->done)
1564 break;
1565
1566 if (sk->err) {
1567 copied = sock_error(sk);
1568 break;
1569 }
1570
1571 if (sk->shutdown & RCV_SHUTDOWN)
1572 break;
1573
1574 if (sk->state == TCP_CLOSE) {
1575 if (!sk->done) {
1576 /* This occurs when user tries to read
1577 * from never connected socket.
1578 */
1579 copied = -ENOTCONN;
1580 break;
1581 }
1582 break;
1583 }
1584
1585 if (!timeo) {
1586 copied = -EAGAIN;
1587 break;
1588 }
1589
1590 if (signal_pending(current)) {
1591 copied = sock_intr_errno(timeo);
1592 break;
1593 }
1594 }
1595
1596 cleanup_rbuf(sk, copied);
1597
1598 if (tp->ucopy.task == user_recv) {
1599 /* Install new reader */
1600 if (user_recv == NULL && !(flags&(MSG_TRUNC|MSG_PEEK))) {
1601 user_recv = current;
1602 tp->ucopy.task = user_recv;
1603 tp->ucopy.iov = msg->msg_iov;
1604 }
1605
1606 tp->ucopy.len = len;
1607
1608 BUG_TRAP(tp->copied_seq == tp->rcv_nxt || (flags&(MSG_PEEK|MSG_TRUNC)));
1609
1610 /* Ugly... If prequeue is not empty, we have to
1611 * process it before releasing socket, otherwise
1612 * order will be broken at second iteration.
1613 * More elegant solution is required!!!
1614 *
1615 * Look: we have the following (pseudo)queues:
1616 *
1617 * 1. packets in flight
1618 * 2. backlog
1619 * 3. prequeue
1620 * 4. receive_queue
1621 *
1622 * Each queue can be processed only if the next ones
1623 * are empty. At this point we have empty receive_queue.
1624 * But prequeue _can_ be not empty after second iteration,
1625 * when we jumped to start of loop because backlog
1626 * processing added something to receive_queue.
1627 * We cannot release_sock(), because backlog contains
1628 * packets arrived _after_ prequeued ones.
1629 *
1630 * In short, the algorithm is clear --- process all
1631 * the queues in order. We could do it more directly,
1632 * requeueing packets from backlog to prequeue if it
1633 * is not empty. That is more elegant, but eats cycles,
1634 * unfortunately.
1635 */
1636 if (skb_queue_len(&tp->ucopy.prequeue))
1637 goto do_prequeue;
1638
1639 /* __ Set realtime policy in scheduler __ */
1640 }
1641
1642 if (copied >= target) {
1643 /* Do not sleep, just process backlog. */
1644 release_sock(sk);
1645 lock_sock(sk);
1646 } else {
1647 timeo = tcp_data_wait(sk, timeo);
1648 }
1649
1650 if (user_recv) {
1651 int chunk;
1652
1653 /* __ Restore normal policy in scheduler __ */
1654
1655 if ((chunk = len - tp->ucopy.len) != 0) {
1656 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromBacklog += chunk;
1657 len -= chunk;
1658 copied += chunk;
1659 }
1660
1661 if (tp->rcv_nxt == tp->copied_seq &&
1662 skb_queue_len(&tp->ucopy.prequeue)) {
1663 do_prequeue:
1664 tcp_prequeue_process(sk);
1665
1666 if ((chunk = len - tp->ucopy.len) != 0) {
1667 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1668 len -= chunk;
1669 copied += chunk;
1670 }
1671 }
1672 }
1673 if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1674 if (net_ratelimit())
1675 printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1676 current->comm, current->pid);
1677 peek_seq = tp->copied_seq;
1678 }
1679 continue;
1680
1681 found_ok_skb:
1682 /* Ok so how much can we use? */
1683 used = skb->len - offset;
1684 if (len < used)
1685 used = len;
1686
1687 /* Do we have urgent data here? */
1688 if (tp->urg_data) {
1689 u32 urg_offset = tp->urg_seq - *seq;
1690 if (urg_offset < used) {
1691 if (!urg_offset) {
1692 if (!sk->urginline) {
1693 ++*seq;
1694 offset++;
1695 used--;
1696 if (!used)
1697 goto skip_copy;
1698 }
1699 } else
1700 used = urg_offset;
1701 }
1702 }
1703
1704 if (!(flags&MSG_TRUNC)) {
1705 err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, used);
1706 if (err) {
1707 /* Exception. Bailout! */
1708 if (!copied)
1709 copied = -EFAULT;
1710 break;
1711 }
1712 }
1713
1714 *seq += used;
1715 copied += used;
1716 len -= used;
1717
1718 tcp_rcv_space_adjust(sk);
1719
1720 skip_copy:
1721 if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
1722 tp->urg_data = 0;
1723 tcp_fast_path_check(sk, tp);
1724 }
1725 if (used + offset < skb->len)
1726 continue;
1727
1728 if (skb->h.th->fin)
1729 goto found_fin_ok;
1730 if (!(flags & MSG_PEEK))
1731 tcp_eat_skb(sk, skb);
1732 continue;
1733
1734 found_fin_ok:
1735 /* Process the FIN. */
1736 ++*seq;
1737 if (!(flags & MSG_PEEK))
1738 tcp_eat_skb(sk, skb);
1739 break;
1740 } while (len > 0);
1741
1742 if (user_recv) {
1743 if (skb_queue_len(&tp->ucopy.prequeue)) {
1744 int chunk;
1745
1746 tp->ucopy.len = copied > 0 ? len : 0;
1747
1748 tcp_prequeue_process(sk);
1749
1750 if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1751 net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1752 len -= chunk;
1753 copied += chunk;
1754 }
1755 }
1756
1757 tp->ucopy.task = NULL;
1758 tp->ucopy.len = 0;
1759 }
1760
1761 /* According to UNIX98, msg_name/msg_namelen are ignored
1762 * on a connected socket. I was just happy when I found this 8) --ANK
1763 */
1764
1765 /* Clean up data we have read: This will do ACK frames. */
1766 cleanup_rbuf(sk, copied);
1767
1768 TCP_CHECK_TIMER(sk);
1769 release_sock(sk);
1770 return copied;
1771
1772 out:
1773 TCP_CHECK_TIMER(sk);
1774 release_sock(sk);
1775 return err;
1776
1777 recv_urg:
1778 err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1779 goto out;
1780 }
1781
1782 /*
1783 * State processing on a close. This implements the state shift for
1784 * sending our FIN frame. Note that we only send a FIN for some
1785 * states. A shutdown() may have already sent the FIN, or we may be
1786 * closed.
1787 */
1788
1789 static unsigned char new_state[16] = {
1790 /* current state: new state: action: */
1791 /* (Invalid) */ TCP_CLOSE,
1792 /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1793 /* TCP_SYN_SENT */ TCP_CLOSE,
1794 /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1795 /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1,
1796 /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2,
1797 /* TCP_TIME_WAIT */ TCP_CLOSE,
1798 /* TCP_CLOSE */ TCP_CLOSE,
1799 /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN,
1800 /* TCP_LAST_ACK */ TCP_LAST_ACK,
1801 /* TCP_LISTEN */ TCP_CLOSE,
1802 /* TCP_CLOSING */ TCP_CLOSING,
1803 };
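/* Each entry encodes the next state in its low bits (extracted with
 * TCP_STATE_MASK) plus an optional TCP_ACTION_FIN flag telling
 * tcp_close_state() below that a FIN has to be sent as part of the
 * transition.
 */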
1804
1805 static int tcp_close_state(struct sock *sk)
1806 {
1807 int next = (int) new_state[sk->state];
1808 int ns = (next & TCP_STATE_MASK);
1809
1810 tcp_set_state(sk, ns);
1811
1812 return (next & TCP_ACTION_FIN);
1813 }
1814
1815 /*
1816 * Shutdown the sending side of a connection. Much like close except
1817 * that we don't receive shut down or set sk->dead.
1818 */
1819
1820 void tcp_shutdown(struct sock *sk, int how)
1821 {
1822 /* We need to grab some memory, and put together a FIN,
1823 * and then put it into the queue to be sent.
1824 * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1825 */
1826 if (!(how & SEND_SHUTDOWN))
1827 return;
1828
1829 /* If we've already sent a FIN, or it's a closed state, skip this. */
1830 if ((1 << sk->state) &
1831 (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
1832 /* Clear out any half completed packets. FIN if needed. */
1833 if (tcp_close_state(sk))
1834 tcp_send_fin(sk);
1835 }
1836 }
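/* Seen from user space -- a minimal sketch under #if 0 (not part of
 * this file).  shutdown(fd, SHUT_WR) reaches tcp_shutdown() via
 * inet_shutdown() with SEND_SHUTDOWN set: a FIN is queued and the
 * socket moves to FIN_WAIT1 (or LAST_ACK), while the receive side stays
 * open so the peer's remaining data can still be read.
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

static void half_close(int fd)
{
	char buf[4096];
	ssize_t n;

	shutdown(fd, SHUT_WR);		/* send FIN, keep reading */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		;			/* drain until the peer's FIN (EOF) */
	close(fd);
}
#endif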
1837
1838
1839 /*
1840 * Return 1 if we still have things to send in our buffers.
1841 */
1842
1843 static inline int closing(struct sock * sk)
1844 {
1845 return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
1846 }
1847
1848 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1849 {
1850 /* First the read buffer. */
1851 __skb_queue_purge(&sk->receive_queue);
1852
1853 /* Next, the error queue. */
1854 __skb_queue_purge(&sk->error_queue);
1855
1856 /* Next, the write queue. */
1857 BUG_TRAP(skb_queue_empty(&sk->write_queue));
1858
1859 /* Account for returned memory. */
1860 tcp_mem_reclaim(sk);
1861
1862 BUG_TRAP(sk->wmem_queued == 0);
1863 BUG_TRAP(sk->forward_alloc == 0);
1864
1865 /* It is _impossible_ for the backlog to contain anything
1866 * when we get here. All user references to this socket
1867 * have gone away; only the net layer can still touch it.
1868 */
1869 }
1870
1871 /*
1872 * At this point, there should be no process reference to this
1873 * socket, and thus no user references at all. Therefore we
1874 * can assume the socket waitqueue is inactive and nobody will
1875 * try to jump onto it.
1876 */
1877 void tcp_destroy_sock(struct sock *sk)
1878 {
1879 BUG_TRAP(sk->state==TCP_CLOSE);
1880 BUG_TRAP(sk->dead);
1881
1882 /* It cannot be in hash table! */
1883 BUG_TRAP(sk->pprev==NULL);
1884
1885 	/* If sk->num is nonzero, the socket must be bound. */
1886 BUG_TRAP(!sk->num || sk->prev!=NULL);
1887
1888 #ifdef TCP_DEBUG
1889 if (sk->zapped) {
1890 printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1891 sock_hold(sk);
1892 }
1893 sk->zapped = 1;
1894 #endif
1895
1896 sk->prot->destroy(sk);
1897
1898 tcp_kill_sk_queues(sk);
1899
1900 #ifdef INET_REFCNT_DEBUG
1901 if (atomic_read(&sk->refcnt) != 1) {
1902 printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt));
1903 }
1904 #endif
1905
1906 atomic_dec(&tcp_orphan_count);
1907 sock_put(sk);
1908 }
1909
1910 void tcp_close(struct sock *sk, long timeout)
1911 {
1912 struct sk_buff *skb;
1913 int data_was_unread = 0;
1914
1915 lock_sock(sk);
1916 sk->shutdown = SHUTDOWN_MASK;
1917
1918 if(sk->state == TCP_LISTEN) {
1919 tcp_set_state(sk, TCP_CLOSE);
1920
1921 /* Special case. */
1922 tcp_listen_stop(sk);
1923
1924 goto adjudge_to_death;
1925 }
1926
1927 /* We need to flush the recv. buffs. We do this only on the
1928 * descriptor close, not protocol-sourced closes, because the
1929 * reader process may not have drained the data yet!
1930 */
1931 while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
1932 u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
1933 data_was_unread += len;
1934 __kfree_skb(skb);
1935 }
1936
1937 tcp_mem_reclaim(sk);
1938
1939 /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1940 * 3.10, we send a RST here because data was lost. To
1941 * witness the awful effects of the old behavior of always
1942 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1943 * a bulk GET in an FTP client, suspend the process, wait
1944 * for the client to advertise a zero window, then kill -9
1945 * the FTP client, wheee... Note: timeout is always zero
1946 * in such a case.
1947 */
1948 if(data_was_unread != 0) {
1949 /* Unread data was tossed, zap the connection. */
1950 NET_INC_STATS_USER(TCPAbortOnClose);
1951 tcp_set_state(sk, TCP_CLOSE);
1952 tcp_send_active_reset(sk, GFP_KERNEL);
1953 } else if (sk->linger && sk->lingertime==0) {
1954 /* Check zero linger _after_ checking for unread data. */
1955 sk->prot->disconnect(sk, 0);
1956 NET_INC_STATS_USER(TCPAbortOnData);
1957 } else if (tcp_close_state(sk)) {
1958 /* We FIN if the application ate all the data before
1959 * zapping the connection.
1960 */
1961
1962 /* RED-PEN. Formally speaking, we have broken TCP state
1963 * machine. State transitions:
1964 *
1965 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1966 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible)
1967 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1968 *
1969 * are legal only when FIN has been sent (i.e. in window),
1970 		 * rather than queued out of window. Purists may object.
1971 		 *
1972 		 * For example, the "RFC state" is ESTABLISHED
1973 		 * if the Linux state is FIN-WAIT-1 but the FIN has not been sent yet.
1974 		 *
1975 		 * The visible deviations are that we sometimes enter the
1976 		 * time-wait state when it is not really required (harmless),
1977 		 * and do not send active resets when the specs require them
1978 		 * (for TCP_ESTABLISHED and TCP_CLOSE_WAIT, which look like
1979 		 * CLOSING or LAST_ACK to Linux).
1980 		 * Probably I have missed some more corner cases.
1981 * --ANK
1982 */
1983 tcp_send_fin(sk);
1984 }
1985
1986 if (timeout) {
1987 struct task_struct *tsk = current;
1988 DECLARE_WAITQUEUE(wait, current);
1989
1990 add_wait_queue(sk->sleep, &wait);
1991
1992 do {
1993 set_current_state(TASK_INTERRUPTIBLE);
1994 if (!closing(sk))
1995 break;
1996 release_sock(sk);
1997 timeout = schedule_timeout(timeout);
1998 lock_sock(sk);
1999 } while (!signal_pending(tsk) && timeout);
2000
2001 tsk->state = TASK_RUNNING;
2002 remove_wait_queue(sk->sleep, &wait);
2003 }
2004
2005 adjudge_to_death:
2006 	/* This is the last release_sock of this socket's life. It will remove the backlog. */
2007 release_sock(sk);
2008
2009
2010 	/* Now the socket is owned by the kernel and we acquire the BH lock
2011 	   to finish the close. No need to check for user refs.
2012 */
2013 local_bh_disable();
2014 bh_lock_sock(sk);
2015 BUG_TRAP(sk->lock.users==0);
2016
2017 sock_hold(sk);
2018 sock_orphan(sk);
2019
2020 	/* This is a (useful) BSD violation of the RFC. There is a
2021 	 * problem with TCP as specified in that the other end could
2022 	 * keep a socket open forever with no application left on this end.
2023 	 * We use a 3 minute timeout (about the same as BSD) and then kill
2024 	 * our end. If they send after that then tough - BUT the timeout is
2025 	 * long enough that we avoid the old "4*rto = almost no time"
2026 	 * reset mistake.
2027 	 *
2028 	 * Nope, it was not a mistake. It is really the desired behaviour,
2029 	 * e.g. on HTTP servers, where such sockets are useless but
2030 	 * consume significant resources. Let's do it with the special
2031 	 * linger2 option.					--ANK
2032 */
2033
2034 if (sk->state == TCP_FIN_WAIT2) {
2035 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2036 if (tp->linger2 < 0) {
2037 tcp_set_state(sk, TCP_CLOSE);
2038 tcp_send_active_reset(sk, GFP_ATOMIC);
2039 NET_INC_STATS_BH(TCPAbortOnLinger);
2040 } else {
2041 int tmo = tcp_fin_time(tp);
2042
2043 if (tmo > TCP_TIMEWAIT_LEN) {
2044 tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
2045 } else {
2046 atomic_inc(&tcp_orphan_count);
2047 tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2048 goto out;
2049 }
2050 }
2051 }
2052 if (sk->state != TCP_CLOSE) {
2053 tcp_mem_reclaim(sk);
2054 if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
2055 (sk->wmem_queued > SOCK_MIN_SNDBUF &&
2056 atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
2057 if (net_ratelimit())
2058 			printk(KERN_INFO "TCP: too many orphaned sockets\n");
2059 tcp_set_state(sk, TCP_CLOSE);
2060 tcp_send_active_reset(sk, GFP_ATOMIC);
2061 NET_INC_STATS_BH(TCPAbortOnMemory);
2062 }
2063 }
2064 atomic_inc(&tcp_orphan_count);
2065
2066 if (sk->state == TCP_CLOSE)
2067 tcp_destroy_sock(sk);
2068 /* Otherwise, socket is reprieved until protocol close. */
2069
2070 out:
2071 bh_unlock_sock(sk);
2072 local_bh_enable();
2073 sock_put(sk);
2074 }
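/*
 * Illustrative user-space sketch (editorial addition): the close paths
 * above can be exercised by an application.  Unread receive data, or a
 * zero SO_LINGER timeout, makes the close abortive (RST) instead of an
 * orderly FIN:
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	struct linger lg = { 1, 0 };	// l_onoff = 1, l_linger = 0: abort on close
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 *	close(fd);			// sends RST rather than FIN
 */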
2075
2076 /* These states need RST on ABORT according to RFC793 */
2077
2078 static inline int tcp_need_reset(int state)
2079 {
2080 return ((1 << state) &
2081 (TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
2082 TCPF_FIN_WAIT2|TCPF_SYN_RECV));
2083 }
2084
2085 int tcp_disconnect(struct sock *sk, int flags)
2086 {
2087 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2088 int old_state;
2089 int err = 0;
2090
2091 old_state = sk->state;
2092 if (old_state != TCP_CLOSE)
2093 tcp_set_state(sk, TCP_CLOSE);
2094
2095 /* ABORT function of RFC793 */
2096 if (old_state == TCP_LISTEN) {
2097 tcp_listen_stop(sk);
2098 } else if (tcp_need_reset(old_state) ||
2099 (tp->snd_nxt != tp->write_seq &&
2100 (1<<old_state)&(TCPF_CLOSING|TCPF_LAST_ACK))) {
2101 		/* The last check adjusts for the discrepancy between Linux
2102 		 * and the RFC states.
2103 */
2104 tcp_send_active_reset(sk, gfp_any());
2105 sk->err = ECONNRESET;
2106 } else if (old_state == TCP_SYN_SENT)
2107 sk->err = ECONNRESET;
2108
2109 tcp_clear_xmit_timers(sk);
2110 __skb_queue_purge(&sk->receive_queue);
2111 tcp_writequeue_purge(sk);
2112 __skb_queue_purge(&tp->out_of_order_queue);
2113
2114 sk->dport = 0;
2115
2116 if (!(sk->userlocks&SOCK_BINDADDR_LOCK)) {
2117 sk->rcv_saddr = 0;
2118 sk->saddr = 0;
2119 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
2120 memset(&sk->net_pinfo.af_inet6.saddr, 0, 16);
2121 memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16);
2122 #endif
2123 }
2124
2125 sk->shutdown = 0;
2126 sk->done = 0;
2127 tp->srtt = 0;
2128 if ((tp->write_seq += tp->max_window+2) == 0)
2129 tp->write_seq = 1;
2130 tp->backoff = 0;
2131 tp->snd_cwnd = 2;
2132 tp->probes_out = 0;
2133 tp->packets_out = 0;
2134 tp->snd_ssthresh = 0x7fffffff;
2135 tp->snd_cwnd_cnt = 0;
2136 tcp_set_ca_state(tp, TCP_CA_Open);
2137 tcp_clear_retrans(tp);
2138 tcp_delack_init(tp);
2139 tp->send_head = NULL;
2140 tp->saw_tstamp = 0;
2141 tcp_sack_reset(tp);
2142 __sk_dst_reset(sk);
2143
2144 BUG_TRAP(!sk->num || sk->prev);
2145
2146 sk->error_report(sk);
2147 return err;
2148 }
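/*
 * Illustrative note (editorial addition, hedged): tcp_disconnect() is
 * typically reached from user space by connect()ing the socket to an
 * AF_UNSPEC address, which the stream connect code treats as a
 * disconnect request:
 *
 *	#include <sys/socket.h>
 *	#include <string.h>
 *
 *	struct sockaddr sa;
 *	memset(&sa, 0, sizeof(sa));
 *	sa.sa_family = AF_UNSPEC;
 *	connect(fd, &sa, sizeof(sa));	// drops the association as above
 */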
2149
2150 /*
2151 * Wait for an incoming connection, avoid race
2152 * conditions. This must be called with the socket locked.
2153 */
2154 static int wait_for_connect(struct sock * sk, long timeo)
2155 {
2156 DECLARE_WAITQUEUE(wait, current);
2157 int err;
2158
2159 /*
2160 * True wake-one mechanism for incoming connections: only
2161 * one process gets woken up, not the 'whole herd'.
2162 * Since we do not 'race & poll' for established sockets
2163 * anymore, the common case will execute the loop only once.
2164 *
2165 * Subtle issue: "add_wait_queue_exclusive()" will be added
2166 * after any current non-exclusive waiters, and we know that
2167 * it will always _stay_ after any new non-exclusive waiters
2168 * because all non-exclusive waiters are added at the
2169 * beginning of the wait-queue. As such, it's ok to "drop"
2170 * our exclusiveness temporarily when we get woken up without
2171 * having to remove and re-insert us on the wait queue.
2172 */
2173 add_wait_queue_exclusive(sk->sleep, &wait);
2174 for (;;) {
2175 current->state = TASK_INTERRUPTIBLE;
2176 release_sock(sk);
2177 if (sk->tp_pinfo.af_tcp.accept_queue == NULL)
2178 timeo = schedule_timeout(timeo);
2179 lock_sock(sk);
2180 err = 0;
2181 if (sk->tp_pinfo.af_tcp.accept_queue)
2182 break;
2183 err = -EINVAL;
2184 if (sk->state != TCP_LISTEN)
2185 break;
2186 err = sock_intr_errno(timeo);
2187 if (signal_pending(current))
2188 break;
2189 err = -EAGAIN;
2190 if (!timeo)
2191 break;
2192 }
2193 current->state = TASK_RUNNING;
2194 remove_wait_queue(sk->sleep, &wait);
2195 return err;
2196 }
2197
2198 /*
2199 * This will accept the next outstanding connection.
2200 */
2201
2202 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2203 {
2204 struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2205 struct open_request *req;
2206 struct sock *newsk;
2207 int error;
2208
2209 lock_sock(sk);
2210
2211 /* We need to make sure that this socket is listening,
2212 * and that it has something pending.
2213 */
2214 error = -EINVAL;
2215 if (sk->state != TCP_LISTEN)
2216 goto out;
2217
2218 /* Find already established connection */
2219 if (!tp->accept_queue) {
2220 long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2221
2222 		/* If this is a non-blocking socket, don't sleep. */
2223 error = -EAGAIN;
2224 if (!timeo)
2225 goto out;
2226
2227 error = wait_for_connect(sk, timeo);
2228 if (error)
2229 goto out;
2230 }
2231
2232 req = tp->accept_queue;
2233 if ((tp->accept_queue = req->dl_next) == NULL)
2234 tp->accept_queue_tail = NULL;
2235
2236 newsk = req->sk;
2237 tcp_acceptq_removed(sk);
2238 tcp_openreq_fastfree(req);
2239 BUG_TRAP(newsk->state != TCP_SYN_RECV);
2240 release_sock(sk);
2241 return newsk;
2242
2243 out:
2244 release_sock(sk);
2245 *err = error;
2246 return NULL;
2247 }
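/*
 * Usage sketch (illustrative, not part of the original file): the -EAGAIN
 * path above is what a non-blocking listener observes when nothing is
 * queued yet:
 *
 *	#include <sys/socket.h>
 *	#include <errno.h>
 *
 *	int c = accept(listen_fd, NULL, NULL);
 *	if (c < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
 *		;	// nothing pending; poll()/select() and retry
 */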
2248
2249 /*
2250 * Socket option code for TCP.
2251 */
2252
2253 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
2254 int optlen)
2255 {
2256 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2257 int val;
2258 int err = 0;
2259
2260 if (level != SOL_TCP)
2261 return tp->af_specific->setsockopt(sk, level, optname,
2262 optval, optlen);
2263
2264 if(optlen<sizeof(int))
2265 return -EINVAL;
2266
2267 if (get_user(val, (int *)optval))
2268 return -EFAULT;
2269
2270 lock_sock(sk);
2271
2272 switch(optname) {
2273 case TCP_MAXSEG:
2274 		/* Values greater than the interface MTU won't take effect. However,
2275 		 * at the point when this call is made we typically don't yet know
2276 		 * which interface is going to be used.
2277 */
2278 if(val < 8 || val > MAX_TCP_WINDOW) {
2279 err = -EINVAL;
2280 break;
2281 }
2282 tp->user_mss = val;
2283 break;
2284
2285 case TCP_NODELAY:
2286 /* You cannot try to use this and TCP_CORK in
2287 * tandem, so let the user know.
2288 */
2289 if (tp->nonagle == 2) {
2290 err = -EINVAL;
2291 break;
2292 }
2293 tp->nonagle = (val == 0) ? 0 : 1;
2294 if (val)
2295 tcp_push_pending_frames(sk, tp);
2296 break;
2297
2298 case TCP_CORK:
2299 		/* When set, this indicates that non-full frames should always be queued.
2300 * Later the user clears this option and we transmit
2301 * any pending partial frames in the queue. This is
2302 * meant to be used alongside sendfile() to get properly
2303 * filled frames when the user (for example) must write
2304 * out headers with a write() call first and then use
2305 * sendfile to send out the data parts.
2306 *
2307 * You cannot try to use TCP_NODELAY and this mechanism
2308 * at the same time, so let the user know.
2309 */
2310 if (tp->nonagle == 1) {
2311 err = -EINVAL;
2312 break;
2313 }
2314 if (val != 0) {
2315 tp->nonagle = 2;
2316 } else {
2317 tp->nonagle = 0;
2318
2319 tcp_push_pending_frames(sk, tp);
2320 }
2321 break;
2322
2323 case TCP_KEEPIDLE:
2324 if (val < 1 || val > MAX_TCP_KEEPIDLE)
2325 err = -EINVAL;
2326 else {
2327 tp->keepalive_time = val * HZ;
2328 if (sk->keepopen && !((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))) {
2329 __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2330 if (tp->keepalive_time > elapsed)
2331 elapsed = tp->keepalive_time - elapsed;
2332 else
2333 elapsed = 0;
2334 tcp_reset_keepalive_timer(sk, elapsed);
2335 }
2336 }
2337 break;
2338 case TCP_KEEPINTVL:
2339 if (val < 1 || val > MAX_TCP_KEEPINTVL)
2340 err = -EINVAL;
2341 else
2342 tp->keepalive_intvl = val * HZ;
2343 break;
2344 case TCP_KEEPCNT:
2345 if (val < 1 || val > MAX_TCP_KEEPCNT)
2346 err = -EINVAL;
2347 else
2348 tp->keepalive_probes = val;
2349 break;
2350 case TCP_SYNCNT:
2351 if (val < 1 || val > MAX_TCP_SYNCNT)
2352 err = -EINVAL;
2353 else
2354 tp->syn_retries = val;
2355 break;
2356
2357 case TCP_LINGER2:
2358 if (val < 0)
2359 tp->linger2 = -1;
2360 else if (val > sysctl_tcp_fin_timeout/HZ)
2361 tp->linger2 = 0;
2362 else
2363 tp->linger2 = val*HZ;
2364 break;
2365
2366 case TCP_DEFER_ACCEPT:
2367 tp->defer_accept = 0;
2368 if (val > 0) {
2369 /* Translate value in seconds to number of retransmits */
2370 while (tp->defer_accept < 32 && val > ((TCP_TIMEOUT_INIT/HZ)<<tp->defer_accept))
2371 tp->defer_accept++;
2372 tp->defer_accept++;
2373 }
2374 break;
2375
2376 case TCP_WINDOW_CLAMP:
2377 if (val==0) {
2378 if (sk->state != TCP_CLOSE) {
2379 err = -EINVAL;
2380 break;
2381 }
2382 tp->window_clamp = 0;
2383 } else {
2384 tp->window_clamp = val<SOCK_MIN_RCVBUF/2 ?
2385 SOCK_MIN_RCVBUF/2 : val;
2386 }
2387 break;
2388
2389 case TCP_QUICKACK:
2390 if (!val) {
2391 tp->ack.pingpong = 1;
2392 } else {
2393 tp->ack.pingpong = 0;
2394 if ((1<<sk->state)&(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT) &&
2395 tcp_ack_scheduled(tp)) {
2396 tp->ack.pending |= TCP_ACK_PUSHED;
2397 cleanup_rbuf(sk, 1);
2398 if (!(val & 1))
2399 tp->ack.pingpong = 1;
2400 }
2401 }
2402 break;
2403
2404 default:
2405 err = -ENOPROTOOPT;
2406 break;
2407 };
2408 release_sock(sk);
2409 return err;
2410 }
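/*
 * Usage sketch (illustrative, editorial addition; hdr, hdr_len, file_fd and
 * file_len are placeholder names): the TCP_CORK pattern described above,
 * i.e. cork while writing headers, then uncork so the pending partial frame
 * goes out together with the sendfile() data:
 *
 *	#include <netinet/tcp.h>
 *	#include <sys/sendfile.h>
 *	#include <unistd.h>
 *
 *	int on = 1, off = 0;
 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
 *	write(fd, hdr, hdr_len);			// queued, not sent as a runt
 *	sendfile(fd, file_fd, NULL, file_len);
 *	setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));	// flush partial frame
 */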
2411
2412 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
2413 int *optlen)
2414 {
2415 struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2416 int val, len;
2417
2418 if(level != SOL_TCP)
2419 return tp->af_specific->getsockopt(sk, level, optname,
2420 optval, optlen);
2421
2422 if(get_user(len,optlen))
2423 return -EFAULT;
2424
2425 len = min_t(unsigned int, len, sizeof(int));
2426
2427 if(len < 0)
2428 return -EINVAL;
2429
2430 switch(optname) {
2431 case TCP_MAXSEG:
2432 val = tp->mss_cache;
2433 if (val == 0 && ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN)))
2434 val = tp->user_mss;
2435 break;
2436 case TCP_NODELAY:
2437 val = (tp->nonagle == 1);
2438 break;
2439 case TCP_CORK:
2440 val = (tp->nonagle == 2);
2441 break;
2442 case TCP_KEEPIDLE:
2443 val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time)/HZ;
2444 break;
2445 case TCP_KEEPINTVL:
2446 val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl)/HZ;
2447 break;
2448 case TCP_KEEPCNT:
2449 val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2450 break;
2451 case TCP_SYNCNT:
2452 val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2453 break;
2454 case TCP_LINGER2:
2455 val = tp->linger2;
2456 if (val >= 0)
2457 val = (val ? : sysctl_tcp_fin_timeout)/HZ;
2458 break;
2459 case TCP_DEFER_ACCEPT:
2460 val = tp->defer_accept == 0 ? 0 : ((TCP_TIMEOUT_INIT/HZ)<<(tp->defer_accept-1));
2461 break;
2462 case TCP_WINDOW_CLAMP:
2463 val = tp->window_clamp;
2464 break;
2465 case TCP_INFO:
2466 {
2467 struct tcp_info info;
2468 u32 now = tcp_time_stamp;
2469
2470 if(get_user(len,optlen))
2471 return -EFAULT;
2472 info.tcpi_state = sk->state;
2473 info.tcpi_ca_state = tp->ca_state;
2474 info.tcpi_retransmits = tp->retransmits;
2475 info.tcpi_probes = tp->probes_out;
2476 info.tcpi_backoff = tp->backoff;
2477 info.tcpi_options = 0;
2478 if (tp->tstamp_ok)
2479 info.tcpi_options |= TCPI_OPT_TIMESTAMPS;
2480 if (tp->sack_ok)
2481 info.tcpi_options |= TCPI_OPT_SACK;
2482 if (tp->wscale_ok) {
2483 info.tcpi_options |= TCPI_OPT_WSCALE;
2484 info.tcpi_snd_wscale = tp->snd_wscale;
2485 info.tcpi_rcv_wscale = tp->rcv_wscale;
2486 } else {
2487 info.tcpi_snd_wscale = 0;
2488 info.tcpi_rcv_wscale = 0;
2489 }
2490 if (tp->ecn_flags&TCP_ECN_OK)
2491 info.tcpi_options |= TCPI_OPT_ECN;
2492
2493 info.tcpi_rto = (1000000*tp->rto)/HZ;
2494 info.tcpi_ato = (1000000*tp->ack.ato)/HZ;
2495 info.tcpi_snd_mss = tp->mss_cache;
2496 info.tcpi_rcv_mss = tp->ack.rcv_mss;
2497
2498 info.tcpi_unacked = tp->packets_out;
2499 info.tcpi_sacked = tp->sacked_out;
2500 info.tcpi_lost = tp->lost_out;
2501 info.tcpi_retrans = tp->retrans_out;
2502 info.tcpi_fackets = tp->fackets_out;
2503
2504 info.tcpi_last_data_sent = ((now - tp->lsndtime)*1000)/HZ;
2505 info.tcpi_last_ack_sent = 0;
2506 info.tcpi_last_data_recv = ((now - tp->ack.lrcvtime)*1000)/HZ;
2507 info.tcpi_last_ack_recv = ((now - tp->rcv_tstamp)*1000)/HZ;
2508
2509 info.tcpi_pmtu = tp->pmtu_cookie;
2510 info.tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2511 info.tcpi_rtt = ((1000000*tp->srtt)/HZ)>>3;
2512 info.tcpi_rttvar = ((1000000*tp->mdev)/HZ)>>2;
2513 info.tcpi_snd_ssthresh = tp->snd_ssthresh;
2514 info.tcpi_snd_cwnd = tp->snd_cwnd;
2515 info.tcpi_advmss = tp->advmss;
2516 info.tcpi_reordering = tp->reordering;
2517
2518 len = min_t(unsigned int, len, sizeof(info));
2519 if(put_user(len, optlen))
2520 return -EFAULT;
2521 if(copy_to_user(optval, &info,len))
2522 return -EFAULT;
2523 return 0;
2524 }
2525 case TCP_QUICKACK:
2526 val = !tp->ack.pingpong;
2527 break;
2528 default:
2529 return -ENOPROTOOPT;
2530 };
2531
2532 if(put_user(len, optlen))
2533 return -EFAULT;
2534 if(copy_to_user(optval, &val,len))
2535 return -EFAULT;
2536 return 0;
2537 }
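/*
 * Usage sketch (illustrative, editorial addition; assumes a libc that
 * exports struct tcp_info): reading the TCP_INFO block filled in above:
 *
 *	#include <sys/socket.h>
 *	#include <netinet/tcp.h>
 *	#include <stdio.h>
 *
 *	struct tcp_info ti;
 *	socklen_t len = sizeof(ti);
 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len) == 0)
 *		printf("rtt=%u us cwnd=%u\n", ti.tcpi_rtt, ti.tcpi_snd_cwnd);
 */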
2538
2539
2540 extern void __skb_cb_too_small_for_tcp(int, int);
2541 extern void tcpdiag_init(void);
2542
2543 void __init tcp_init(void)
2544 {
2545 struct sk_buff *skb = NULL;
2546 unsigned long goal;
2547 int order, i;
2548
2549 if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2550 __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2551 sizeof(skb->cb));
2552
2553 tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2554 sizeof(struct open_request),
2555 0, SLAB_HWCACHE_ALIGN,
2556 NULL, NULL);
2557 if(!tcp_openreq_cachep)
2558 panic("tcp_init: Cannot alloc open_request cache.");
2559
2560 tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2561 sizeof(struct tcp_bind_bucket),
2562 0, SLAB_HWCACHE_ALIGN,
2563 NULL, NULL);
2564 if(!tcp_bucket_cachep)
2565 panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2566
2567 tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2568 sizeof(struct tcp_tw_bucket),
2569 0, SLAB_HWCACHE_ALIGN,
2570 NULL, NULL);
2571 if(!tcp_timewait_cachep)
2572 panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2573
2574 /* Size and allocate the main established and bind bucket
2575 * hash tables.
2576 *
2577 * The methodology is similar to that of the buffer cache.
2578 */
2579 if (num_physpages >= (128 * 1024))
2580 goal = num_physpages >> (21 - PAGE_SHIFT);
2581 else
2582 goal = num_physpages >> (23 - PAGE_SHIFT);
2583
2584 for(order = 0; (1UL << order) < goal; order++)
2585 ;
2586 do {
2587 tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2588 sizeof(struct tcp_ehash_bucket);
2589 tcp_ehash_size >>= 1;
2590 while (tcp_ehash_size & (tcp_ehash_size-1))
2591 tcp_ehash_size--;
2592 tcp_ehash = (struct tcp_ehash_bucket *)
2593 __get_free_pages(GFP_ATOMIC, order);
2594 } while (tcp_ehash == NULL && --order > 0);
2595
2596 if (!tcp_ehash)
2597 panic("Failed to allocate TCP established hash table\n");
2598 for (i = 0; i < (tcp_ehash_size<<1); i++) {
2599 tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2600 tcp_ehash[i].chain = NULL;
2601 }
2602
2603 do {
2604 tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2605 sizeof(struct tcp_bind_hashbucket);
2606 if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2607 continue;
2608 tcp_bhash = (struct tcp_bind_hashbucket *)
2609 __get_free_pages(GFP_ATOMIC, order);
2610 } while (tcp_bhash == NULL && --order >= 0);
2611
2612 if (!tcp_bhash)
2613 panic("Failed to allocate TCP bind hash table\n");
2614 for (i = 0; i < tcp_bhash_size; i++) {
2615 tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2616 tcp_bhash[i].chain = NULL;
2617 }
2618
2619 /* Try to be a bit smarter and adjust defaults depending
2620 * on available memory.
2621 */
2622 if (order > 4) {
2623 sysctl_local_port_range[0] = 32768;
2624 sysctl_local_port_range[1] = 61000;
2625 sysctl_tcp_max_tw_buckets = 180000;
2626 sysctl_tcp_max_orphans = 4096<<(order-4);
2627 sysctl_max_syn_backlog = 1024;
2628 } else if (order < 3) {
2629 sysctl_local_port_range[0] = 1024*(3-order);
2630 sysctl_tcp_max_tw_buckets >>= (3-order);
2631 sysctl_tcp_max_orphans >>= (3-order);
2632 sysctl_max_syn_backlog = 128;
2633 }
2634 tcp_port_rover = sysctl_local_port_range[0] - 1;
2635
2636 sysctl_tcp_mem[0] = 768<<order;
2637 sysctl_tcp_mem[1] = 1024<<order;
2638 sysctl_tcp_mem[2] = 1536<<order;
2639
2640 if (order < 3) {
2641 sysctl_tcp_wmem[2] = 64*1024;
2642 sysctl_tcp_rmem[0] = PAGE_SIZE;
2643 sysctl_tcp_rmem[1] = 43689;
2644 sysctl_tcp_rmem[2] = 2*43689;
2645 }
2646
2647 printk(KERN_INFO "TCP: Hash tables configured (established %d bind %d)\n",
2648 tcp_ehash_size<<1, tcp_bhash_size);
2649
2650 (void) tcp_mib_init();
2651 tcpdiag_init();
2652 }
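/*
 * Worked example of the hash sizing above (editorial addition, assuming
 * 4 KB pages, i.e. PAGE_SHIFT == 12): with num_physpages == 128 * 1024
 * (512 MB of RAM), goal = 131072 >> (21 - 12) = 256 pages, so order
 * becomes 8 and the first attempt allocates (1 << 8) * PAGE_SIZE bytes
 * for tcp_ehash; the bucket count is then halved and rounded down to a
 * power of two, and __get_free_pages() is retried at lower orders if the
 * allocation fails.
 */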
2653