1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Implementation of the Transmission Control Protocol(TCP).
7  *
8  * Version:	$Id: tcp.c,v 1.215 2001/10/31 08:17:58 davem Exp $
9  *
10  * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Mark Evans, <evansmp@uhura.aston.ac.uk>
13  *		Corey Minyard <wf-rch!minyard@relay.EU.net>
14  *		Florian La Roche, <flla@stud.uni-sb.de>
15  *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
16  *		Linus Torvalds, <torvalds@cs.helsinki.fi>
17  *		Alan Cox, <gw4pts@gw4pts.ampr.org>
18  *		Matthew Dillon, <dillon@apollo.west.oic.com>
19  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
20  *		Jorge Cwik, <jorge@laser.satlink.net>
21  *
22  * Fixes:
23  *		Alan Cox	:	Numerous verify_area() calls
24  *		Alan Cox	:	Set the ACK bit on a reset
25  *		Alan Cox	:	Stopped it crashing if it closed while
26  *					sk->inuse=1 and was trying to connect
27  *					(tcp_err()).
28  *		Alan Cox	:	All icmp error handling was broken
29  *					pointers passed where wrong and the
30  *					socket was looked up backwards. Nobody
31  *					tested any icmp error code obviously.
32  *		Alan Cox	:	tcp_err() now handled properly. It
33  *					wakes people on errors. poll
34  *					behaves and the icmp error race
35  *					has gone by moving it into sock.c
36  *		Alan Cox	:	tcp_send_reset() fixed to work for
37  *					everything not just packets for
38  *					unknown sockets.
39  *		Alan Cox	:	tcp option processing.
40  *		Alan Cox	:	Reset tweaked (still not 100%) [Had
41  *					syn rule wrong]
42  *		Herp Rosmanith  :	More reset fixes
43  *		Alan Cox	:	No longer acks invalid rst frames.
44  *					Acking any kind of RST is right out.
45  *		Alan Cox	:	Sets an ignore me flag on an rst
46  *					receive otherwise odd bits of prattle
47  *					escape still
48  *		Alan Cox	:	Fixed another acking RST frame bug.
49  *					Should stop LAN workplace lockups.
50  *		Alan Cox	: 	Some tidyups using the new skb list
51  *					facilities
52  *		Alan Cox	:	sk->keepopen now seems to work
53  *		Alan Cox	:	Pulls options out correctly on accepts
54  *		Alan Cox	:	Fixed assorted sk->rqueue->next errors
55  *		Alan Cox	:	PSH doesn't end a TCP read. Switched a
56  *					bit to skb ops.
57  *		Alan Cox	:	Tidied tcp_data to avoid a potential
58  *					nasty.
59  *		Alan Cox	:	Added some better commenting, as the
60  *					tcp is hard to follow
61  *		Alan Cox	:	Removed incorrect check for 20 * psh
62  *	Michael O'Reilly	:	ack < copied bug fix.
63  *	Johannes Stille		:	Misc tcp fixes (not all in yet).
64  *		Alan Cox	:	FIN with no memory -> CRASH
65  *		Alan Cox	:	Added socket option proto entries.
66  *					Also added awareness of them to accept.
67  *		Alan Cox	:	Added TCP options (SOL_TCP)
68  *		Alan Cox	:	Switched wakeup calls to callbacks,
69  *					so the kernel can layer network
70  *					sockets.
71  *		Alan Cox	:	Use ip_tos/ip_ttl settings.
72  *		Alan Cox	:	Handle FIN (more) properly (we hope).
73  *		Alan Cox	:	RST frames sent on unsynchronised
74  *					state ack error.
75  *		Alan Cox	:	Put in missing check for SYN bit.
76  *		Alan Cox	:	Added tcp_select_window() aka NET2E
77  *					window non shrink trick.
78  *		Alan Cox	:	Added a couple of small NET2E timer
79  *					fixes
80  *		Charles Hedrick :	TCP fixes
81  *		Toomas Tamm	:	TCP window fixes
82  *		Alan Cox	:	Small URG fix to rlogin ^C ack fight
83  *		Charles Hedrick	:	Rewrote most of it to actually work
84  *		Linus		:	Rewrote tcp_read() and URG handling
85  *					completely
86  *		Gerhard Koerting:	Fixed some missing timer handling
87  *		Matthew Dillon  :	Reworked TCP machine states as per RFC
88  *		Gerhard Koerting:	PC/TCP workarounds
89  *		Adam Caldwell	:	Assorted timer/timing errors
90  *		Matthew Dillon	:	Fixed another RST bug
91  *		Alan Cox	:	Move to kernel side addressing changes.
92  *		Alan Cox	:	Beginning work on TCP fastpathing
93  *					(not yet usable)
94  *		Arnt Gulbrandsen:	Turbocharged tcp_check() routine.
95  *		Alan Cox	:	TCP fast path debugging
96  *		Alan Cox	:	Window clamping
97  *		Michael Riepe	:	Bug in tcp_check()
98  *		Matt Dillon	:	More TCP improvements and RST bug fixes
99  *		Matt Dillon	:	Yet more small nasties remove from the
100  *					TCP code (Be very nice to this man if
101  *					tcp finally works 100%) 8)
102  *		Alan Cox	:	BSD accept semantics.
103  *		Alan Cox	:	Reset on closedown bug.
104  *	Peter De Schrijver	:	ENOTCONN check missing in tcp_sendto().
105  *		Michael Pall	:	Handle poll() after URG properly in
106  *					all cases.
107  *		Michael Pall	:	Undo the last fix in tcp_read_urg()
108  *					(multi URG PUSH broke rlogin).
109  *		Michael Pall	:	Fix the multi URG PUSH problem in
110  *					tcp_readable(), poll() after URG
111  *					works now.
112  *		Michael Pall	:	recv(...,MSG_OOB) never blocks in the
113  *					BSD api.
114  *		Alan Cox	:	Changed the semantics of sk->socket to
115  *					fix a race and a signal problem with
116  *					accept() and async I/O.
117  *		Alan Cox	:	Relaxed the rules on tcp_sendto().
118  *		Yury Shevchuk	:	Really fixed accept() blocking problem.
119  *		Craig I. Hagan  :	Allow for BSD compatible TIME_WAIT for
120  *					clients/servers which listen in on
121  *					fixed ports.
122  *		Alan Cox	:	Cleaned the above up and shrank it to
123  *					a sensible code size.
124  *		Alan Cox	:	Self connect lockup fix.
125  *		Alan Cox	:	No connect to multicast.
126  *		Ross Biro	:	Close unaccepted children on master
127  *					socket close.
128  *		Alan Cox	:	Reset tracing code.
129  *		Alan Cox	:	Spurious resets on shutdown.
130  *		Alan Cox	:	Giant 15 minute/60 second timer error
131  *		Alan Cox	:	Small whoops in polling before an
132  *					accept.
133  *		Alan Cox	:	Kept the state trace facility since
134  *					it's handy for debugging.
135  *		Alan Cox	:	More reset handler fixes.
136  *		Alan Cox	:	Started rewriting the code based on
137  *					the RFC's for other useful protocol
138  *					references see: Comer, KA9Q NOS, and
139  *					for a reference on the difference
140  *					between specifications and how BSD
141  *					works see the 4.4lite source.
142  *		A.N.Kuznetsov	:	Don't time wait on completion of tidy
143  *					close.
144  *		Linus Torvalds	:	Fin/Shutdown & copied_seq changes.
145  *		Linus Torvalds	:	Fixed BSD port reuse to work first syn
146  *		Alan Cox	:	Reimplemented timers as per the RFC
147  *					and using multiple timers for sanity.
148  *		Alan Cox	:	Small bug fixes, and a lot of new
149  *					comments.
150  *		Alan Cox	:	Fixed dual reader crash by locking
151  *					the buffers (much like datagram.c)
152  *		Alan Cox	:	Fixed stuck sockets in probe. A probe
153  *					now gets fed up of retrying without
154  *					(even a no space) answer.
155  *		Alan Cox	:	Extracted closing code better
156  *		Alan Cox	:	Fixed the closing state machine to
157  *					resemble the RFC.
158  *		Alan Cox	:	More 'per spec' fixes.
159  *		Jorge Cwik	:	Even faster checksumming.
160  *		Alan Cox	:	tcp_data() doesn't ack illegal PSH
161  *					only frames. At least one pc tcp stack
162  *					generates them.
163  *		Alan Cox	:	Cache last socket.
164  *		Alan Cox	:	Per route irtt.
165  *		Matt Day	:	poll()->select() match BSD precisely on error
166  *		Alan Cox	:	New buffers
167  *		Marc Tamsky	:	Various sk->prot->retransmits and
168  *					sk->retransmits misupdating fixed.
169  *					Fixed tcp_write_timeout: stuck close,
170  *					and TCP syn retries gets used now.
171  *		Mark Yarvis	:	In tcp_read_wakeup(), don't send an
172  *					ack if state is TCP_CLOSED.
173  *		Alan Cox	:	Look up device on a retransmit - routes may
174  *					change. Doesn't yet cope with MSS shrink right
175  *					but its a start!
176  *		Marc Tamsky	:	Closing in closing fixes.
177  *		Mike Shaver	:	RFC1122 verifications.
178  *		Alan Cox	:	rcv_saddr errors.
179  *		Alan Cox	:	Block double connect().
180  *		Alan Cox	:	Small hooks for enSKIP.
181  *		Alexey Kuznetsov:	Path MTU discovery.
182  *		Alan Cox	:	Support soft errors.
183  *		Alan Cox	:	Fix MTU discovery pathological case
184  *					when the remote claims no mtu!
185  *		Marc Tamsky	:	TCP_CLOSE fix.
186  *		Colin (G3TNE)	:	Send a reset on syn ack replies in
187  *					window but wrong (fixes NT lpd problems)
188  *		Pedro Roque	:	Better TCP window handling, delayed ack.
189  *		Joerg Reuter	:	No modification of locked buffers in
190  *					tcp_do_retransmit()
191  *		Eric Schenk	:	Changed receiver side silly window
192  *					avoidance algorithm to BSD style
193  *					algorithm. This doubles throughput
194  *					against machines running Solaris,
195  *					and seems to result in general
196  *					improvement.
197  *	Stefan Magdalinski	:	adjusted tcp_readable() to fix FIONREAD
198  *	Willy Konynenberg	:	Transparent proxying support.
199  *	Mike McLagan		:	Routing by source
200  *		Keith Owens	:	Do proper merging with partial SKB's in
201  *					tcp_do_sendmsg to avoid burstiness.
202  *		Eric Schenk	:	Fix fast close down bug with
203  *					shutdown() followed by close().
204  *		Andi Kleen 	:	Make poll agree with SIGIO
205  *	Salvatore Sanfilippo	:	Support SO_LINGER with linger == 1 and
206  *					lingertime == 0 (RFC 793 ABORT Call)
207  *
208  *		This program is free software; you can redistribute it and/or
209  *		modify it under the terms of the GNU General Public License
210  *		as published by the Free Software Foundation; either version
211  *		2 of the License, or(at your option) any later version.
212  *
213  * Description of States:
214  *
215  *	TCP_SYN_SENT		sent a connection request, waiting for ack
216  *
217  *	TCP_SYN_RECV		received a connection request, sent ack,
218  *				waiting for final ack in three-way handshake.
219  *
220  *	TCP_ESTABLISHED		connection established
221  *
222  *	TCP_FIN_WAIT1		our side has shutdown, waiting to complete
223  *				transmission of remaining buffered data
224  *
225  *	TCP_FIN_WAIT2		all buffered data sent, waiting for remote
226  *				to shutdown
227  *
228  *	TCP_CLOSING		both sides have shutdown but we still have
229  *				data we have to finish sending
230  *
231  *	TCP_TIME_WAIT		timeout to catch resent junk before entering
232  *				closed, can only be entered from FIN_WAIT2
233  *				or CLOSING.  Required because the other end
234  *				may not have gotten our last ACK causing it
235  *				to retransmit the data packet (which we ignore)
236  *
237  *	TCP_CLOSE_WAIT		remote side has shutdown and is waiting for
238  *				us to finish writing our data and to shutdown
239  *				(we have to close() to move on to LAST_ACK)
240  *
241  *	TCP_LAST_ACK		out side has shutdown after remote has
242  *				shutdown.  There may still be data in our
243  *				buffer that we have to finish sending
244  *
245  *	TCP_CLOSE		socket is finished
246  */
247 
248 #include <linux/config.h>
249 #include <linux/types.h>
250 #include <linux/fcntl.h>
251 #include <linux/poll.h>
252 #include <linux/init.h>
253 #include <linux/smp_lock.h>
254 #include <linux/fs.h>
255 #include <linux/random.h>
256 
257 #include <net/icmp.h>
258 #include <net/tcp.h>
259 
260 #include <asm/uaccess.h>
261 #include <asm/ioctls.h>
262 
263 int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
264 
265 struct tcp_mib	tcp_statistics[NR_CPUS*2];
266 
267 kmem_cache_t *tcp_openreq_cachep;
268 kmem_cache_t *tcp_bucket_cachep;
269 kmem_cache_t *tcp_timewait_cachep;
270 
271 atomic_t tcp_orphan_count = ATOMIC_INIT(0);
272 
273 int sysctl_tcp_default_win_scale = 0;
274 
275 int sysctl_tcp_mem[3];
276 int sysctl_tcp_wmem[3] = { 4*1024, 16*1024, 128*1024 };
277 int sysctl_tcp_rmem[3] = { 4*1024, 87380, 87380*2 };
278 
279 atomic_t tcp_memory_allocated;	/* Current allocated memory. */
280 atomic_t tcp_sockets_allocated;	/* Current number of TCP sockets. */
281 
282 /* Pressure flag: try to collapse.
283  * Technical note: it is used by multiple contexts non atomically.
284  * All the tcp_mem_schedule() is of this nature: accounting
285  * is strict, actions are advisory and have some latency. */
286 int tcp_memory_pressure;
287 
288 #define TCP_PAGES(amt) (((amt)+TCP_MEM_QUANTUM-1)/TCP_MEM_QUANTUM)
289 
tcp_mem_schedule(struct sock * sk,int size,int kind)290 int tcp_mem_schedule(struct sock *sk, int size, int kind)
291 {
292 	int amt = TCP_PAGES(size);
293 
294 	sk->forward_alloc += amt*TCP_MEM_QUANTUM;
295 	atomic_add(amt, &tcp_memory_allocated);
296 
297 	/* Under limit. */
298 	if (atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) {
299 		if (tcp_memory_pressure)
300 			tcp_memory_pressure = 0;
301 		return 1;
302 	}
303 
304 	/* Over hard limit. */
305 	if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) {
306 		tcp_enter_memory_pressure();
307 		goto suppress_allocation;
308 	}
309 
310 	/* Under pressure. */
311 	if (atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[1])
312 		tcp_enter_memory_pressure();
313 
314 	if (kind) {
315 		if (atomic_read(&sk->rmem_alloc) < sysctl_tcp_rmem[0])
316 			return 1;
317 	} else {
318 		if (sk->wmem_queued < sysctl_tcp_wmem[0])
319 			return 1;
320 	}
321 
322 	if (!tcp_memory_pressure ||
323 	    sysctl_tcp_mem[2] > atomic_read(&tcp_sockets_allocated)
324 	    * TCP_PAGES(sk->wmem_queued+atomic_read(&sk->rmem_alloc)+
325 			sk->forward_alloc))
326 		return 1;
327 
328 suppress_allocation:
329 
330 	if (kind == 0) {
331 		tcp_moderate_sndbuf(sk);
332 
333 		/* Fail only if socket is _under_ its sndbuf.
334 		 * In this case we cannot block, so that we have to fail.
335 		 */
336 		if (sk->wmem_queued+size >= sk->sndbuf)
337 			return 1;
338 	}
339 
340 	/* Alas. Undo changes. */
341 	sk->forward_alloc -= amt*TCP_MEM_QUANTUM;
342 	atomic_sub(amt, &tcp_memory_allocated);
343 	return 0;
344 }
345 
__tcp_mem_reclaim(struct sock * sk)346 void __tcp_mem_reclaim(struct sock *sk)
347 {
348 	if (sk->forward_alloc >= TCP_MEM_QUANTUM) {
349 		atomic_sub(sk->forward_alloc/TCP_MEM_QUANTUM, &tcp_memory_allocated);
350 		sk->forward_alloc &= (TCP_MEM_QUANTUM-1);
351 		if (tcp_memory_pressure &&
352 		    atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0])
353 			tcp_memory_pressure = 0;
354 	}
355 }
356 
tcp_rfree(struct sk_buff * skb)357 void tcp_rfree(struct sk_buff *skb)
358 {
359 	struct sock *sk = skb->sk;
360 
361 	atomic_sub(skb->truesize, &sk->rmem_alloc);
362 	sk->forward_alloc += skb->truesize;
363 }
364 
365 /*
366  * LISTEN is a special case for poll..
367  */
tcp_listen_poll(struct sock * sk,poll_table * wait)368 static __inline__ unsigned int tcp_listen_poll(struct sock *sk, poll_table *wait)
369 {
370 	return sk->tp_pinfo.af_tcp.accept_queue ? (POLLIN | POLLRDNORM) : 0;
371 }
372 
373 /*
374  *	Wait for a TCP event.
375  *
376  *	Note that we don't need to lock the socket, as the upper poll layers
377  *	take care of normal races (between the test and the event) and we don't
378  *	go look at any of the socket buffers directly.
379  */
tcp_poll(struct file * file,struct socket * sock,poll_table * wait)380 unsigned int tcp_poll(struct file * file, struct socket *sock, poll_table *wait)
381 {
382 	unsigned int mask;
383 	struct sock *sk = sock->sk;
384 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
385 
386 	poll_wait(file, sk->sleep, wait);
387 	if (sk->state == TCP_LISTEN)
388 		return tcp_listen_poll(sk, wait);
389 
390 	/* Socket is not locked. We are protected from async events
391 	   by poll logic and correct handling of state changes
392 	   made by another threads is impossible in any case.
393 	 */
394 
395 	mask = 0;
396 	if (sk->err)
397 		mask = POLLERR;
398 
399 	/*
400 	 * POLLHUP is certainly not done right. But poll() doesn't
401 	 * have a notion of HUP in just one direction, and for a
402 	 * socket the read side is more interesting.
403 	 *
404 	 * Some poll() documentation says that POLLHUP is incompatible
405 	 * with the POLLOUT/POLLWR flags, so somebody should check this
406 	 * all. But careful, it tends to be safer to return too many
407 	 * bits than too few, and you can easily break real applications
408 	 * if you don't tell them that something has hung up!
409 	 *
410 	 * Check-me.
411 	 *
412 	 * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and
413 	 * our fs/select.c). It means that after we received EOF,
414 	 * poll always returns immediately, making impossible poll() on write()
415 	 * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP
416 	 * if and only if shutdown has been made in both directions.
417 	 * Actually, it is interesting to look how Solaris and DUX
418 	 * solve this dilemma. I would prefer, if PULLHUP were maskable,
419 	 * then we could set it on SND_SHUTDOWN. BTW examples given
420 	 * in Stevens' books assume exactly this behaviour, it explains
421 	 * why PULLHUP is incompatible with POLLOUT.	--ANK
422 	 *
423 	 * NOTE. Check for TCP_CLOSE is added. The goal is to prevent
424 	 * blocking on fresh not-connected or disconnected socket. --ANK
425 	 */
426 	if (sk->shutdown == SHUTDOWN_MASK || sk->state == TCP_CLOSE)
427 		mask |= POLLHUP;
428 	if (sk->shutdown & RCV_SHUTDOWN)
429 		mask |= POLLIN | POLLRDNORM;
430 
431 	/* Connected? */
432 	if ((1 << sk->state) & ~(TCPF_SYN_SENT|TCPF_SYN_RECV)) {
433 		/* Potential race condition. If read of tp below will
434 		 * escape above sk->state, we can be illegally awaken
435 		 * in SYN_* states. */
436 		if ((tp->rcv_nxt != tp->copied_seq) &&
437 		    (tp->urg_seq != tp->copied_seq ||
438 		     tp->rcv_nxt != tp->copied_seq+1 ||
439 		     sk->urginline || !tp->urg_data))
440 			mask |= POLLIN | POLLRDNORM;
441 
442 		if (!(sk->shutdown & SEND_SHUTDOWN)) {
443 			if (tcp_wspace(sk) >= tcp_min_write_space(sk)) {
444 				mask |= POLLOUT | POLLWRNORM;
445 			} else {  /* send SIGIO later */
446 				set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
447 				set_bit(SOCK_NOSPACE, &sk->socket->flags);
448 
449 				/* Race breaker. If space is freed after
450 				 * wspace test but before the flags are set,
451 				 * IO signal will be lost.
452 				 */
453 				if (tcp_wspace(sk) >= tcp_min_write_space(sk))
454 					mask |= POLLOUT | POLLWRNORM;
455 			}
456 		}
457 
458 		if (tp->urg_data & TCP_URG_VALID)
459 			mask |= POLLPRI;
460 	}
461 	return mask;
462 }
463 
464 /*
465  *	TCP socket write_space callback.
466  */
tcp_write_space(struct sock * sk)467 void tcp_write_space(struct sock *sk)
468 {
469 	struct socket *sock = sk->socket;
470 
471 	if (tcp_wspace(sk) >= tcp_min_write_space(sk) && sock) {
472 		clear_bit(SOCK_NOSPACE, &sock->flags);
473 
474 		if (sk->sleep && waitqueue_active(sk->sleep))
475 			wake_up_interruptible(sk->sleep);
476 
477 		if (sock->fasync_list && !(sk->shutdown&SEND_SHUTDOWN))
478 			sock_wake_async(sock, 2, POLL_OUT);
479 	}
480 }
481 
tcp_ioctl(struct sock * sk,int cmd,unsigned long arg)482 int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
483 {
484 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
485 	int answ;
486 
487 	switch(cmd) {
488 	case SIOCINQ:
489 		if (sk->state == TCP_LISTEN)
490 			return(-EINVAL);
491 
492 		lock_sock(sk);
493 		if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
494 			answ = 0;
495 		else if (sk->urginline || !tp->urg_data ||
496 			 before(tp->urg_seq,tp->copied_seq) ||
497 			 !before(tp->urg_seq,tp->rcv_nxt)) {
498 			answ = tp->rcv_nxt - tp->copied_seq;
499 
500 			/* Subtract 1, if FIN is in queue. */
501 			if (answ && !skb_queue_empty(&sk->receive_queue))
502 				answ -= ((struct sk_buff*)sk->receive_queue.prev)->h.th->fin;
503 		} else
504 			answ = tp->urg_seq - tp->copied_seq;
505 		release_sock(sk);
506 		break;
507 	case SIOCATMARK:
508 		{
509 			answ = tp->urg_data && tp->urg_seq == tp->copied_seq;
510 			break;
511 		}
512 	case SIOCOUTQ:
513 		if (sk->state == TCP_LISTEN)
514 			return(-EINVAL);
515 
516 		if ((1<<sk->state) & (TCPF_SYN_SENT|TCPF_SYN_RECV))
517 			answ = 0;
518 		else
519 			answ = tp->write_seq - tp->snd_una;
520 		break;
521 	default:
522 		return(-ENOIOCTLCMD);
523 	};
524 
525 	return put_user(answ, (int *)arg);
526 }
527 
528 
tcp_listen_start(struct sock * sk)529 int tcp_listen_start(struct sock *sk)
530 {
531 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
532 	struct tcp_listen_opt *lopt;
533 
534 	sk->max_ack_backlog = 0;
535 	sk->ack_backlog = 0;
536 	tp->accept_queue = tp->accept_queue_tail = NULL;
537 	tp->syn_wait_lock = RW_LOCK_UNLOCKED;
538 	tcp_delack_init(tp);
539 
540 	lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
541 	if (!lopt)
542 		return -ENOMEM;
543 
544 	memset(lopt, 0, sizeof(struct tcp_listen_opt));
545 	for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
546 		if ((1<<lopt->max_qlen_log) >= sysctl_max_syn_backlog)
547 			break;
548 	get_random_bytes(&lopt->hash_rnd, 4);
549 
550 	write_lock_bh(&tp->syn_wait_lock);
551 	tp->listen_opt = lopt;
552 	write_unlock_bh(&tp->syn_wait_lock);
553 
554 	/* There is race window here: we announce ourselves listening,
555 	 * but this transition is still not validated by get_port().
556 	 * It is OK, because this socket enters to hash table only
557 	 * after validation is complete.
558 	 */
559 	sk->state = TCP_LISTEN;
560 	if (sk->prot->get_port(sk, sk->num) == 0) {
561 		sk->sport = htons(sk->num);
562 
563 		sk_dst_reset(sk);
564 		sk->prot->hash(sk);
565 
566 		return 0;
567 	}
568 
569 	sk->state = TCP_CLOSE;
570 	write_lock_bh(&tp->syn_wait_lock);
571 	tp->listen_opt = NULL;
572 	write_unlock_bh(&tp->syn_wait_lock);
573 	kfree(lopt);
574 	return -EADDRINUSE;
575 }
576 
577 /*
578  *	This routine closes sockets which have been at least partially
579  *	opened, but not yet accepted.
580  */
581 
tcp_listen_stop(struct sock * sk)582 static void tcp_listen_stop (struct sock *sk)
583 {
584 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
585 	struct tcp_listen_opt *lopt = tp->listen_opt;
586 	struct open_request *acc_req = tp->accept_queue;
587 	struct open_request *req;
588 	int i;
589 
590 	tcp_delete_keepalive_timer(sk);
591 
592 	/* make all the listen_opt local to us */
593 	write_lock_bh(&tp->syn_wait_lock);
594 	tp->listen_opt =NULL;
595 	write_unlock_bh(&tp->syn_wait_lock);
596 	tp->accept_queue = tp->accept_queue_tail = NULL;
597 
598 	if (lopt->qlen) {
599 		for (i=0; i<TCP_SYNQ_HSIZE; i++) {
600 			while ((req = lopt->syn_table[i]) != NULL) {
601 				lopt->syn_table[i] = req->dl_next;
602 				lopt->qlen--;
603 				tcp_openreq_free(req);
604 
605 		/* Following specs, it would be better either to send FIN
606 		 * (and enter FIN-WAIT-1, it is normal close)
607 		 * or to send active reset (abort).
608 		 * Certainly, it is pretty dangerous while synflood, but it is
609 		 * bad justification for our negligence 8)
610 		 * To be honest, we are not able to make either
611 		 * of the variants now.			--ANK
612 		 */
613 			}
614 		}
615 	}
616 	BUG_TRAP(lopt->qlen == 0);
617 
618 	kfree(lopt);
619 
620 	while ((req=acc_req) != NULL) {
621 		struct sock *child = req->sk;
622 
623 		acc_req = req->dl_next;
624 
625 		local_bh_disable();
626 		bh_lock_sock(child);
627 		BUG_TRAP(child->lock.users==0);
628 		sock_hold(child);
629 
630 		tcp_disconnect(child, O_NONBLOCK);
631 
632 		sock_orphan(child);
633 
634 		atomic_inc(&tcp_orphan_count);
635 
636 		tcp_destroy_sock(child);
637 
638 		bh_unlock_sock(child);
639 		local_bh_enable();
640 		sock_put(child);
641 
642 		tcp_acceptq_removed(sk);
643 		tcp_openreq_fastfree(req);
644 	}
645 	BUG_TRAP(sk->ack_backlog == 0);
646 }
647 
648 /*
649  *	Wait for a socket to get into the connected state
650  *
651  *	Note: Must be called with the socket locked.
652  */
wait_for_tcp_connect(struct sock * sk,int flags,long * timeo_p)653 static int wait_for_tcp_connect(struct sock * sk, int flags, long *timeo_p)
654 {
655 	struct task_struct *tsk = current;
656 	DECLARE_WAITQUEUE(wait, tsk);
657 
658 	while((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) {
659 		if(sk->err)
660 			return sock_error(sk);
661 		if((1 << sk->state) &
662 		   ~(TCPF_SYN_SENT | TCPF_SYN_RECV))
663 			return -EPIPE;
664 		if(!*timeo_p)
665 			return -EAGAIN;
666 		if(signal_pending(tsk))
667 			return sock_intr_errno(*timeo_p);
668 
669 		__set_task_state(tsk, TASK_INTERRUPTIBLE);
670 		add_wait_queue(sk->sleep, &wait);
671 		sk->tp_pinfo.af_tcp.write_pending++;
672 
673 		release_sock(sk);
674 		*timeo_p = schedule_timeout(*timeo_p);
675 		lock_sock(sk);
676 
677 		__set_task_state(tsk, TASK_RUNNING);
678 		remove_wait_queue(sk->sleep, &wait);
679 		sk->tp_pinfo.af_tcp.write_pending--;
680 	}
681 	return 0;
682 }
683 
tcp_memory_free(struct sock * sk)684 static inline int tcp_memory_free(struct sock *sk)
685 {
686 	return sk->wmem_queued < sk->sndbuf;
687 }
688 
689 /*
690  *	Wait for more memory for a socket
691  */
wait_for_tcp_memory(struct sock * sk,long * timeo)692 static int wait_for_tcp_memory(struct sock * sk, long *timeo)
693 {
694 	int err = 0;
695 	long vm_wait = 0;
696 	long current_timeo = *timeo;
697 	DECLARE_WAITQUEUE(wait, current);
698 
699 	if (tcp_memory_free(sk))
700 		current_timeo = vm_wait = (net_random()%(HZ/5))+2;
701 
702 	add_wait_queue(sk->sleep, &wait);
703 	for (;;) {
704 		set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
705 
706 		set_current_state(TASK_INTERRUPTIBLE);
707 
708 		if (sk->err || (sk->shutdown & SEND_SHUTDOWN))
709 			goto do_error;
710 		if (!*timeo)
711 			goto do_nonblock;
712 		if (signal_pending(current))
713 			goto do_interrupted;
714 		clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
715 		if (tcp_memory_free(sk) && !vm_wait)
716 			break;
717 
718 		set_bit(SOCK_NOSPACE, &sk->socket->flags);
719 		sk->tp_pinfo.af_tcp.write_pending++;
720 		release_sock(sk);
721 		if (!tcp_memory_free(sk) || vm_wait)
722 			current_timeo = schedule_timeout(current_timeo);
723 		lock_sock(sk);
724 		sk->tp_pinfo.af_tcp.write_pending--;
725 
726 		if (vm_wait) {
727 			vm_wait -= current_timeo;
728 			current_timeo = *timeo;
729 			if (current_timeo != MAX_SCHEDULE_TIMEOUT &&
730 			    (current_timeo -= vm_wait) < 0)
731 				current_timeo = 0;
732 			vm_wait = 0;
733 		}
734 		*timeo = current_timeo;
735 	}
736 out:
737 	current->state = TASK_RUNNING;
738 	remove_wait_queue(sk->sleep, &wait);
739 	return err;
740 
741 do_error:
742 	err = -EPIPE;
743 	goto out;
744 do_nonblock:
745 	err = -EAGAIN;
746 	goto out;
747 do_interrupted:
748 	err = sock_intr_errno(*timeo);
749 	goto out;
750 }
751 
752 ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags);
753 
754 static inline int
can_coalesce(struct sk_buff * skb,int i,struct page * page,int off)755 can_coalesce(struct sk_buff *skb, int i, struct page *page, int off)
756 {
757 	if (i) {
758 		skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
759 		return page == frag->page &&
760 			off == frag->page_offset+frag->size;
761 	}
762 	return 0;
763 }
764 
765 static inline void
fill_page_desc(struct sk_buff * skb,int i,struct page * page,int off,int size)766 fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size)
767 {
768 	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
769 	frag->page = page;
770 	frag->page_offset = off;
771 	frag->size = size;
772 	skb_shinfo(skb)->nr_frags = i+1;
773 }
774 
tcp_mark_push(struct tcp_opt * tp,struct sk_buff * skb)775 static inline void tcp_mark_push(struct tcp_opt *tp, struct sk_buff *skb)
776 {
777 	TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
778 	tp->pushed_seq = tp->write_seq;
779 }
780 
forced_push(struct tcp_opt * tp)781 static inline int forced_push(struct tcp_opt *tp)
782 {
783 	return after(tp->write_seq, tp->pushed_seq + (tp->max_window>>1));
784 }
785 
786 static inline void
skb_entail(struct sock * sk,struct tcp_opt * tp,struct sk_buff * skb)787 skb_entail(struct sock *sk, struct tcp_opt *tp, struct sk_buff *skb)
788 {
789 	skb->csum = 0;
790 	TCP_SKB_CB(skb)->seq = tp->write_seq;
791 	TCP_SKB_CB(skb)->end_seq = tp->write_seq;
792 	TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
793 	TCP_SKB_CB(skb)->sacked = 0;
794 	__skb_queue_tail(&sk->write_queue, skb);
795 	tcp_charge_skb(sk, skb);
796 	if (tp->send_head == NULL)
797 		tp->send_head = skb;
798 }
799 
800 static inline void
tcp_mark_urg(struct tcp_opt * tp,int flags,struct sk_buff * skb)801 tcp_mark_urg(struct tcp_opt *tp, int flags, struct sk_buff *skb)
802 {
803 	if (flags & MSG_OOB) {
804 		tp->urg_mode = 1;
805 		tp->snd_up = tp->write_seq;
806 		TCP_SKB_CB(skb)->sacked |= TCPCB_URG;
807 	}
808 }
809 
810 static inline void
tcp_push(struct sock * sk,struct tcp_opt * tp,int flags,int mss_now,int nonagle)811 tcp_push(struct sock *sk, struct tcp_opt *tp, int flags, int mss_now, int nonagle)
812 {
813 	if (tp->send_head) {
814 		struct sk_buff *skb = sk->write_queue.prev;
815 		if (!(flags&MSG_MORE) || forced_push(tp))
816 			tcp_mark_push(tp, skb);
817 		tcp_mark_urg(tp, flags, skb);
818 		__tcp_push_pending_frames(sk, tp, mss_now, (flags&MSG_MORE) ? 2 : nonagle);
819 	}
820 }
821 
tcp_error(struct sock * sk,int flags,int err)822 static int tcp_error(struct sock *sk, int flags, int err)
823 {
824 	if (err == -EPIPE)
825 		err = sock_error(sk) ? : -EPIPE;
826 	if (err == -EPIPE && !(flags&MSG_NOSIGNAL))
827 		send_sig(SIGPIPE, current, 0);
828 	return err;
829 }
830 
do_tcp_sendpages(struct sock * sk,struct page ** pages,int poffset,size_t psize,int flags)831 ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags)
832 {
833 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
834 	int mss_now;
835 	int err;
836 	ssize_t copied;
837 	long timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
838 
839 	/* Wait for a connection to finish. */
840 	if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
841 		if((err = wait_for_tcp_connect(sk, 0, &timeo)) != 0)
842 			goto out_err;
843 
844 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
845 
846 	mss_now = tcp_current_mss(sk);
847 	copied = 0;
848 
849 	err = -EPIPE;
850 	if (sk->err || (sk->shutdown & SEND_SHUTDOWN))
851 		goto do_error;
852 
853 	while (psize > 0) {
854 		struct sk_buff *skb = sk->write_queue.prev;
855 		int offset, size, copy, i;
856 		struct page *page;
857 
858 		page = pages[poffset/PAGE_SIZE];
859 		offset = poffset % PAGE_SIZE;
860 		size = min_t(size_t, psize, PAGE_SIZE-offset);
861 
862 		if (tp->send_head==NULL || (copy = mss_now - skb->len) <= 0) {
863 new_segment:
864 			if (!tcp_memory_free(sk))
865 				goto wait_for_sndbuf;
866 
867 			skb = tcp_alloc_pskb(sk, 0, tp->mss_cache, sk->allocation);
868 			if (skb == NULL)
869 				goto wait_for_memory;
870 
871 			skb_entail(sk, tp, skb);
872 			copy = mss_now;
873 		}
874 
875 		if (copy > size)
876 			copy = size;
877 
878 		i = skb_shinfo(skb)->nr_frags;
879 		if (can_coalesce(skb, i, page, offset)) {
880 			skb_shinfo(skb)->frags[i-1].size += copy;
881 		} else if (i < MAX_SKB_FRAGS) {
882 			get_page(page);
883 			fill_page_desc(skb, i, page, offset, copy);
884 		} else {
885 			tcp_mark_push(tp, skb);
886 			goto new_segment;
887 		}
888 
889 		skb->len += copy;
890 		skb->data_len += copy;
891 		skb->ip_summed = CHECKSUM_HW;
892 		tp->write_seq += copy;
893 		TCP_SKB_CB(skb)->end_seq += copy;
894 
895 		if (!copied)
896 			TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
897 
898 		copied += copy;
899 		poffset += copy;
900 		if (!(psize -= copy))
901 			goto out;
902 
903 		if (skb->len != mss_now || (flags&MSG_OOB))
904 			continue;
905 
906 		if (forced_push(tp)) {
907 			tcp_mark_push(tp, skb);
908 			__tcp_push_pending_frames(sk, tp, mss_now, 1);
909 		} else if (skb == tp->send_head)
910 			tcp_push_one(sk, mss_now);
911 		continue;
912 
913 wait_for_sndbuf:
914 		set_bit(SOCK_NOSPACE, &sk->socket->flags);
915 wait_for_memory:
916 		if (copied)
917 			tcp_push(sk, tp, flags&~MSG_MORE, mss_now, 1);
918 
919 		if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
920 			goto do_error;
921 
922 		mss_now = tcp_current_mss(sk);
923 	}
924 
925 out:
926 	if (copied)
927 		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
928 	return copied;
929 
930 do_error:
931 	if (copied)
932 		goto out;
933 out_err:
934 	return tcp_error(sk, flags, err);
935 }
936 
tcp_sendpage(struct socket * sock,struct page * page,int offset,size_t size,int flags)937 ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
938 {
939 	ssize_t res;
940 	struct sock *sk = sock->sk;
941 
942 #define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM)
943 
944 	if (!(sk->route_caps & NETIF_F_SG) ||
945 	    !(sk->route_caps & TCP_ZC_CSUM_FLAGS))
946 		return sock_no_sendpage(sock, page, offset, size, flags);
947 
948 #undef TCP_ZC_CSUM_FLAGS
949 
950 	lock_sock(sk);
951 	TCP_CHECK_TIMER(sk);
952 	res = do_tcp_sendpages(sk, &page, offset, size, flags);
953 	TCP_CHECK_TIMER(sk);
954 	release_sock(sk);
955 	return res;
956 }
957 
958 #define TCP_PAGE(sk)	(sk->tp_pinfo.af_tcp.sndmsg_page)
959 #define TCP_OFF(sk)	(sk->tp_pinfo.af_tcp.sndmsg_off)
960 
961 static inline int
tcp_copy_to_page(struct sock * sk,char * from,struct sk_buff * skb,struct page * page,int off,int copy)962 tcp_copy_to_page(struct sock *sk, char *from, struct sk_buff *skb,
963 		 struct page *page, int off, int copy)
964 {
965 	int err = 0;
966 	unsigned int csum;
967 
968 	csum = csum_and_copy_from_user(from, page_address(page)+off,
969 				       copy, 0, &err);
970 	if (!err) {
971 		if (skb->ip_summed == CHECKSUM_NONE)
972 			skb->csum = csum_block_add(skb->csum, csum, skb->len);
973 		skb->len += copy;
974 		skb->data_len += copy;
975 		skb->truesize += copy;
976 		sk->wmem_queued += copy;
977 		sk->forward_alloc -= copy;
978 	}
979 	return err;
980 }
981 
982 static inline int
skb_add_data(struct sk_buff * skb,char * from,int copy)983 skb_add_data(struct sk_buff *skb, char *from, int copy)
984 {
985 	int err = 0;
986 	unsigned int csum;
987 	int off = skb->len;
988 
989 	csum = csum_and_copy_from_user(from, skb_put(skb, copy),
990 				       copy, 0, &err);
991 	if (!err) {
992 		skb->csum = csum_block_add(skb->csum, csum, off);
993 		return 0;
994 	}
995 
996 	__skb_trim(skb, off);
997 	return -EFAULT;
998 }
999 
select_size(struct sock * sk,struct tcp_opt * tp)1000 static inline int select_size(struct sock *sk, struct tcp_opt *tp)
1001 {
1002 	int tmp = tp->mss_cache;
1003 
1004 	if (sk->route_caps&NETIF_F_SG) {
1005 		int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER);
1006 
1007 		if (tmp >= pgbreak && tmp <= pgbreak + (MAX_SKB_FRAGS-1)*PAGE_SIZE)
1008 			tmp = pgbreak;
1009 	}
1010 	return tmp;
1011 }
1012 
tcp_sendmsg(struct sock * sk,struct msghdr * msg,int size)1013 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, int size)
1014 {
1015 	struct iovec *iov;
1016 	struct tcp_opt *tp;
1017 	struct sk_buff *skb;
1018 	int iovlen, flags;
1019 	int mss_now;
1020 	int err, copied;
1021 	long timeo;
1022 
1023 	tp = &(sk->tp_pinfo.af_tcp);
1024 
1025 	lock_sock(sk);
1026 	TCP_CHECK_TIMER(sk);
1027 
1028 	flags = msg->msg_flags;
1029 	timeo = sock_sndtimeo(sk, flags&MSG_DONTWAIT);
1030 
1031 	/* Wait for a connection to finish. */
1032 	if ((1 << sk->state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))
1033 		if((err = wait_for_tcp_connect(sk, flags, &timeo)) != 0)
1034 			goto out_err;
1035 
1036 	/* This should be in poll */
1037 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
1038 
1039 	mss_now = tcp_current_mss(sk);
1040 
1041 	/* Ok commence sending. */
1042 	iovlen = msg->msg_iovlen;
1043 	iov = msg->msg_iov;
1044 	copied = 0;
1045 
1046 	err = -EPIPE;
1047 	if (sk->err || (sk->shutdown&SEND_SHUTDOWN))
1048 		goto do_error;
1049 
1050 	while (--iovlen >= 0) {
1051 		int seglen=iov->iov_len;
1052 		unsigned char * from=iov->iov_base;
1053 
1054 		iov++;
1055 
1056 		while (seglen > 0) {
1057 			int copy;
1058 
1059 			skb = sk->write_queue.prev;
1060 
1061 			if (tp->send_head == NULL ||
1062 			    (copy = mss_now - skb->len) <= 0) {
1063 
1064 new_segment:
1065 				/* Allocate new segment. If the interface is SG,
1066 				 * allocate skb fitting to single page.
1067 				 */
1068 				if (!tcp_memory_free(sk))
1069 					goto wait_for_sndbuf;
1070 
1071 				skb = tcp_alloc_pskb(sk, select_size(sk, tp), 0, sk->allocation);
1072 				if (skb == NULL)
1073 					goto wait_for_memory;
1074 
1075 				skb_entail(sk, tp, skb);
1076 				copy = mss_now;
1077 			}
1078 
1079 			/* Try to append data to the end of skb. */
1080 			if (copy > seglen)
1081 				copy = seglen;
1082 
1083 			/* Where to copy to? */
1084 			if (skb_tailroom(skb) > 0) {
1085 				/* We have some space in skb head. Superb! */
1086 				if (copy > skb_tailroom(skb))
1087 					copy = skb_tailroom(skb);
1088 				if ((err = skb_add_data(skb, from, copy)) != 0)
1089 					goto do_fault;
1090 			} else {
1091 				int merge = 0;
1092 				int i = skb_shinfo(skb)->nr_frags;
1093 				struct page *page = TCP_PAGE(sk);
1094 				int off = TCP_OFF(sk);
1095 
1096 				if (can_coalesce(skb, i, page, off) && off != PAGE_SIZE) {
1097 					/* We can extend the last page fragment. */
1098 					merge = 1;
1099 				} else if (i == MAX_SKB_FRAGS ||
1100 					   (i == 0 && !(sk->route_caps&NETIF_F_SG))) {
1101 					/* Need to add new fragment and cannot
1102 					 * do this because interface is non-SG,
1103 					 * or because all the page slots are busy.
1104 					 */
1105 					tcp_mark_push(tp, skb);
1106 					goto new_segment;
1107 				} else if (page) {
1108 					/* If page is cached, align
1109 					 * offset to L1 cache boundary
1110 					 */
1111 					off = (off+L1_CACHE_BYTES-1)&~(L1_CACHE_BYTES-1);
1112 					if (off == PAGE_SIZE) {
1113 						put_page(page);
1114 						TCP_PAGE(sk) = page = NULL;
1115 					}
1116 				}
1117 
1118 				if (!page) {
1119 					/* Allocate new cache page. */
1120 					if (!(page=tcp_alloc_page(sk)))
1121 						goto wait_for_memory;
1122 					off = 0;
1123 				}
1124 
1125 				if (copy > PAGE_SIZE-off)
1126 					copy = PAGE_SIZE-off;
1127 
1128 				/* Time to copy data. We are close to the end! */
1129 				err = tcp_copy_to_page(sk, from, skb, page, off, copy);
1130 				if (err) {
1131 					/* If this page was new, give it to the
1132 					 * socket so it does not get leaked.
1133 					 */
1134 					if (TCP_PAGE(sk) == NULL) {
1135 						TCP_PAGE(sk) = page;
1136 						TCP_OFF(sk) = 0;
1137 					}
1138 					goto do_error;
1139 				}
1140 
1141 				/* Update the skb. */
1142 				if (merge) {
1143 					skb_shinfo(skb)->frags[i-1].size += copy;
1144 				} else {
1145 					fill_page_desc(skb, i, page, off, copy);
1146 					if (TCP_PAGE(sk)) {
1147 						get_page(page);
1148 					} else if (off + copy < PAGE_SIZE) {
1149 						get_page(page);
1150 						TCP_PAGE(sk) = page;
1151 					}
1152 				}
1153 
1154 				TCP_OFF(sk) = off+copy;
1155 			}
1156 
1157 			if (!copied)
1158 				TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH;
1159 
1160 			tp->write_seq += copy;
1161 			TCP_SKB_CB(skb)->end_seq += copy;
1162 
1163 			from += copy;
1164 			copied += copy;
1165 			if ((seglen -= copy) == 0 && iovlen == 0)
1166 				goto out;
1167 
1168 			if (skb->len != mss_now || (flags&MSG_OOB))
1169 				continue;
1170 
1171 			if (forced_push(tp)) {
1172 				tcp_mark_push(tp, skb);
1173 				__tcp_push_pending_frames(sk, tp, mss_now, 1);
1174 			} else if (skb == tp->send_head)
1175 				tcp_push_one(sk, mss_now);
1176 			continue;
1177 
1178 wait_for_sndbuf:
1179 			set_bit(SOCK_NOSPACE, &sk->socket->flags);
1180 wait_for_memory:
1181 			if (copied)
1182 				tcp_push(sk, tp, flags&~MSG_MORE, mss_now, 1);
1183 
1184 			if ((err = wait_for_tcp_memory(sk, &timeo)) != 0)
1185 				goto do_error;
1186 
1187 			mss_now = tcp_current_mss(sk);
1188 		}
1189 	}
1190 
1191 out:
1192 	if (copied)
1193 		tcp_push(sk, tp, flags, mss_now, tp->nonagle);
1194 	TCP_CHECK_TIMER(sk);
1195 	release_sock(sk);
1196 	return copied;
1197 
1198 do_fault:
1199 	if (skb->len == 0) {
1200 		if (tp->send_head == skb)
1201 			tp->send_head = NULL;
1202 		__skb_unlink(skb, skb->list);
1203 		tcp_free_skb(sk, skb);
1204 	}
1205 
1206 do_error:
1207 	if (copied)
1208 		goto out;
1209 out_err:
1210 	err = tcp_error(sk, flags, err);
1211 	TCP_CHECK_TIMER(sk);
1212 	release_sock(sk);
1213 	return err;
1214 }
1215 
1216 /*
1217  *	Handle reading urgent data. BSD has very simple semantics for
1218  *	this, no blocking and very strange errors 8)
1219  */
1220 
tcp_recv_urg(struct sock * sk,long timeo,struct msghdr * msg,int len,int flags,int * addr_len)1221 static int tcp_recv_urg(struct sock * sk, long timeo,
1222 			struct msghdr *msg, int len, int flags,
1223 			int *addr_len)
1224 {
1225 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1226 
1227 	/* No URG data to read. */
1228 	if (sk->urginline || !tp->urg_data || tp->urg_data == TCP_URG_READ)
1229 		return -EINVAL;	/* Yes this is right ! */
1230 
1231 	if (sk->state==TCP_CLOSE && !sk->done)
1232 		return -ENOTCONN;
1233 
1234 	if (tp->urg_data & TCP_URG_VALID) {
1235 		int err = 0;
1236 		char c = tp->urg_data;
1237 
1238 		if (!(flags & MSG_PEEK))
1239 			tp->urg_data = TCP_URG_READ;
1240 
1241 		/* Read urgent data. */
1242 		msg->msg_flags|=MSG_OOB;
1243 
1244 		if(len>0) {
1245 			if (!(flags & MSG_TRUNC))
1246 				err = memcpy_toiovec(msg->msg_iov, &c, 1);
1247 			len = 1;
1248 		} else
1249 			msg->msg_flags|=MSG_TRUNC;
1250 
1251 		return err ? -EFAULT : len;
1252 	}
1253 
1254 	if (sk->state == TCP_CLOSE || (sk->shutdown & RCV_SHUTDOWN))
1255 		return 0;
1256 
1257 	/* Fixed the recv(..., MSG_OOB) behaviour.  BSD docs and
1258 	 * the available implementations agree in this case:
1259 	 * this call should never block, independent of the
1260 	 * blocking state of the socket.
1261 	 * Mike <pall@rz.uni-karlsruhe.de>
1262 	 */
1263 	return -EAGAIN;
1264 }
1265 
1266 /*
1267  *	Release a skb if it is no longer needed. This routine
1268  *	must be called with interrupts disabled or with the
1269  *	socket locked so that the sk_buff queue operation is ok.
1270  */
1271 
tcp_eat_skb(struct sock * sk,struct sk_buff * skb)1272 static inline void tcp_eat_skb(struct sock *sk, struct sk_buff * skb)
1273 {
1274 	__skb_unlink(skb, &sk->receive_queue);
1275 	__kfree_skb(skb);
1276 }
1277 
1278 /* Clean up the receive buffer for full frames taken by the user,
1279  * then send an ACK if necessary.  COPIED is the number of bytes
1280  * tcp_recvmsg has given to the user so far, it speeds up the
1281  * calculation of whether or not we must ACK for the sake of
1282  * a window update.
1283  */
cleanup_rbuf(struct sock * sk,int copied)1284 static void cleanup_rbuf(struct sock *sk, int copied)
1285 {
1286 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1287 	int time_to_ack = 0;
1288 
1289 #if TCP_DEBUG
1290 	struct sk_buff *skb = skb_peek(&sk->receive_queue);
1291 
1292 	BUG_TRAP(skb==NULL || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq));
1293 #endif
1294 
1295 	if (tcp_ack_scheduled(tp)) {
1296 		   /* Delayed ACKs frequently hit locked sockets during bulk receive. */
1297 		if (tp->ack.blocked
1298 		    /* Once-per-two-segments ACK was not sent by tcp_input.c */
1299 		    || tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss
1300 		    /*
1301 		     * If this read emptied read buffer, we send ACK, if
1302 		     * connection is not bidirectional, user drained
1303 		     * receive buffer and there was a small segment
1304 		     * in queue.
1305 		     */
1306 		    || (copied > 0 &&
1307 			(tp->ack.pending&TCP_ACK_PUSHED) &&
1308 			!tp->ack.pingpong &&
1309 			atomic_read(&sk->rmem_alloc) == 0)) {
1310 			time_to_ack = 1;
1311 		}
1312 	}
1313 
1314   	/* We send an ACK if we can now advertise a non-zero window
1315 	 * which has been raised "significantly".
1316 	 *
1317 	 * Even if window raised up to infinity, do not send window open ACK
1318 	 * in states, where we will not receive more. It is useless.
1319   	 */
1320 	if(copied > 0 && !time_to_ack && !(sk->shutdown&RCV_SHUTDOWN)) {
1321 		__u32 rcv_window_now = tcp_receive_window(tp);
1322 
1323 		/* Optimize, __tcp_select_window() is not cheap. */
1324 		if (2*rcv_window_now <= tp->window_clamp) {
1325 			__u32 new_window = __tcp_select_window(sk);
1326 
1327 			/* Send ACK now, if this read freed lots of space
1328 			 * in our buffer. Certainly, new_window is new window.
1329 			 * We can advertise it now, if it is not less than current one.
1330 			 * "Lots" means "at least twice" here.
1331 			 */
1332 			if(new_window && new_window >= 2*rcv_window_now)
1333 				time_to_ack = 1;
1334 		}
1335 	}
1336 	if (time_to_ack)
1337 		tcp_send_ack(sk);
1338 }
1339 
1340 /* Now socket state including sk->err is changed only under lock,
1341  * hence we may omit checks after joining wait queue.
1342  * We check receive queue before schedule() only as optimization;
1343  * it is very likely that release_sock() added new data.
1344  */
1345 
tcp_data_wait(struct sock * sk,long timeo)1346 static long tcp_data_wait(struct sock *sk, long timeo)
1347 {
1348 	DECLARE_WAITQUEUE(wait, current);
1349 
1350 	add_wait_queue(sk->sleep, &wait);
1351 
1352 	__set_current_state(TASK_INTERRUPTIBLE);
1353 
1354 	set_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1355 	release_sock(sk);
1356 
1357 	if (skb_queue_empty(&sk->receive_queue))
1358 		timeo = schedule_timeout(timeo);
1359 
1360 	lock_sock(sk);
1361 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->socket->flags);
1362 
1363 	remove_wait_queue(sk->sleep, &wait);
1364 	__set_current_state(TASK_RUNNING);
1365 	return timeo;
1366 }
1367 
tcp_prequeue_process(struct sock * sk)1368 static void tcp_prequeue_process(struct sock *sk)
1369 {
1370 	struct sk_buff *skb;
1371 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1372 
1373 	net_statistics[smp_processor_id()*2+1].TCPPrequeued += skb_queue_len(&tp->ucopy.prequeue);
1374 
1375 	/* RX process wants to run with disabled BHs, though it is not necessary */
1376 	local_bh_disable();
1377 	while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
1378 		sk->backlog_rcv(sk, skb);
1379 	local_bh_enable();
1380 
1381 	/* Clear memory counter. */
1382 	tp->ucopy.memory = 0;
1383 }
1384 
1385 static inline
tcp_recv_skb(struct sock * sk,u32 seq,u32 * off)1386 struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
1387 {
1388 	struct sk_buff *skb;
1389 	u32 offset;
1390 
1391 	skb_queue_walk(&sk->receive_queue, skb) {
1392 		offset = seq - TCP_SKB_CB(skb)->seq;
1393 		if (skb->h.th->syn)
1394 			offset--;
1395 		if (offset < skb->len || skb->h.th->fin) {
1396 			*off = offset;
1397 			return skb;
1398 		}
1399 	}
1400 	return NULL;
1401 }
1402 
1403 /*
1404  * This routine provides an alternative to tcp_recvmsg() for routines
1405  * that would like to handle copying from skbuffs directly in 'sendfile'
1406  * fashion.
1407  * Note:
1408  *	- It is assumed that the socket was locked by the caller.
1409  *	- The routine does not block.
1410  *	- At present, there is no support for reading OOB data
1411  *	  or for 'peeking' the socket using this routine
1412  *	  (although both would be easy to implement).
1413  */
tcp_read_sock(struct sock * sk,read_descriptor_t * desc,sk_read_actor_t recv_actor)1414 int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
1415 		  sk_read_actor_t recv_actor)
1416 {
1417 	struct sk_buff *skb;
1418 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1419 	u32 seq = tp->copied_seq;
1420 	u32 offset;
1421 	int copied = 0;
1422 
1423 	if (sk->state == TCP_LISTEN)
1424 		return -ENOTCONN;
1425 	while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) {
1426 		if (offset < skb->len) {
1427 			size_t used, len;
1428 
1429 			len = skb->len - offset;
1430 			/* Stop reading if we hit a patch of urgent data */
1431 			if (tp->urg_data) {
1432 				u32 urg_offset = tp->urg_seq - seq;
1433 				if (urg_offset < len)
1434 					len = urg_offset;
1435 				if (!len)
1436 					break;
1437 			}
1438 			used = recv_actor(desc, skb, offset, len);
1439 			if (used <= len) {
1440 				seq += used;
1441 				copied += used;
1442 				offset += used;
1443 			}
1444 			if (offset != skb->len)
1445 				break;
1446 		}
1447 		if (skb->h.th->fin) {
1448 			tcp_eat_skb(sk, skb);
1449 			++seq;
1450 			break;
1451 		}
1452 		tcp_eat_skb(sk, skb);
1453 		if (!desc->count)
1454 			break;
1455 	}
1456 	tp->copied_seq = seq;
1457 
1458 	tcp_rcv_space_adjust(sk);
1459 
1460 	/* Clean up data we have read: This will do ACK frames. */
1461 	if (copied)
1462 		cleanup_rbuf(sk, copied);
1463 	return copied;
1464 }
1465 
1466 /*
1467  *	This routine copies from a sock struct into the user buffer.
1468  *
1469  *	Technical note: in 2.3 we work on _locked_ socket, so that
1470  *	tricks with *seq access order and skb->users are not required.
1471  *	Probably, code can be easily improved even more.
1472  */
1473 
tcp_recvmsg(struct sock * sk,struct msghdr * msg,int len,int nonblock,int flags,int * addr_len)1474 int tcp_recvmsg(struct sock *sk, struct msghdr *msg,
1475 		int len, int nonblock, int flags, int *addr_len)
1476 {
1477 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
1478 	int copied = 0;
1479 	u32 peek_seq;
1480 	u32 *seq;
1481 	unsigned long used;
1482 	int err;
1483 	int target;		/* Read at least this many bytes */
1484 	long timeo;
1485 	struct task_struct *user_recv = NULL;
1486 
1487 	lock_sock(sk);
1488 
1489 	TCP_CHECK_TIMER(sk);
1490 
1491 	err = -ENOTCONN;
1492 	if (sk->state == TCP_LISTEN)
1493 		goto out;
1494 
1495 	timeo = sock_rcvtimeo(sk, nonblock);
1496 
1497 	/* Urgent data needs to be handled specially. */
1498 	if (flags & MSG_OOB)
1499 		goto recv_urg;
1500 
1501 	seq = &tp->copied_seq;
1502 	if (flags & MSG_PEEK) {
1503 		peek_seq = tp->copied_seq;
1504 		seq = &peek_seq;
1505 	}
1506 
1507 	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
1508 
1509 	do {
1510 		struct sk_buff * skb;
1511 		u32 offset;
1512 
1513 		/* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */
1514 		if (tp->urg_data && tp->urg_seq == *seq) {
1515 			if (copied)
1516 				break;
1517 			if (signal_pending(current)) {
1518 				copied = timeo ? sock_intr_errno(timeo) : -EAGAIN;
1519 				break;
1520 			}
1521 		}
1522 
1523 		/* Next get a buffer. */
1524 
1525 		skb = skb_peek(&sk->receive_queue);
1526 		do {
1527 			if (!skb)
1528 				break;
1529 
1530 			/* Now that we have two receive queues this
1531 			 * shouldn't happen.
1532 			 */
1533 			if (before(*seq, TCP_SKB_CB(skb)->seq)) {
1534 				printk(KERN_INFO "recvmsg bug: copied %X seq %X\n",
1535 				       *seq, TCP_SKB_CB(skb)->seq);
1536 				break;
1537 			}
1538 			offset = *seq - TCP_SKB_CB(skb)->seq;
1539 			if (skb->h.th->syn)
1540 				offset--;
1541 			if (offset < skb->len)
1542 				goto found_ok_skb;
1543 			if (skb->h.th->fin)
1544 				goto found_fin_ok;
1545 			BUG_TRAP(flags&MSG_PEEK);
1546 			skb = skb->next;
1547 		} while (skb != (struct sk_buff *)&sk->receive_queue);
1548 
1549 		/* Well, if we have backlog, try to process it now yet. */
1550 
1551 		if (copied >= target && sk->backlog.tail == NULL)
1552 			break;
1553 
1554 		if (copied) {
1555 			if (sk->err ||
1556 			    sk->state == TCP_CLOSE ||
1557 			    (sk->shutdown & RCV_SHUTDOWN) ||
1558 			    !timeo ||
1559 			    signal_pending(current) ||
1560 			    (flags & MSG_PEEK))
1561 				break;
1562 		} else {
1563 			if (sk->done)
1564 				break;
1565 
1566 			if (sk->err) {
1567 				copied = sock_error(sk);
1568 				break;
1569 			}
1570 
1571 			if (sk->shutdown & RCV_SHUTDOWN)
1572 				break;
1573 
1574 			if (sk->state == TCP_CLOSE) {
1575 				if (!sk->done) {
1576 					/* This occurs when user tries to read
1577 					 * from never connected socket.
1578 					 */
1579 					copied = -ENOTCONN;
1580 					break;
1581 				}
1582 				break;
1583 			}
1584 
1585 			if (!timeo) {
1586 				copied = -EAGAIN;
1587 				break;
1588 			}
1589 
1590 			if (signal_pending(current)) {
1591 				copied = sock_intr_errno(timeo);
1592 				break;
1593 			}
1594 		}
1595 
1596 		cleanup_rbuf(sk, copied);
1597 
1598 		if (tp->ucopy.task == user_recv) {
1599 			/* Install new reader */
1600 			if (user_recv == NULL && !(flags&(MSG_TRUNC|MSG_PEEK))) {
1601 				user_recv = current;
1602 				tp->ucopy.task = user_recv;
1603 				tp->ucopy.iov = msg->msg_iov;
1604 			}
1605 
1606 			tp->ucopy.len = len;
1607 
1608 			BUG_TRAP(tp->copied_seq == tp->rcv_nxt || (flags&(MSG_PEEK|MSG_TRUNC)));
1609 
1610 			/* Ugly... If prequeue is not empty, we have to
1611 			 * process it before releasing socket, otherwise
1612 			 * order will be broken at second iteration.
1613 			 * More elegant solution is required!!!
1614 			 *
1615 			 * Look: we have the following (pseudo)queues:
1616 			 *
1617 			 * 1. packets in flight
1618 			 * 2. backlog
1619 			 * 3. prequeue
1620 			 * 4. receive_queue
1621 			 *
1622 			 * Each queue can be processed only if the next ones
1623 			 * are empty. At this point we have empty receive_queue.
1624 			 * But prequeue _can_ be not empty after second iteration,
1625 			 * when we jumped to start of loop because backlog
1626 			 * processing added something to receive_queue.
1627 			 * We cannot release_sock(), because backlog contains
1628 			 * packets arrived _after_ prequeued ones.
1629 			 *
1630 			 * Shortly, algorithm is clear --- to process all
1631 			 * the queues in order. We could make it more directly,
1632 			 * requeueing packets from backlog to prequeue, if
1633 			 * is not empty. It is more elegant, but eats cycles,
1634 			 * unfortunately.
1635 			 */
1636 			if (skb_queue_len(&tp->ucopy.prequeue))
1637 				goto do_prequeue;
1638 
1639 			/* __ Set realtime policy in scheduler __ */
1640 		}
1641 
1642 		if (copied >= target) {
1643 			/* Do not sleep, just process backlog. */
1644 			release_sock(sk);
1645 			lock_sock(sk);
1646 		} else {
1647 			timeo = tcp_data_wait(sk, timeo);
1648 		}
1649 
1650 		if (user_recv) {
1651 			int chunk;
1652 
1653 			/* __ Restore normal policy in scheduler __ */
1654 
1655 			if ((chunk = len - tp->ucopy.len) != 0) {
1656 				net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromBacklog += chunk;
1657 				len -= chunk;
1658 				copied += chunk;
1659 			}
1660 
1661 			if (tp->rcv_nxt == tp->copied_seq &&
1662 			    skb_queue_len(&tp->ucopy.prequeue)) {
1663 do_prequeue:
1664 				tcp_prequeue_process(sk);
1665 
1666 				if ((chunk = len - tp->ucopy.len) != 0) {
1667 					net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1668 					len -= chunk;
1669 					copied += chunk;
1670 				}
1671 			}
1672 		}
1673 		if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) {
1674 			if (net_ratelimit())
1675 				printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n",
1676 				       current->comm, current->pid);
1677 			peek_seq = tp->copied_seq;
1678 		}
1679 		continue;
1680 
1681 	found_ok_skb:
1682 		/* Ok so how much can we use? */
1683 		used = skb->len - offset;
1684 		if (len < used)
1685 			used = len;
1686 
1687 		/* Do we have urgent data here? */
1688 		if (tp->urg_data) {
1689 			u32 urg_offset = tp->urg_seq - *seq;
1690 			if (urg_offset < used) {
1691 				if (!urg_offset) {
1692 					if (!sk->urginline) {
1693 						++*seq;
1694 						offset++;
1695 						used--;
1696 						if (!used)
1697 							goto skip_copy;
1698 					}
1699 				} else
1700 					used = urg_offset;
1701 			}
1702 		}
1703 
1704 		if (!(flags&MSG_TRUNC)) {
1705 			err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, used);
1706 			if (err) {
1707 				/* Exception. Bailout! */
1708 				if (!copied)
1709 					copied = -EFAULT;
1710 				break;
1711 			}
1712 		}
1713 
1714 		*seq += used;
1715 		copied += used;
1716 		len -= used;
1717 
1718 		tcp_rcv_space_adjust(sk);
1719 
1720 skip_copy:
1721 		if (tp->urg_data && after(tp->copied_seq,tp->urg_seq)) {
1722 			tp->urg_data = 0;
1723 			tcp_fast_path_check(sk, tp);
1724 		}
1725 		if (used + offset < skb->len)
1726 			continue;
1727 
1728 		if (skb->h.th->fin)
1729 			goto found_fin_ok;
1730 		if (!(flags & MSG_PEEK))
1731 			tcp_eat_skb(sk, skb);
1732 		continue;
1733 
1734 	found_fin_ok:
1735 		/* Process the FIN. */
1736 		++*seq;
1737 		if (!(flags & MSG_PEEK))
1738 			tcp_eat_skb(sk, skb);
1739 		break;
1740 	} while (len > 0);
1741 
1742 	if (user_recv) {
1743 		if (skb_queue_len(&tp->ucopy.prequeue)) {
1744 			int chunk;
1745 
1746 			tp->ucopy.len = copied > 0 ? len : 0;
1747 
1748 			tcp_prequeue_process(sk);
1749 
1750 			if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) {
1751 				net_statistics[smp_processor_id()*2+1].TCPDirectCopyFromPrequeue += chunk;
1752 				len -= chunk;
1753 				copied += chunk;
1754 			}
1755 		}
1756 
1757 		tp->ucopy.task = NULL;
1758 		tp->ucopy.len = 0;
1759 	}
1760 
1761 	/* According to UNIX98, msg_name/msg_namelen are ignored
1762 	 * on connected socket. I was just happy when found this 8) --ANK
1763 	 */
1764 
1765 	/* Clean up data we have read: This will do ACK frames. */
1766 	cleanup_rbuf(sk, copied);
1767 
1768 	TCP_CHECK_TIMER(sk);
1769 	release_sock(sk);
1770 	return copied;
1771 
1772 out:
1773 	TCP_CHECK_TIMER(sk);
1774 	release_sock(sk);
1775 	return err;
1776 
1777 recv_urg:
1778 	err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len);
1779 	goto out;
1780 }
1781 
1782 /*
1783  *	State processing on a close. This implements the state shift for
1784  *	sending our FIN frame. Note that we only send a FIN for some
1785  *	states. A shutdown() may have already sent the FIN, or we may be
1786  *	closed.
1787  */
1788 
1789 static unsigned char new_state[16] = {
1790   /* current state:        new state:      action:	*/
1791   /* (Invalid)		*/ TCP_CLOSE,
1792   /* TCP_ESTABLISHED	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1793   /* TCP_SYN_SENT	*/ TCP_CLOSE,
1794   /* TCP_SYN_RECV	*/ TCP_FIN_WAIT1 | TCP_ACTION_FIN,
1795   /* TCP_FIN_WAIT1	*/ TCP_FIN_WAIT1,
1796   /* TCP_FIN_WAIT2	*/ TCP_FIN_WAIT2,
1797   /* TCP_TIME_WAIT	*/ TCP_CLOSE,
1798   /* TCP_CLOSE		*/ TCP_CLOSE,
1799   /* TCP_CLOSE_WAIT	*/ TCP_LAST_ACK  | TCP_ACTION_FIN,
1800   /* TCP_LAST_ACK	*/ TCP_LAST_ACK,
1801   /* TCP_LISTEN		*/ TCP_CLOSE,
1802   /* TCP_CLOSING	*/ TCP_CLOSING,
1803 };
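
/*
 * For example, a socket closed while in TCP_CLOSE_WAIT decodes as follows
 * (this is exactly what tcp_close_state() below does):
 *
 *	int next = new_state[TCP_CLOSE_WAIT];	== TCP_LAST_ACK | TCP_ACTION_FIN
 *	int ns   = next & TCP_STATE_MASK;	== TCP_LAST_ACK, the next state
 *	int fin  = next & TCP_ACTION_FIN;	non-zero: a FIN must be sent
 */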
1804 
1805 static int tcp_close_state(struct sock *sk)
1806 {
1807 	int next = (int) new_state[sk->state];
1808 	int ns = (next & TCP_STATE_MASK);
1809 
1810 	tcp_set_state(sk, ns);
1811 
1812 	return (next & TCP_ACTION_FIN);
1813 }
1814 
1815 /*
1816  *	Shutdown the sending side of a connection. Much like close except
1817  *	that we don't receive shut down or set sk->dead.
1818  */
1819 
1820 void tcp_shutdown(struct sock *sk, int how)
1821 {
1822 	/*	We need to grab some memory, and put together a FIN,
1823 	 *	and then put it into the queue to be sent.
1824 	 *		Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92.
1825 	 */
1826 	if (!(how & SEND_SHUTDOWN))
1827 		return;
1828 
1829 	/* If we've already sent a FIN, or it's a closed state, skip this. */
1830 	if ((1 << sk->state) &
1831 	    (TCPF_ESTABLISHED|TCPF_SYN_SENT|TCPF_SYN_RECV|TCPF_CLOSE_WAIT)) {
1832 		/* Clear out any half completed packets.  FIN if needed. */
1833 		if (tcp_close_state(sk))
1834 			tcp_send_fin(sk);
1835 	}
1836 }
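
/*
 * User-space sketch (illustrative only): this path is normally reached via
 * shutdown(2) with SHUT_WR, which the socket layer translates into
 * SEND_SHUTDOWN before calling tcp_shutdown(). The snippet half-closes the
 * connection: the peer sees a FIN, while reads on fd remain possible.
 * Error handling is omitted.
 *
 *	#include <sys/socket.h>
 *
 *	static void half_close(int fd)
 *	{
 *		shutdown(fd, SHUT_WR);
 *	}
 */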
1837 
1838 
1839 /*
1840  *	Return 1 if we still have things to send in our buffers.
1841  */
1842 
1843 static inline int closing(struct sock * sk)
1844 {
1845 	return ((1 << sk->state) & (TCPF_FIN_WAIT1|TCPF_CLOSING|TCPF_LAST_ACK));
1846 }
1847 
1848 static __inline__ void tcp_kill_sk_queues(struct sock *sk)
1849 {
1850 	/* First the read buffer. */
1851 	__skb_queue_purge(&sk->receive_queue);
1852 
1853 	/* Next, the error queue. */
1854 	__skb_queue_purge(&sk->error_queue);
1855 
1856 	/* Next, the write queue. */
1857 	BUG_TRAP(skb_queue_empty(&sk->write_queue));
1858 
1859 	/* Account for returned memory. */
1860 	tcp_mem_reclaim(sk);
1861 
1862 	BUG_TRAP(sk->wmem_queued == 0);
1863 	BUG_TRAP(sk->forward_alloc == 0);
1864 
1865 	/* It is _impossible_ for the backlog to contain anything
1866 	 * when we get here.  All user references to this socket
1867 	 * have gone away; only the net layer can still touch it.
1868 	 */
1869 }
1870 
1871 /*
1872  * At this point, there should be no process reference to this
1873  * socket, and thus no user references at all.  Therefore we
1874  * can assume the socket waitqueue is inactive and nobody will
1875  * try to jump onto it.
1876  */
1877 void tcp_destroy_sock(struct sock *sk)
1878 {
1879 	BUG_TRAP(sk->state==TCP_CLOSE);
1880 	BUG_TRAP(sk->dead);
1881 
1882 	/* It cannot be in hash table! */
1883 	BUG_TRAP(sk->pprev==NULL);
1884 
1885 	/* If sk->num is not zero, it must be bound. */
1886 	BUG_TRAP(!sk->num || sk->prev!=NULL);
1887 
1888 #ifdef TCP_DEBUG
1889 	if (sk->zapped) {
1890 		printk(KERN_DEBUG "TCP: double destroy sk=%p\n", sk);
1891 		sock_hold(sk);
1892 	}
1893 	sk->zapped = 1;
1894 #endif
1895 
1896 	sk->prot->destroy(sk);
1897 
1898 	tcp_kill_sk_queues(sk);
1899 
1900 #ifdef INET_REFCNT_DEBUG
1901 	if (atomic_read(&sk->refcnt) != 1) {
1902 		printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", sk, atomic_read(&sk->refcnt));
1903 	}
1904 #endif
1905 
1906 	atomic_dec(&tcp_orphan_count);
1907 	sock_put(sk);
1908 }
1909 
1910 void tcp_close(struct sock *sk, long timeout)
1911 {
1912 	struct sk_buff *skb;
1913 	int data_was_unread = 0;
1914 
1915 	lock_sock(sk);
1916 	sk->shutdown = SHUTDOWN_MASK;
1917 
1918 	if(sk->state == TCP_LISTEN) {
1919 		tcp_set_state(sk, TCP_CLOSE);
1920 
1921 		/* Special case. */
1922 		tcp_listen_stop(sk);
1923 
1924 		goto adjudge_to_death;
1925 	}
1926 
1927 	/*  We need to flush the recv. buffs.  We do this only on the
1928 	 *  descriptor close, not protocol-sourced closes, because the
1929 	 *  reader process may not have drained the data yet!
1930 	 */
1931 	while((skb=__skb_dequeue(&sk->receive_queue))!=NULL) {
1932 		u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - skb->h.th->fin;
1933 		data_was_unread += len;
1934 		__kfree_skb(skb);
1935 	}
1936 
1937 	tcp_mem_reclaim(sk);
1938 
1939 	/* As outlined in draft-ietf-tcpimpl-prob-03.txt, section
1940 	 * 3.10, we send a RST here because data was lost.  To
1941 	 * witness the awful effects of the old behavior of always
1942 	 * doing a FIN, run an older 2.1.x kernel or 2.0.x, start
1943 	 * a bulk GET in an FTP client, suspend the process, wait
1944 	 * for the client to advertise a zero window, then kill -9
1945 	 * the FTP client, wheee...  Note: timeout is always zero
1946 	 * in such a case.
1947 	 */
1948 	if(data_was_unread != 0) {
1949 		/* Unread data was tossed, zap the connection. */
1950 		NET_INC_STATS_USER(TCPAbortOnClose);
1951 		tcp_set_state(sk, TCP_CLOSE);
1952 		tcp_send_active_reset(sk, GFP_KERNEL);
1953 	} else if (sk->linger && sk->lingertime==0) {
1954 		/* Check zero linger _after_ checking for unread data. */
1955 		sk->prot->disconnect(sk, 0);
1956 		NET_INC_STATS_USER(TCPAbortOnData);
1957 	} else if (tcp_close_state(sk)) {
1958 		/* We FIN if the application ate all the data before
1959 		 * zapping the connection.
1960 		 */
1961 
1962 		/* RED-PEN. Formally speaking, we have broken the TCP state
1963 		 * machine. The state transitions:
1964 		 *
1965 		 * TCP_ESTABLISHED -> TCP_FIN_WAIT1
1966 		 * TCP_SYN_RECV	-> TCP_FIN_WAIT1 (forget it, it's impossible)
1967 		 * TCP_CLOSE_WAIT -> TCP_LAST_ACK
1968 		 *
1969 		 * are legal only when the FIN has actually been sent (i.e. is
1970 		 * in the window), rather than queued out of window. Purists
1971 		 * may object.
1972 		 *
1973 		 * E.g. the "RFC state" is ESTABLISHED while the Linux state
1974 		 * is FIN-WAIT-1, but the FIN has not been sent yet.
1975 		 *
1976 		 * The visible deviations are that we sometimes enter the
1977 		 * time-wait state when it is not really required (harmless),
1978 		 * and do not send active resets when they are required by the
1979 		 * specs (TCP_ESTABLISHED and TCP_CLOSE_WAIT, which look like
1980 		 * CLOSING or LAST_ACK to Linux).
1981 		 * Probably I missed some more corner cases.	--ANK
1982 		 */
1983 		tcp_send_fin(sk);
1984 	}
1985 
1986 	if (timeout) {
1987 		struct task_struct *tsk = current;
1988 		DECLARE_WAITQUEUE(wait, current);
1989 
1990 		add_wait_queue(sk->sleep, &wait);
1991 
1992 		do {
1993 			set_current_state(TASK_INTERRUPTIBLE);
1994 			if (!closing(sk))
1995 				break;
1996 			release_sock(sk);
1997 			timeout = schedule_timeout(timeout);
1998 			lock_sock(sk);
1999 		} while (!signal_pending(tsk) && timeout);
2000 
2001 		tsk->state = TASK_RUNNING;
2002 		remove_wait_queue(sk->sleep, &wait);
2003 	}
2004 
2005 adjudge_to_death:
2006 	/* This is the last release_sock() in this socket's life. It will drain the backlog. */
2007 	release_sock(sk);
2008 
2009 
2010 	/* Now the socket is owned by the kernel and we acquire the BH lock
2011 	   to finish the close. No need to check for user refs.
2012 	 */
2013 	local_bh_disable();
2014 	bh_lock_sock(sk);
2015 	BUG_TRAP(sk->lock.users==0);
2016 
2017 	sock_hold(sk);
2018 	sock_orphan(sk);
2019 
2020 	/*	This is a (useful) BSD violation of the RFC. There is a
2021 	 *	problem with TCP as specified in that the other end could
2022 	 *	keep a socket open forever with no application left on this
2023 	 *	end. We use a 3 minute timeout (about the same as BSD) and
2024 	 *	then kill our end. If they send after that then tough - BUT:
2025 	 *	it is long enough that we avoid the old "4*rto = almost no
2026 	 *	time - whoops reset" mistake.
2027 	 *
2028 	 *	Nope, it was not a mistake. It is really the desired
2029 	 *	behaviour, e.g. on HTTP servers, where such sockets are
2030 	 *	useless but consume significant resources. Let's do it with
2031 	 *	the special linger2 option.			--ANK
2032 	 */
2033 
2034 	if (sk->state == TCP_FIN_WAIT2) {
2035 		struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2036 		if (tp->linger2 < 0) {
2037 			tcp_set_state(sk, TCP_CLOSE);
2038 			tcp_send_active_reset(sk, GFP_ATOMIC);
2039 			NET_INC_STATS_BH(TCPAbortOnLinger);
2040 		} else {
2041 			int tmo = tcp_fin_time(tp);
2042 
2043 			if (tmo > TCP_TIMEWAIT_LEN) {
2044 				tcp_reset_keepalive_timer(sk, tcp_fin_time(tp));
2045 			} else {
2046 				atomic_inc(&tcp_orphan_count);
2047 				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
2048 				goto out;
2049 			}
2050 		}
2051 	}
2052 	if (sk->state != TCP_CLOSE) {
2053 		tcp_mem_reclaim(sk);
2054 		if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans ||
2055 		    (sk->wmem_queued > SOCK_MIN_SNDBUF &&
2056 		     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
2057 			if (net_ratelimit())
2058 				printk(KERN_INFO "TCP: too many orphaned sockets\n");
2059 			tcp_set_state(sk, TCP_CLOSE);
2060 			tcp_send_active_reset(sk, GFP_ATOMIC);
2061 			NET_INC_STATS_BH(TCPAbortOnMemory);
2062 		}
2063 	}
2064 	atomic_inc(&tcp_orphan_count);
2065 
2066 	if (sk->state == TCP_CLOSE)
2067 		tcp_destroy_sock(sk);
2068 	/* Otherwise, socket is reprieved until protocol close. */
2069 
2070 out:
2071 	bh_unlock_sock(sk);
2072 	local_bh_enable();
2073 	sock_put(sk);
2074 }
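
/*
 * User-space sketch (illustrative only): the zero-linger branch above
 * (sk->linger && sk->lingertime == 0) is what an application triggers by
 * enabling SO_LINGER with a zero timeout before close(); on an established
 * connection this aborts with a RST instead of the normal FIN handshake.
 * Error handling is omitted.
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	static void abortive_close(int fd)
 *	{
 *		struct linger lg = { .l_onoff = 1, .l_linger = 0 };
 *
 *		setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 *		close(fd);
 *	}
 */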
2075 
2076 /* These states need RST on ABORT according to RFC793 */
2077 
2078 static inline int tcp_need_reset(int state)
2079 {
2080 	return ((1 << state) &
2081 	       	(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT|TCPF_FIN_WAIT1|
2082 		 TCPF_FIN_WAIT2|TCPF_SYN_RECV));
2083 }
2084 
2085 int tcp_disconnect(struct sock *sk, int flags)
2086 {
2087 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2088 	int old_state;
2089 	int err = 0;
2090 
2091 	old_state = sk->state;
2092 	if (old_state != TCP_CLOSE)
2093 		tcp_set_state(sk, TCP_CLOSE);
2094 
2095 	/* ABORT function of RFC793 */
2096 	if (old_state == TCP_LISTEN) {
2097 		tcp_listen_stop(sk);
2098 	} else if (tcp_need_reset(old_state) ||
2099 		   (tp->snd_nxt != tp->write_seq &&
2100 		    (1<<old_state)&(TCPF_CLOSING|TCPF_LAST_ACK))) {
2101 		/* The last check adjusts for the discrepancy between Linux
2102 		 * and the RFC states.
2103 		 */
2104 		tcp_send_active_reset(sk, gfp_any());
2105 		sk->err = ECONNRESET;
2106 	} else if (old_state == TCP_SYN_SENT)
2107 		sk->err = ECONNRESET;
2108 
2109 	tcp_clear_xmit_timers(sk);
2110 	__skb_queue_purge(&sk->receive_queue);
2111   	tcp_writequeue_purge(sk);
2112   	__skb_queue_purge(&tp->out_of_order_queue);
2113 
2114 	sk->dport = 0;
2115 
2116 	if (!(sk->userlocks&SOCK_BINDADDR_LOCK)) {
2117 		sk->rcv_saddr = 0;
2118 		sk->saddr = 0;
2119 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
2120 		memset(&sk->net_pinfo.af_inet6.saddr, 0, 16);
2121 		memset(&sk->net_pinfo.af_inet6.rcv_saddr, 0, 16);
2122 #endif
2123 	}
2124 
2125 	sk->shutdown = 0;
2126 	sk->done = 0;
2127 	tp->srtt = 0;
2128 	if ((tp->write_seq += tp->max_window+2) == 0)
2129 		tp->write_seq = 1;
2130 	tp->backoff = 0;
2131 	tp->snd_cwnd = 2;
2132 	tp->probes_out = 0;
2133 	tp->packets_out = 0;
2134 	tp->snd_ssthresh = 0x7fffffff;
2135 	tp->snd_cwnd_cnt = 0;
2136 	tcp_set_ca_state(tp, TCP_CA_Open);
2137 	tcp_clear_retrans(tp);
2138 	tcp_delack_init(tp);
2139 	tp->send_head = NULL;
2140 	tp->saw_tstamp = 0;
2141 	tcp_sack_reset(tp);
2142 	__sk_dst_reset(sk);
2143 
2144 	BUG_TRAP(!sk->num || sk->prev);
2145 
2146 	sk->error_report(sk);
2147 	return err;
2148 }
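
/*
 * User-space sketch (illustrative only): one way tcp_disconnect() is reached
 * is a connect(2) call with sa_family set to AF_UNSPEC on an existing TCP
 * socket; the inet layer then invokes the protocol's disconnect operation so
 * the socket can be reused.
 *
 *	#include <sys/socket.h>
 *	#include <string.h>
 *
 *	static int tcp_unconnect(int fd)
 *	{
 *		struct sockaddr sa;
 *
 *		memset(&sa, 0, sizeof(sa));
 *		sa.sa_family = AF_UNSPEC;
 *		return connect(fd, &sa, sizeof(sa));
 *	}
 */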
2149 
2150 /*
2151  *	Wait for an incoming connection, avoid race
2152  *	conditions. This must be called with the socket locked.
2153  */
2154 static int wait_for_connect(struct sock * sk, long timeo)
2155 {
2156 	DECLARE_WAITQUEUE(wait, current);
2157 	int err;
2158 
2159 	/*
2160 	 * True wake-one mechanism for incoming connections: only
2161 	 * one process gets woken up, not the 'whole herd'.
2162 	 * Since we do not 'race & poll' for established sockets
2163 	 * anymore, the common case will execute the loop only once.
2164 	 *
2165 	 * Subtle issue: "add_wait_queue_exclusive()" will be added
2166 	 * after any current non-exclusive waiters, and we know that
2167 	 * it will always _stay_ after any new non-exclusive waiters
2168 	 * because all non-exclusive waiters are added at the
2169 	 * beginning of the wait-queue. As such, it's ok to "drop"
2170 	 * our exclusiveness temporarily when we get woken up without
2171 	 * having to remove and re-insert us on the wait queue.
2172 	 */
2173 	add_wait_queue_exclusive(sk->sleep, &wait);
2174 	for (;;) {
2175 		current->state = TASK_INTERRUPTIBLE;
2176 		release_sock(sk);
2177 		if (sk->tp_pinfo.af_tcp.accept_queue == NULL)
2178 			timeo = schedule_timeout(timeo);
2179 		lock_sock(sk);
2180 		err = 0;
2181 		if (sk->tp_pinfo.af_tcp.accept_queue)
2182 			break;
2183 		err = -EINVAL;
2184 		if (sk->state != TCP_LISTEN)
2185 			break;
2186 		err = sock_intr_errno(timeo);
2187 		if (signal_pending(current))
2188 			break;
2189 		err = -EAGAIN;
2190 		if (!timeo)
2191 			break;
2192 	}
2193 	current->state = TASK_RUNNING;
2194 	remove_wait_queue(sk->sleep, &wait);
2195 	return err;
2196 }
2197 
2198 /*
2199  *	This will accept the next outstanding connection.
2200  */
2201 
2202 struct sock *tcp_accept(struct sock *sk, int flags, int *err)
2203 {
2204 	struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
2205 	struct open_request *req;
2206 	struct sock *newsk;
2207 	int error;
2208 
2209 	lock_sock(sk);
2210 
2211 	/* We need to make sure that this socket is listening,
2212 	 * and that it has something pending.
2213 	 */
2214 	error = -EINVAL;
2215 	if (sk->state != TCP_LISTEN)
2216 		goto out;
2217 
2218 	/* Find already established connection */
2219 	if (!tp->accept_queue) {
2220 		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2221 
2222 		/* If this is a non-blocking socket, don't sleep. */
2223 		error = -EAGAIN;
2224 		if (!timeo)
2225 			goto out;
2226 
2227 		error = wait_for_connect(sk, timeo);
2228 		if (error)
2229 			goto out;
2230 	}
2231 
2232 	req = tp->accept_queue;
2233 	if ((tp->accept_queue = req->dl_next) == NULL)
2234 		tp->accept_queue_tail = NULL;
2235 
2236  	newsk = req->sk;
2237 	tcp_acceptq_removed(sk);
2238 	tcp_openreq_fastfree(req);
2239 	BUG_TRAP(newsk->state != TCP_SYN_RECV);
2240 	release_sock(sk);
2241 	return newsk;
2242 
2243 out:
2244 	release_sock(sk);
2245 	*err = error;
2246 	return NULL;
2247 }
2248 
2249 /*
2250  *	Socket option code for TCP.
2251  */
2252 
2253 int tcp_setsockopt(struct sock *sk, int level, int optname, char *optval,
2254 		   int optlen)
2255 {
2256 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2257 	int val;
2258 	int err = 0;
2259 
2260 	if (level != SOL_TCP)
2261 		return tp->af_specific->setsockopt(sk, level, optname,
2262 						   optval, optlen);
2263 
2264 	if(optlen<sizeof(int))
2265 		return -EINVAL;
2266 
2267 	if (get_user(val, (int *)optval))
2268 		return -EFAULT;
2269 
2270 	lock_sock(sk);
2271 
2272 	switch(optname) {
2273 	case TCP_MAXSEG:
2274 		/* Values greater than the interface MTU won't take effect.
2275 		 * However, at the point when this call is made we typically
2276 		 * don't yet know which interface is going to be used.
2277 		 */
2278 		if(val < 8 || val > MAX_TCP_WINDOW) {
2279 			err = -EINVAL;
2280 			break;
2281 		}
2282 		tp->user_mss = val;
2283 		break;
2284 
2285 	case TCP_NODELAY:
2286 		/* You cannot try to use this and TCP_CORK in
2287 		 * tandem, so let the user know.
2288 		 */
2289 		if (tp->nonagle == 2) {
2290 			err = -EINVAL;
2291 			break;
2292 		}
2293 		tp->nonagle = (val == 0) ? 0 : 1;
2294 		if (val)
2295 			tcp_push_pending_frames(sk, tp);
2296 		break;
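
		/*
		 * User-space sketch (illustrative only): disabling the Nagle
		 * algorithm for a latency-sensitive writer. SOL_TCP and
		 * IPPROTO_TCP are the same level value.
		 *
		 *	int one = 1;
		 *	setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
		 */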
2297 
2298 	case TCP_CORK:
2299 		/* When set, this tells us to always queue non-full frames.
2300 		 * Later the user clears this option and we transmit
2301 		 * any pending partial frames in the queue.  This is
2302 		 * meant to be used alongside sendfile() to get properly
2303 		 * filled frames when the user (for example) must write
2304 		 * out headers with a write() call first and then use
2305 		 * sendfile to send out the data parts.
2306 		 *
2307 		 * You cannot try to use TCP_NODELAY and this mechanism
2308 		 * at the same time, so let the user know.
2309 		 */
2310 		if (tp->nonagle == 1) {
2311 			err = -EINVAL;
2312 			break;
2313 		}
2314 		if (val != 0) {
2315 			tp->nonagle = 2;
2316 		} else {
2317 			tp->nonagle = 0;
2318 
2319 			tcp_push_pending_frames(sk, tp);
2320 		}
2321 		break;
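
		/*
		 * User-space sketch (illustrative only) of the header+sendfile
		 * pattern described above; "hdr", "hdr_len", "filefd" and
		 * "file_len" are assumed to exist, error handling is omitted.
		 *
		 *	int on = 1, off = 0;
		 *
		 *	setsockopt(sock, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
		 *	write(sock, hdr, hdr_len);
		 *	sendfile(sock, filefd, NULL, file_len);
		 *	setsockopt(sock, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
		 */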
2322 
2323 	case TCP_KEEPIDLE:
2324 		if (val < 1 || val > MAX_TCP_KEEPIDLE)
2325 			err = -EINVAL;
2326 		else {
2327 			tp->keepalive_time = val * HZ;
2328 			if (sk->keepopen && !((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN))) {
2329 				__u32 elapsed = tcp_time_stamp - tp->rcv_tstamp;
2330 				if (tp->keepalive_time > elapsed)
2331 					elapsed = tp->keepalive_time - elapsed;
2332 				else
2333 					elapsed = 0;
2334 				tcp_reset_keepalive_timer(sk, elapsed);
2335 			}
2336 		}
2337 		break;
2338 	case TCP_KEEPINTVL:
2339 		if (val < 1 || val > MAX_TCP_KEEPINTVL)
2340 			err = -EINVAL;
2341 		else
2342 			tp->keepalive_intvl = val * HZ;
2343 		break;
2344 	case TCP_KEEPCNT:
2345 		if (val < 1 || val > MAX_TCP_KEEPCNT)
2346 			err = -EINVAL;
2347 		else
2348 			tp->keepalive_probes = val;
2349 		break;
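
		/*
		 * User-space sketch (illustrative only): tuning keepalive on
		 * one socket. The TCP_KEEP* values only take effect once
		 * SO_KEEPALIVE is enabled; here probing starts after 60 s of
		 * idle time, repeats every 10 s and gives up after 5
		 * unanswered probes.
		 *
		 *	int on = 1, idle = 60, intvl = 10, cnt = 5;
		 *
		 *	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
		 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
		 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
		 *	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
		 */
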
2350 	case TCP_SYNCNT:
2351 		if (val < 1 || val > MAX_TCP_SYNCNT)
2352 			err = -EINVAL;
2353 		else
2354 			tp->syn_retries = val;
2355 		break;
2356 
2357 	case TCP_LINGER2:
2358 		if (val < 0)
2359 			tp->linger2 = -1;
2360 		else if (val > sysctl_tcp_fin_timeout/HZ)
2361 			tp->linger2 = 0;
2362 		else
2363 			tp->linger2 = val*HZ;
2364 		break;
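
		/*
		 * User-space sketch (illustrative only): TCP_LINGER2 bounds the
		 * FIN_WAIT2 lifetime used by tcp_close() above. A negative
		 * value disables the protection (orphans in FIN_WAIT2 are
		 * reset); a positive value is taken in seconds.
		 *
		 *	int fin_wait = 30;
		 *	setsockopt(fd, IPPROTO_TCP, TCP_LINGER2, &fin_wait, sizeof(fin_wait));
		 */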
2365 
2366 	case TCP_DEFER_ACCEPT:
2367 		tp->defer_accept = 0;
2368 		if (val > 0) {
2369 			/* Translate value in seconds to number of retransmits */
2370 			while (tp->defer_accept < 32 && val > ((TCP_TIMEOUT_INIT/HZ)<<tp->defer_accept))
2371 				tp->defer_accept++;
2372 			tp->defer_accept++;
2373 		}
2374 		break;
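
		/*
		 * Worked example (assuming TCP_TIMEOUT_INIT is 3*HZ): a request
		 * of 10 seconds runs the loop above as 10 > 3, 10 > 6, but not
		 * 10 > 12, leaving defer_accept at 2; the final increment
		 * stores 3. tcp_getsockopt() then reports this back as
		 * (TCP_TIMEOUT_INIT/HZ) << (3-1) = 12 seconds, i.e. the value
		 * is rounded up to a whole number of retransmission intervals.
		 */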
2375 
2376 	case TCP_WINDOW_CLAMP:
2377 		if (val==0) {
2378 			if (sk->state != TCP_CLOSE) {
2379 				err = -EINVAL;
2380 				break;
2381 			}
2382 			tp->window_clamp = 0;
2383 		} else {
2384 			tp->window_clamp = val<SOCK_MIN_RCVBUF/2 ?
2385 				SOCK_MIN_RCVBUF/2 : val;
2386 		}
2387 		break;
2388 
2389 	case TCP_QUICKACK:
2390 		if (!val) {
2391 			tp->ack.pingpong = 1;
2392 		} else {
2393 			tp->ack.pingpong = 0;
2394 			if ((1<<sk->state)&(TCPF_ESTABLISHED|TCPF_CLOSE_WAIT) &&
2395 			    tcp_ack_scheduled(tp)) {
2396 				tp->ack.pending |= TCP_ACK_PUSHED;
2397 				cleanup_rbuf(sk, 1);
2398 				if (!(val & 1))
2399 					tp->ack.pingpong = 1;
2400 			}
2401 		}
2402 		break;
2403 
2404 	default:
2405 		err = -ENOPROTOOPT;
2406 		break;
2407 	};
2408 	release_sock(sk);
2409 	return err;
2410 }
2411 
2412 int tcp_getsockopt(struct sock *sk, int level, int optname, char *optval,
2413 		   int *optlen)
2414 {
2415 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
2416 	int val, len;
2417 
2418 	if(level != SOL_TCP)
2419 		return tp->af_specific->getsockopt(sk, level, optname,
2420 						   optval, optlen);
2421 
2422 	if(get_user(len,optlen))
2423 		return -EFAULT;
2424 
2425 	len = min_t(unsigned int, len, sizeof(int));
2426 
2427 	if(len < 0)
2428 		return -EINVAL;
2429 
2430 	switch(optname) {
2431 	case TCP_MAXSEG:
2432 		val = tp->mss_cache;
2433 		if (val == 0 && ((1<<sk->state)&(TCPF_CLOSE|TCPF_LISTEN)))
2434 			val = tp->user_mss;
2435 		break;
2436 	case TCP_NODELAY:
2437 		val = (tp->nonagle == 1);
2438 		break;
2439 	case TCP_CORK:
2440 		val = (tp->nonagle == 2);
2441 		break;
2442 	case TCP_KEEPIDLE:
2443 		val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time)/HZ;
2444 		break;
2445 	case TCP_KEEPINTVL:
2446 		val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl)/HZ;
2447 		break;
2448 	case TCP_KEEPCNT:
2449 		val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes;
2450 		break;
2451 	case TCP_SYNCNT:
2452 		val = tp->syn_retries ? : sysctl_tcp_syn_retries;
2453 		break;
2454 	case TCP_LINGER2:
2455 		val = tp->linger2;
2456 		if (val >= 0)
2457 			val = (val ? : sysctl_tcp_fin_timeout)/HZ;
2458 		break;
2459 	case TCP_DEFER_ACCEPT:
2460 		val = tp->defer_accept == 0 ? 0 : ((TCP_TIMEOUT_INIT/HZ)<<(tp->defer_accept-1));
2461 		break;
2462 	case TCP_WINDOW_CLAMP:
2463 		val = tp->window_clamp;
2464 		break;
2465 	case TCP_INFO:
2466 	{
2467 		struct tcp_info info;
2468 		u32 now = tcp_time_stamp;
2469 
2470 		if(get_user(len,optlen))
2471 			return -EFAULT;
2472 		info.tcpi_state = sk->state;
2473 		info.tcpi_ca_state = tp->ca_state;
2474 		info.tcpi_retransmits = tp->retransmits;
2475 		info.tcpi_probes = tp->probes_out;
2476 		info.tcpi_backoff = tp->backoff;
2477 		info.tcpi_options = 0;
2478 		if (tp->tstamp_ok)
2479 			info.tcpi_options |= TCPI_OPT_TIMESTAMPS;
2480 		if (tp->sack_ok)
2481 			info.tcpi_options |= TCPI_OPT_SACK;
2482 		if (tp->wscale_ok) {
2483 			info.tcpi_options |= TCPI_OPT_WSCALE;
2484 			info.tcpi_snd_wscale = tp->snd_wscale;
2485 			info.tcpi_rcv_wscale = tp->rcv_wscale;
2486 		} else {
2487 			info.tcpi_snd_wscale = 0;
2488 			info.tcpi_rcv_wscale = 0;
2489 		}
2490 		if (tp->ecn_flags&TCP_ECN_OK)
2491 			info.tcpi_options |= TCPI_OPT_ECN;
2492 
2493 		info.tcpi_rto = (1000000*tp->rto)/HZ;
2494 		info.tcpi_ato = (1000000*tp->ack.ato)/HZ;
2495 		info.tcpi_snd_mss = tp->mss_cache;
2496 		info.tcpi_rcv_mss = tp->ack.rcv_mss;
2497 
2498 		info.tcpi_unacked = tp->packets_out;
2499 		info.tcpi_sacked = tp->sacked_out;
2500 		info.tcpi_lost = tp->lost_out;
2501 		info.tcpi_retrans = tp->retrans_out;
2502 		info.tcpi_fackets = tp->fackets_out;
2503 
2504 		info.tcpi_last_data_sent = ((now - tp->lsndtime)*1000)/HZ;
2505 		info.tcpi_last_ack_sent = 0;
2506 		info.tcpi_last_data_recv = ((now - tp->ack.lrcvtime)*1000)/HZ;
2507 		info.tcpi_last_ack_recv = ((now - tp->rcv_tstamp)*1000)/HZ;
2508 
2509 		info.tcpi_pmtu = tp->pmtu_cookie;
2510 		info.tcpi_rcv_ssthresh = tp->rcv_ssthresh;
2511 		info.tcpi_rtt = ((1000000*tp->srtt)/HZ)>>3;
2512 		info.tcpi_rttvar = ((1000000*tp->mdev)/HZ)>>2;
2513 		info.tcpi_snd_ssthresh = tp->snd_ssthresh;
2514 		info.tcpi_snd_cwnd = tp->snd_cwnd;
2515 		info.tcpi_advmss = tp->advmss;
2516 		info.tcpi_reordering = tp->reordering;
2517 
2518 		len = min_t(unsigned int, len, sizeof(info));
2519 		if(put_user(len, optlen))
2520 			return -EFAULT;
2521 		if(copy_to_user(optval, &info,len))
2522 			return -EFAULT;
2523 		return 0;
2524 	}
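
	/*
	 * User-space sketch (illustrative only): reading the snapshot filled
	 * in above. struct tcp_info and TCP_INFO come from <linux/tcp.h> (or
	 * the C library's <netinet/tcp.h>); tcpi_rtt is reported in
	 * microseconds. Error handling is minimal.
	 *
	 *	struct tcp_info info;
	 *	socklen_t len = sizeof(info);
	 *
	 *	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
	 *		printf("rtt %u us, cwnd %u\n",
	 *		       info.tcpi_rtt, info.tcpi_snd_cwnd);
	 */
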
2525 	case TCP_QUICKACK:
2526 		val = !tp->ack.pingpong;
2527 		break;
2528 	default:
2529 		return -ENOPROTOOPT;
2530 	};
2531 
2532   	if(put_user(len, optlen))
2533   		return -EFAULT;
2534 	if(copy_to_user(optval, &val,len))
2535 		return -EFAULT;
2536   	return 0;
2537 }
2538 
2539 
2540 extern void __skb_cb_too_small_for_tcp(int, int);
2541 extern void tcpdiag_init(void);
2542 
2543 void __init tcp_init(void)
2544 {
2545 	struct sk_buff *skb = NULL;
2546 	unsigned long goal;
2547 	int order, i;
2548 
2549 	if(sizeof(struct tcp_skb_cb) > sizeof(skb->cb))
2550 		__skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb),
2551 					   sizeof(skb->cb));
2552 
2553 	tcp_openreq_cachep = kmem_cache_create("tcp_open_request",
2554 						   sizeof(struct open_request),
2555 					       0, SLAB_HWCACHE_ALIGN,
2556 					       NULL, NULL);
2557 	if(!tcp_openreq_cachep)
2558 		panic("tcp_init: Cannot alloc open_request cache.");
2559 
2560 	tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket",
2561 					      sizeof(struct tcp_bind_bucket),
2562 					      0, SLAB_HWCACHE_ALIGN,
2563 					      NULL, NULL);
2564 	if(!tcp_bucket_cachep)
2565 		panic("tcp_init: Cannot alloc tcp_bind_bucket cache.");
2566 
2567 	tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket",
2568 						sizeof(struct tcp_tw_bucket),
2569 						0, SLAB_HWCACHE_ALIGN,
2570 						NULL, NULL);
2571 	if(!tcp_timewait_cachep)
2572 		panic("tcp_init: Cannot alloc tcp_tw_bucket cache.");
2573 
2574 	/* Size and allocate the main established and bind bucket
2575 	 * hash tables.
2576 	 *
2577 	 * The methodology is similar to that of the buffer cache.
2578 	 */
2579 	if (num_physpages >= (128 * 1024))
2580 		goal = num_physpages >> (21 - PAGE_SHIFT);
2581 	else
2582 		goal = num_physpages >> (23 - PAGE_SHIFT);
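
	/*
	 * Worked example (assuming 4 KB pages, PAGE_SHIFT == 12): a 256 MB
	 * machine has num_physpages == 65536, which is below 128 * 1024, so
	 * goal = 65536 >> (23 - 12) = 32 pages. The loop below then picks
	 * order = 5, the first order for which 1UL << order is not smaller
	 * than the goal.
	 */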
2583 
2584 	for(order = 0; (1UL << order) < goal; order++)
2585 		;
2586 	do {
2587 		tcp_ehash_size = (1UL << order) * PAGE_SIZE /
2588 			sizeof(struct tcp_ehash_bucket);
2589 		tcp_ehash_size >>= 1;
2590 		while (tcp_ehash_size & (tcp_ehash_size-1))
2591 			tcp_ehash_size--;
2592 		tcp_ehash = (struct tcp_ehash_bucket *)
2593 			__get_free_pages(GFP_ATOMIC, order);
2594 	} while (tcp_ehash == NULL && --order > 0);
2595 
2596 	if (!tcp_ehash)
2597 		panic("Failed to allocate TCP established hash table\n");
2598 	for (i = 0; i < (tcp_ehash_size<<1); i++) {
2599 		tcp_ehash[i].lock = RW_LOCK_UNLOCKED;
2600 		tcp_ehash[i].chain = NULL;
2601 	}
2602 
2603 	do {
2604 		tcp_bhash_size = (1UL << order) * PAGE_SIZE /
2605 			sizeof(struct tcp_bind_hashbucket);
2606 		if ((tcp_bhash_size > (64 * 1024)) && order > 0)
2607 			continue;
2608 		tcp_bhash = (struct tcp_bind_hashbucket *)
2609 			__get_free_pages(GFP_ATOMIC, order);
2610 	} while (tcp_bhash == NULL && --order >= 0);
2611 
2612 	if (!tcp_bhash)
2613 		panic("Failed to allocate TCP bind hash table\n");
2614 	for (i = 0; i < tcp_bhash_size; i++) {
2615 		tcp_bhash[i].lock = SPIN_LOCK_UNLOCKED;
2616 		tcp_bhash[i].chain = NULL;
2617 	}
2618 
2619 	/* Try to be a bit smarter and adjust defaults depending
2620 	 * on available memory.
2621 	 */
2622 	if (order > 4) {
2623 		sysctl_local_port_range[0] = 32768;
2624 		sysctl_local_port_range[1] = 61000;
2625 		sysctl_tcp_max_tw_buckets = 180000;
2626 		sysctl_tcp_max_orphans = 4096<<(order-4);
2627 		sysctl_max_syn_backlog = 1024;
2628 	} else if (order < 3) {
2629 		sysctl_local_port_range[0] = 1024*(3-order);
2630 		sysctl_tcp_max_tw_buckets >>= (3-order);
2631 		sysctl_tcp_max_orphans >>= (3-order);
2632 		sysctl_max_syn_backlog = 128;
2633 	}
2634 	tcp_port_rover = sysctl_local_port_range[0] - 1;
2635 
2636 	sysctl_tcp_mem[0] = 768<<order;
2637 	sysctl_tcp_mem[1] = 1024<<order;
2638 	sysctl_tcp_mem[2] = 1536<<order;
2639 
2640 	if (order < 3) {
2641 		sysctl_tcp_wmem[2] = 64*1024;
2642 		sysctl_tcp_rmem[0] = PAGE_SIZE;
2643 		sysctl_tcp_rmem[1] = 43689;
2644 		sysctl_tcp_rmem[2] = 2*43689;
2645 	}
2646 
2647 	printk(KERN_INFO "TCP: Hash tables configured (established %d bind %d)\n",
2648 	       tcp_ehash_size<<1, tcp_bhash_size);
2649 
2650 	(void) tcp_mib_init();
2651 	tcpdiag_init();
2652 }
2653