/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Version:	$Id: sock.c,v 1.116 2001/11/08 04:20:06 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *	Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/ipsec.h>

#ifdef CONFIG_FILTER
#include <linux/filter.h>
#endif

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
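
/* A worked example (a sketch only; the real figure depends on the
 * platform's sizeof(struct sk_buff)): if struct sk_buff were about 160
 * bytes, _SK_MEM_OVERHEAD would be roughly 416 bytes and SK_WMEM_MAX /
 * SK_RMEM_MAX would come out near 416 * 256 = 106496 bytes, i.e. room
 * for about 256 queued packets regardless of per-platform differences
 * in sk_buff size.
 */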

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max = SK_WMEM_MAX;
__u32 sysctl_rmem_max = SK_RMEM_MAX;
__u32 sysctl_wmem_default = SK_WMEM_MAX;
__u32 sysctl_rmem_default = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);

static int sock_set_timeout(long *timeo_p, char *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;

	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
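
/* For example (assuming HZ = 100, so one jiffy is 10 ms): a user timeout
 * of { tv_sec = 2, tv_usec = 500000 } becomes 2*100 + (500000+9999)/10000
 * = 250 jiffies. A zero timeval means "no timeout" and maps to
 * MAX_SCHEDULE_TIMEOUT. The usec part is rounded up, so a nonzero timeout
 * never truncates to fewer jiffies than requested.
 */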

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char *optval, int optlen)
{
	struct sock *sk=sock->sk;
#ifdef CONFIG_FILTER
	struct sk_filter *filter;
#endif
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

#ifdef SO_DONTLINGER		/* Compatibility item... */
	switch(optname)
	{
		case SO_DONTLINGER:
			sk->linger=0;
			return 0;
	}
#endif

	if(optlen<sizeof(int))
		return(-EINVAL);

	if (get_user(val, (int *)optval))
		return -EFAULT;

	valbool = val?1:0;

	lock_sock(sk);

	switch(optname)
	{
		case SO_DEBUG:
			if(val && !capable(CAP_NET_ADMIN))
			{
				ret = -EACCES;
			}
			else
				sk->debug=valbool;
			break;
		case SO_REUSEADDR:
			sk->reuse = valbool;
			break;
		case SO_TYPE:
		case SO_ERROR:
			ret = -ENOPROTOOPT;
			break;
		case SO_DONTROUTE:
			sk->localroute=valbool;
			break;
		case SO_BROADCAST:
			sk->broadcast=valbool;
			break;
		case SO_SNDBUF:
			/* Don't return an error here; BSD doesn't, and if
			   you think about it this is right. Otherwise apps
			   have to play 'guess the biggest size' games.
			   RCVBUF/SNDBUF are treated in BSD as hints */

			if (val > sysctl_wmem_max)
				val = sysctl_wmem_max;

			sk->userlocks |= SOCK_SNDBUF_LOCK;
			if ((val * 2) < SOCK_MIN_SNDBUF)
				sk->sndbuf = SOCK_MIN_SNDBUF;
			else
				sk->sndbuf = (val * 2);

			/*
			 *	Wake up sending tasks if we
			 *	upped the value.
			 */
			sk->write_space(sk);
			break;

		case SO_RCVBUF:
			/* Don't return an error here; BSD doesn't, and if
			   you think about it this is right. Otherwise apps
			   have to play 'guess the biggest size' games.
			   RCVBUF/SNDBUF are treated in BSD as hints */

			if (val > sysctl_rmem_max)
				val = sysctl_rmem_max;

			sk->userlocks |= SOCK_RCVBUF_LOCK;
			/* FIXME: is this lower bound the right one? */
			if ((val * 2) < SOCK_MIN_RCVBUF)
				sk->rcvbuf = SOCK_MIN_RCVBUF;
			else
				sk->rcvbuf = (val * 2);
			break;

		case SO_KEEPALIVE:
#ifdef CONFIG_INET
			if (sk->protocol == IPPROTO_TCP)
			{
				tcp_set_keepalive(sk, valbool);
			}
#endif
			sk->keepopen = valbool;
			break;

		case SO_OOBINLINE:
			sk->urginline = valbool;
			break;

		case SO_NO_CHECK:
			sk->no_check = valbool;
			break;

		case SO_PRIORITY:
			if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
				sk->priority = val;
			else
				ret = -EPERM;
			break;

		case SO_LINGER:
			if(optlen<sizeof(ling)) {
				ret = -EINVAL;	/* 1003.1g */
				break;
			}
			if (copy_from_user(&ling,optval,sizeof(ling))) {
				ret = -EFAULT;
				break;
			}
			if(ling.l_onoff==0) {
				sk->linger=0;
			} else {
#if (BITS_PER_LONG == 32)
				if (ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
					sk->lingertime=MAX_SCHEDULE_TIMEOUT;
				else
#endif
					sk->lingertime=ling.l_linger*HZ;
				sk->linger=1;
			}
			break;

		case SO_BSDCOMPAT:
			sk->bsdism = valbool;
			break;

		case SO_PASSCRED:
			sock->passcred = valbool;
			break;

		case SO_TIMESTAMP:
			sk->rcvtstamp = valbool;
			break;

		case SO_RCVLOWAT:
			if (val < 0)
				val = INT_MAX;
			sk->rcvlowat = val ? : 1;
			break;

		case SO_RCVTIMEO:
			ret = sock_set_timeout(&sk->rcvtimeo, optval, optlen);
			break;

		case SO_SNDTIMEO:
			ret = sock_set_timeout(&sk->sndtimeo, optval, optlen);
			break;

#ifdef CONFIG_NETDEVICES
		case SO_BINDTODEVICE:
		{
			char devname[IFNAMSIZ];

			/* Sorry... */
			if (!capable(CAP_NET_RAW)) {
				ret = -EPERM;
				break;
			}

			/* Bind this socket to a particular device like "eth0",
			 * as specified in the passed interface name. If the
			 * name is "" or the option length is zero the socket
			 * is not bound.
			 */

			if (!valbool) {
				sk->bound_dev_if = 0;
			} else {
				if (optlen > IFNAMSIZ)
					optlen = IFNAMSIZ;
				if (copy_from_user(devname, optval, optlen)) {
					ret = -EFAULT;
					break;
				}

				/* Remove any cached route for this socket. */
				sk_dst_reset(sk);

				if (devname[0] == '\0') {
					sk->bound_dev_if = 0;
				} else {
					struct net_device *dev = dev_get_by_name(devname);
					if (!dev) {
						ret = -ENODEV;
						break;
					}
					sk->bound_dev_if = dev->ifindex;
					dev_put(dev);
				}
			}
			break;
		}
#endif


#ifdef CONFIG_FILTER
		case SO_ATTACH_FILTER:
			ret = -EINVAL;
			if (optlen == sizeof(struct sock_fprog)) {
				struct sock_fprog fprog;

				ret = -EFAULT;
				if (copy_from_user(&fprog, optval, sizeof(fprog)))
					break;

				ret = sk_attach_filter(&fprog, sk);
			}
			break;

		case SO_DETACH_FILTER:
			spin_lock_bh(&sk->lock.slock);
			filter = sk->filter;
			if (filter) {
				sk->filter = NULL;
				spin_unlock_bh(&sk->lock.slock);
				sk_filter_release(sk, filter);
				break;
			}
			spin_unlock_bh(&sk->lock.slock);
			ret = -ENONET;
			break;
#endif
		/* We implement the SO_SNDLOWAT etc to
		   not be settable (1003.1g 5.3) */
		default:
			ret = -ENOPROTOOPT;
			break;
	}
	release_sock(sk);
	return ret;
}
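
/* A user-space sketch of the SO_SNDBUF doubling above (hypothetical
 * values; not part of this file): after
 *
 *	int val = 65536;
 *	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val));
 *
 * sk->sndbuf holds 2 * 65536 (val is first capped at sysctl_wmem_max),
 * so a later getsockopt(SO_SNDBUF) reports twice the requested size.
 * The doubling leaves room for the sk_buff bookkeeping overhead that is
 * charged against the buffer along with the payload.
 */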


int sock_getsockopt(struct socket *sock, int level, int optname,
		    char *optval, int *optlen)
{
	struct sock *sk = sock->sk;

	union
	{
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	unsigned int lv=sizeof(int);
	int len;

	if(get_user(len,optlen))
		return -EFAULT;
	if(len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch(optname)
	{
		case SO_DEBUG:
			v.val = sk->debug;
			break;

		case SO_DONTROUTE:
			v.val = sk->localroute;
			break;

		case SO_BROADCAST:
			v.val = sk->broadcast;
			break;

		case SO_SNDBUF:
			v.val = sk->sndbuf;
			break;

		case SO_RCVBUF:
			v.val = sk->rcvbuf;
			break;

		case SO_REUSEADDR:
			v.val = sk->reuse;
			break;

		case SO_KEEPALIVE:
			v.val = sk->keepopen;
			break;

		case SO_TYPE:
			v.val = sk->type;
			break;

		case SO_ERROR:
			v.val = -sock_error(sk);
			if(v.val==0)
				v.val=xchg(&sk->err_soft,0);
			break;

		case SO_OOBINLINE:
			v.val = sk->urginline;
			break;

		case SO_NO_CHECK:
			v.val = sk->no_check;
			break;

		case SO_PRIORITY:
			v.val = sk->priority;
			break;

		case SO_LINGER:
			lv=sizeof(v.ling);
			v.ling.l_onoff=sk->linger;
			v.ling.l_linger=sk->lingertime/HZ;
			break;

		case SO_BSDCOMPAT:
			v.val = sk->bsdism;
			break;

		case SO_TIMESTAMP:
			v.val = sk->rcvtstamp;
			break;

		case SO_RCVTIMEO:
			lv=sizeof(struct timeval);
			if (sk->rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
				v.tm.tv_sec = 0;
				v.tm.tv_usec = 0;
			} else {
				v.tm.tv_sec = sk->rcvtimeo/HZ;
				v.tm.tv_usec = ((sk->rcvtimeo%HZ)*1000000)/HZ;
			}
			break;

		case SO_SNDTIMEO:
			lv=sizeof(struct timeval);
			if (sk->sndtimeo == MAX_SCHEDULE_TIMEOUT) {
				v.tm.tv_sec = 0;
				v.tm.tv_usec = 0;
			} else {
				v.tm.tv_sec = sk->sndtimeo/HZ;
				v.tm.tv_usec = ((sk->sndtimeo%HZ)*1000000)/HZ;
			}
			break;

		case SO_RCVLOWAT:
			v.val = sk->rcvlowat;
			break;

		case SO_SNDLOWAT:
			v.val = 1;
			break;

		case SO_PASSCRED:
			v.val = sock->passcred;
			break;

		case SO_PEERCRED:
			if (len > sizeof(sk->peercred))
				len = sizeof(sk->peercred);
			if (copy_to_user(optval, &sk->peercred, len))
				return -EFAULT;
			goto lenout;

		case SO_PEERNAME:
		{
			char address[128];

			if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
				return -ENOTCONN;
			if (lv < len)
				return -EINVAL;
			if(copy_to_user((void*)optval, address, len))
				return -EFAULT;
			goto lenout;
		}

		/* Dubious BSD thing... Probably nobody even uses it, but
		 * the UNIX standard wants it for whatever reason... -DaveM
		 */
		case SO_ACCEPTCONN:
			v.val = (sk->state == TCP_LISTEN);
			break;

		default:
			return(-ENOPROTOOPT);
	}
	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
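
/* Note on SO_ERROR above: sock_error() xchg's sk->err to zero and
 * returns it negated, so reading the option also clears it and user
 * space sees a positive errno exactly once. E.g. a
 * getsockopt(fd, SOL_SOCKET, SO_ERROR, ...) after a failed non-blocking
 * connect() yields something like ECONNREFUSED, and a second call
 * yields 0 (or whatever soft error was latched in sk->err_soft).
 */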

static kmem_cache_t *sk_cachep;

/*
 *	All socket objects are allocated here. This is for future
 *	usage.
 */

struct sock *sk_alloc(int family, int priority, int zero_it)
{
	struct sock *sk = kmem_cache_alloc(sk_cachep, priority);

	if(sk && zero_it) {
		memset(sk, 0, sizeof(struct sock));
		sk->family = family;
		sock_lock_init(sk);
	}

	return sk;
}
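
/* Typical call from a protocol's create routine (a sketch; the details
 * vary by address family):
 *
 *	sk = sk_alloc(PF_INET, GFP_KERNEL, 1);
 *	if (sk == NULL)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *
 * With zero_it set, the struct is cleared and the socket lock
 * initialised here, so callers only fill in protocol-specific state.
 */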

void sk_free(struct sock *sk)
{
#ifdef CONFIG_FILTER
	struct sk_filter *filter;
#endif

	if (sk->destruct)
		sk->destruct(sk);

#ifdef CONFIG_FILTER
	filter = sk->filter;
	if (filter) {
		sk_filter_release(sk, filter);
		sk->filter = NULL;
	}
#endif

	if (atomic_read(&sk->omem_alloc))
		printk(KERN_DEBUG "sk_free: optmem leakage (%d bytes) detected.\n", atomic_read(&sk->omem_alloc));

	kmem_cache_free(sk_cachep, sk);
}

void __init sk_init(void)
{
	sk_cachep = kmem_cache_create("sock", sizeof(struct sock), 0,
				      SLAB_HWCACHE_ALIGN, 0, 0);
	if (!sk_cachep)
		printk(KERN_CRIT "sk_init: Cannot create sock SLAB cache!");

	if (num_physpages <= 4096) {
		sysctl_wmem_max = 32767;
		sysctl_rmem_max = 32767;
		sysctl_wmem_default = 32767;
		sysctl_rmem_default = 32767;
	} else if (num_physpages >= 131072) {
		sysctl_wmem_max = 131071;
		sysctl_rmem_max = 131071;
	}
}

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	/* In case it might be waiting for more memory. */
	atomic_sub(skb->truesize, &sk->wmem_alloc);
	if (!sk->use_write_queue)
		sk->write_space(sk);
	sock_put(sk);
}
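
/* sock_wfree() undoes skb_set_owner_w(), which charged skb->truesize to
 * sk->wmem_alloc and took a reference with sock_hold(); the sock_put()
 * here drops that reference, so the socket cannot be freed while any of
 * its write buffers are still in flight.
 */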

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->rmem_alloc);
}

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority)
{
	if (force || atomic_read(&sk->wmem_alloc) < sk->sndbuf) {
		struct sk_buff * skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority)
{
	if (force || atomic_read(&sk->rmem_alloc) < sk->rcvbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_r(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, int priority)
{
	if ((unsigned)size <= sysctl_optmem_max &&
	    atomic_read(&sk->omem_alloc)+size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->omem_alloc);
	}
	return NULL;
}

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->omem_alloc);
}
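
/* sock_kmalloc()/sock_kfree_s() must be used as a matched pair with the
 * same size, since it is the caller (not a destructor) that keeps
 * sk->omem_alloc balanced. A sketch of the usual pattern:
 *
 *	opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (opt == NULL)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 *
 * The leak check in sk_free() reports any bytes still charged here.
 */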

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock * sk, long timeo)
{
	DECLARE_WAITQUEUE(wait, current);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
	add_wait_queue(sk->sleep, &wait);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->socket->flags);
		set_current_state(TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->wmem_alloc) < sk->sndbuf)
			break;
		if (sk->shutdown & SEND_SHUTDOWN)
			break;
		if (sk->err)
			break;
		timeo = schedule_timeout(timeo);
	}
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(sk->sleep, &wait);
	return timeo;
}


/*
 *	Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock, int *errcode)
{
	struct sk_buff *skb;
	long timeo;
	int err;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->wmem_alloc) < sk->sndbuf) {
			skb = alloc_skb(header_len, sk->allocation);
			if (skb) {
				int npages;
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;
					skb_frag_t *frag;

					page = alloc_pages(sk->allocation, 0);
					if (!page) {
						err = -ENOBUFS;
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					frag = &skb_shinfo(skb)->frags[i];
					frag->page = page;
					frag->page_offset = 0;
					frag->size = (data_len >= PAGE_SIZE ?
						      PAGE_SIZE :
						      data_len);
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
		set_bit(SOCK_NOSPACE, &sk->socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
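
/* A datagram protocol's sendmsg path would typically use the linear-only
 * wrapper, along the lines of (a sketch; error handling elided, hh_len a
 * hypothetical hardware-header reservation):
 *
 *	skb = sock_alloc_send_skb(sk, len + hh_len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (skb == NULL)
 *		return err;
 *
 * The pskb variant additionally hangs data_len bytes off page fragments,
 * so a large datagram does not require one big contiguous allocation.
 */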

void __lock_sock(struct sock *sk)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue_exclusive(&sk->lock.wq, &wait);
	for(;;) {
		current->state = TASK_UNINTERRUPTIBLE;
		spin_unlock_bh(&sk->lock.slock);
		schedule();
		spin_lock_bh(&sk->lock.slock);
		if(!sk->lock.users)
			break;
	}
	current->state = TASK_RUNNING;
	remove_wait_queue(&sk->lock.wq, &wait);
}

void __release_sock(struct sock *sk)
{
	struct sk_buff *skb = sk->backlog.head;

	do {
		sk->backlog.head = sk->backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			skb->next = NULL;
			sk->backlog_rcv(sk, skb);
			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while((skb = sk->backlog.head) != NULL);
}
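
/* __release_sock() drains packets that bottom halves queued on
 * sk->backlog while a process held the socket lock: the head/tail
 * pointers are detached under bh_lock_sock() so softirqs can keep
 * appending, then each skb is fed to sk->backlog_rcv() with the
 * spinlock dropped; the outer loop repeats until no new backlog has
 * accumulated in the meantime.
 */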

/*
 *	Generic socket manager library. Most of the simpler socket families
 *	use this to manage their socket lists. At some point we should
 *	hash these. By making this generic we get the lot hashed for free.
 *
 *	It is broken by design. All the protocols using it must be fixed. --ANK
 */

rwlock_t net_big_sklist_lock = RW_LOCK_UNLOCKED;

void sklist_remove_socket(struct sock **list, struct sock *sk)
{
	struct sock *s;

	write_lock_bh(&net_big_sklist_lock);

	while ((s = *list) != NULL) {
		if (s == sk) {
			*list = s->next;
			break;
		}
		list = &s->next;
	}

	write_unlock_bh(&net_big_sklist_lock);
	if (s)
		sock_put(s);
}

void sklist_insert_socket(struct sock **list, struct sock *sk)
{
	write_lock_bh(&net_big_sklist_lock);
	sk->next = *list;
	*list = sk;
	sock_hold(sk);
	write_unlock_bh(&net_big_sklist_lock);
}

/*
 *	This is only called from user mode. Thus it protects itself against
 *	interrupt users but doesn't worry about being called during work.
 *	Once it is removed from the queue no interrupt or bottom half will
 *	touch it and we are (fairly 8-) ) safe.
 */

void sklist_destroy_socket(struct sock **list, struct sock *sk);

/*
 *	Handler for deferred kills.
 */

static void sklist_destroy_timer(unsigned long data)
{
	struct sock *sk=(struct sock *)data;
	sklist_destroy_socket(NULL,sk);
}

/*
 *	Destroy a socket. We pass NULL for a list if we know the
 *	socket is not on a list.
 */

void sklist_destroy_socket(struct sock **list,struct sock *sk)
{
	if(list)
		sklist_remove_socket(list, sk);

	skb_queue_purge(&sk->receive_queue);

	if(atomic_read(&sk->wmem_alloc) == 0 &&
	   atomic_read(&sk->rmem_alloc) == 0 &&
	   sk->dead)
	{
		sock_put(sk);
	}
	else
	{
		/*
		 *	Someone is using our buffers still.. defer
		 */
		init_timer(&sk->timer);
		sk->timer.expires=jiffies+SOCK_DESTROY_TIME;
		sk->timer.function=sklist_destroy_timer;
		sk->timer.data = (unsigned long)sk;
		add_timer(&sk->timer);
	}
}

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_release(struct socket *sock)
{
	return 0;
}

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}

unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
{
	return 0;
}

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		    char *optval, int optlen)
{
	return -EOPNOTSUPP;
}

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		    char *optval, int *optlen)
{
	return -EOPNOTSUPP;
}

/*
 * Note: if you add something that sleeps here then change sock_fcntl()
 *       to do proper fd locking.
 */
int sock_no_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch(cmd)
	{
		case F_SETOWN:
			/*
			 * This is a little restrictive, but it's the only
			 * way to make sure that you can't send a sigurg to
			 * another process.
			 */
			if (current->pgrp != -arg &&
				current->pid != arg &&
				!capable(CAP_KILL)) return(-EPERM);
			sk->proc = arg;
			return(0);
		case F_GETOWN:
			return(sk->proc);
		default:
			return(-EINVAL);
	}
}

int sock_no_sendmsg(struct socket *sock, struct msghdr *m, int flags,
		    struct scm_cookie *scm)
{
	return -EOPNOTSUPP;
}

int sock_no_recvmsg(struct socket *sock, struct msghdr *m, int len, int flags,
		    struct scm_cookie *scm)
{
	return -EOPNOTSUPP;
}

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg;
	struct iovec iov;
	mm_segment_t old_fs;
	char *kaddr;

	kaddr = kmap(page);

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	msg.msg_flags = flags;

	iov.iov_base = kaddr + offset;
	iov.iov_len = size;

	old_fs = get_fs();
	set_fs(KERNEL_DS);
	res = sock_sendmsg(sock, &msg, size);
	set_fs(old_fs);

	kunmap(page);
	return res;
}
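
/* The set_fs(KERNEL_DS) bracket above lets sock_sendmsg() accept an
 * iovec that points into the kernel mapping of the page rather than
 * user space. This generic fallback is what protocols without a
 * zero-copy ->sendpage of their own get, at the cost of a kmap and an
 * ordinary copying send.
 */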

/*
 *	Default Socket Callbacks
 */

void sock_def_wakeup(struct sock *sk)
{
	read_lock(&sk->callback_lock);
	if (sk->sleep && waitqueue_active(sk->sleep))
		wake_up_interruptible_all(sk->sleep);
	read_unlock(&sk->callback_lock);
}

void sock_def_error_report(struct sock *sk)
{
	read_lock(&sk->callback_lock);
	if (sk->sleep && waitqueue_active(sk->sleep))
		wake_up_interruptible(sk->sleep);
	sk_wake_async(sk,0,POLL_ERR);
	read_unlock(&sk->callback_lock);
}

void sock_def_readable(struct sock *sk, int len)
{
	read_lock(&sk->callback_lock);
	if (sk->sleep && waitqueue_active(sk->sleep))
		wake_up_interruptible(sk->sleep);
	sk_wake_async(sk,1,POLL_IN);
	read_unlock(&sk->callback_lock);
}

void sock_def_write_space(struct sock *sk)
{
	read_lock(&sk->callback_lock);

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if((atomic_read(&sk->wmem_alloc) << 1) <= sk->sndbuf) {
		if (sk->sleep && waitqueue_active(sk->sleep))
			wake_up_interruptible(sk->sleep);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, 2, POLL_OUT);
	}

	read_unlock(&sk->callback_lock);
}

void sock_def_destruct(struct sock *sk)
{
	if (sk->protinfo.destruct_hook)
		kfree(sk->protinfo.destruct_hook);
}

void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->receive_queue);
	skb_queue_head_init(&sk->write_queue);
	skb_queue_head_init(&sk->error_queue);

	init_timer(&sk->timer);

	sk->allocation	=	GFP_KERNEL;
	sk->rcvbuf	=	sysctl_rmem_default;
	sk->sndbuf	=	sysctl_wmem_default;
	sk->state	=	TCP_CLOSE;
	sk->zapped	=	1;
	sk->socket	=	sock;

	if(sock)
	{
		sk->type	=	sock->type;
		sk->sleep	=	&sock->wait;
		sock->sk	=	sk;
	} else
		sk->sleep	=	NULL;

	sk->dst_lock		=	RW_LOCK_UNLOCKED;
	sk->callback_lock	=	RW_LOCK_UNLOCKED;

	sk->state_change	=	sock_def_wakeup;
	sk->data_ready		=	sock_def_readable;
	sk->write_space		=	sock_def_write_space;
	sk->error_report	=	sock_def_error_report;
	sk->destruct		=	sock_def_destruct;

	sk->peercred.pid	=	0;
	sk->peercred.uid	=	-1;
	sk->peercred.gid	=	-1;
	sk->rcvlowat		=	1;
	sk->rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	atomic_set(&sk->refcnt, 1);
}
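
/* The sock_def_write_space() heuristic above means a blocked writer is
 * only woken once at least half the send buffer has drained: with a
 * hypothetical 64 KB sk->sndbuf, wmem_alloc must fall to 32 KB or below
 * before poll()/write() are told there is space. This batches wakeups
 * instead of waking the writer for every freed skb.
 */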