/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Version:	$Id: sock.c,v 1.116 2001/11/08 04:20:06 davem Exp $
 *
 * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/fcntl.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/ipsec.h>

#ifdef CONFIG_FILTER
#include <linux/filter.h>
#endif

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
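
/*
 * Worked example (illustrative): if sizeof(struct sk_buff) is about
 * 192 bytes on a given platform, _SK_MEM_OVERHEAD is about 448 bytes
 * and SK_WMEM_MAX/SK_RMEM_MAX come out near 448 * 256 = ~112 Kbytes.
 * Folding sizeof(struct sk_buff) in means the default limits always
 * hold roughly _SK_MEM_PACKETS packets' worth of data plus their
 * bookkeeping, whatever the per-platform sk_buff size happens to be.
 */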

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max = SK_WMEM_MAX;
__u32 sysctl_rmem_max = SK_RMEM_MAX;
__u32 sysctl_wmem_default = SK_WMEM_MAX;
__u32 sysctl_rmem_default = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);

static int sock_set_timeout(long *timeo_p, char *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;

	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
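
/*
 * Worked example (illustrative): with HZ == 100, a timeval of
 * { tv_sec = 1, tv_usec = 500000 } converts to
 *
 *	1*100 + (500000 + 9999)/10000 = 150 jiffies
 *
 * The (1000000/HZ - 1) term rounds partial ticks up, so even
 * tv_usec == 1 still costs a full jiffy.  A zero timeval means
 * "no timeout" and leaves *timeo_p at MAX_SCHEDULE_TIMEOUT.
 */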

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char *optval, int optlen)
{
	struct sock *sk=sock->sk;
#ifdef CONFIG_FILTER
	struct sk_filter *filter;
#endif
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

#ifdef SO_DONTLINGER		/* Compatibility item... */
	switch(optname)
	{
		case SO_DONTLINGER:
			sk->linger=0;
			return 0;
	}
#endif

	if(optlen<sizeof(int))
		return(-EINVAL);

	if (get_user(val, (int *)optval))
		return -EFAULT;

	valbool = val?1:0;

	lock_sock(sk);

	switch(optname)
	{
		case SO_DEBUG:
			if(val && !capable(CAP_NET_ADMIN))
			{
				ret = -EACCES;
			}
			else
				sk->debug=valbool;
			break;
		case SO_REUSEADDR:
			sk->reuse = valbool;
			break;
		case SO_TYPE:
		case SO_ERROR:
			ret = -ENOPROTOOPT;
			break;
		case SO_DONTROUTE:
			sk->localroute=valbool;
			break;
		case SO_BROADCAST:
			sk->broadcast=valbool;
			break;
		case SO_SNDBUF:
			/* Don't error on this; BSD doesn't, and if you
			   think about it this is right. Otherwise apps
			   have to play 'guess the biggest size' games.
			   RCVBUF/SNDBUF are treated in BSD as hints */

			if (val > sysctl_wmem_max)
				val = sysctl_wmem_max;

			sk->userlocks |= SOCK_SNDBUF_LOCK;
			if ((val * 2) < SOCK_MIN_SNDBUF)
				sk->sndbuf = SOCK_MIN_SNDBUF;
			else
				sk->sndbuf = (val * 2);

			/*
			 *	Wake up sending tasks if we
			 *	upped the value.
			 */
			sk->write_space(sk);
			break;

		case SO_RCVBUF:
			/* Don't error on this; BSD doesn't, and if you
			   think about it this is right. Otherwise apps
			   have to play 'guess the biggest size' games.
			   RCVBUF/SNDBUF are treated in BSD as hints */

			if (val > sysctl_rmem_max)
				val = sysctl_rmem_max;

			sk->userlocks |= SOCK_RCVBUF_LOCK;
			/* FIXME: is this lower bound the right one? */
			if ((val * 2) < SOCK_MIN_RCVBUF)
				sk->rcvbuf = SOCK_MIN_RCVBUF;
			else
				sk->rcvbuf = (val * 2);
			break;

		case SO_KEEPALIVE:
#ifdef CONFIG_INET
			if (sk->protocol == IPPROTO_TCP)
			{
				tcp_set_keepalive(sk, valbool);
			}
#endif
			sk->keepopen = valbool;
			break;

		case SO_OOBINLINE:
			sk->urginline = valbool;
			break;

		case SO_NO_CHECK:
			sk->no_check = valbool;
			break;

		case SO_PRIORITY:
			if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
				sk->priority = val;
			else
				ret = -EPERM;
			break;

		case SO_LINGER:
			if(optlen<sizeof(ling)) {
				ret = -EINVAL;	/* 1003.1g */
				break;
			}
			if (copy_from_user(&ling,optval,sizeof(ling))) {
				ret = -EFAULT;
				break;
			}
			if(ling.l_onoff==0) {
				sk->linger=0;
			} else {
#if (BITS_PER_LONG == 32)
				if (ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
					sk->lingertime=MAX_SCHEDULE_TIMEOUT;
				else
#endif
					sk->lingertime=ling.l_linger*HZ;
				sk->linger=1;
			}
			break;

		case SO_BSDCOMPAT:
			sk->bsdism = valbool;
			break;

		case SO_PASSCRED:
			sock->passcred = valbool;
			break;

		case SO_TIMESTAMP:
			sk->rcvtstamp = valbool;
			break;

		case SO_RCVLOWAT:
			if (val < 0)
				val = INT_MAX;
			sk->rcvlowat = val ? : 1;
			break;

		case SO_RCVTIMEO:
			ret = sock_set_timeout(&sk->rcvtimeo, optval, optlen);
			break;

		case SO_SNDTIMEO:
			ret = sock_set_timeout(&sk->sndtimeo, optval, optlen);
			break;

#ifdef CONFIG_NETDEVICES
		case SO_BINDTODEVICE:
		{
			char devname[IFNAMSIZ];

			/* Sorry... */
			if (!capable(CAP_NET_RAW)) {
				ret = -EPERM;
				break;
			}

			/* Bind this socket to a particular device like "eth0",
			 * as specified in the passed interface name. If the
			 * name is "" or the option length is zero the socket
			 * is not bound.
			 */

			if (!valbool) {
				sk->bound_dev_if = 0;
			} else {
				if (optlen > IFNAMSIZ)
					optlen = IFNAMSIZ;
				if (copy_from_user(devname, optval, optlen)) {
					ret = -EFAULT;
					break;
				}

				/* Remove any cached route for this socket. */
				sk_dst_reset(sk);

				if (devname[0] == '\0') {
					sk->bound_dev_if = 0;
				} else {
					struct net_device *dev = dev_get_by_name(devname);
					if (!dev) {
						ret = -ENODEV;
						break;
					}
					sk->bound_dev_if = dev->ifindex;
					dev_put(dev);
				}
			}
			break;
		}
#endif

#ifdef CONFIG_FILTER
		case SO_ATTACH_FILTER:
			ret = -EINVAL;
			if (optlen == sizeof(struct sock_fprog)) {
				struct sock_fprog fprog;

				ret = -EFAULT;
				if (copy_from_user(&fprog, optval, sizeof(fprog)))
					break;

				ret = sk_attach_filter(&fprog, sk);
			}
			break;

		case SO_DETACH_FILTER:
			spin_lock_bh(&sk->lock.slock);
			filter = sk->filter;
			if (filter) {
				sk->filter = NULL;
				spin_unlock_bh(&sk->lock.slock);
				sk_filter_release(sk, filter);
				break;
			}
			spin_unlock_bh(&sk->lock.slock);
			/* No filter was attached.  Note this reports
			 * -ENONET, not the -ENOENT one might expect. */
			ret = -ENONET;
			break;
#endif
		/* We implement the SO_SNDLOWAT etc to
		   not be settable (1003.1g 5.3) */
		default:
			ret = -ENOPROTOOPT;
			break;
	}
	release_sock(sk);
	return ret;
}
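
/*
 * Usage sketch (illustrative): from user space,
 *
 *	int val = 65536;
 *	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val));
 *
 * arrives here with optname == SO_SNDBUF.  The value is clamped to
 * sysctl_wmem_max and then doubled before being stored in sk->sndbuf;
 * the doubling leaves headroom for sk_buff overhead, which is also why
 * getsockopt(SO_SNDBUF) reports back twice the requested size.
 */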

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char *optval, int *optlen)
{
	struct sock *sk = sock->sk;

	union
	{
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	unsigned int lv = sizeof(int);
	int len;		/* signed, so the < 0 check below is live */

	if(get_user(len,optlen))
		return -EFAULT;
	if(len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch(optname)
	{
		case SO_DEBUG:
			v.val = sk->debug;
			break;

		case SO_DONTROUTE:
			v.val = sk->localroute;
			break;

		case SO_BROADCAST:
			v.val = sk->broadcast;
			break;

		case SO_SNDBUF:
			v.val = sk->sndbuf;
			break;

		case SO_RCVBUF:
			v.val = sk->rcvbuf;
			break;

		case SO_REUSEADDR:
			v.val = sk->reuse;
			break;

		case SO_KEEPALIVE:
			v.val = sk->keepopen;
			break;

		case SO_TYPE:
			v.val = sk->type;
			break;

		case SO_ERROR:
			v.val = -sock_error(sk);
			if(v.val==0)
				v.val = xchg(&sk->err_soft,0);
			break;

		case SO_OOBINLINE:
			v.val = sk->urginline;
			break;

		case SO_NO_CHECK:
			v.val = sk->no_check;
			break;

		case SO_PRIORITY:
			v.val = sk->priority;
			break;

		case SO_LINGER:
			lv = sizeof(v.ling);
			v.ling.l_onoff = sk->linger;
			v.ling.l_linger = sk->lingertime/HZ;
			break;

		case SO_BSDCOMPAT:
			v.val = sk->bsdism;
			break;

		case SO_TIMESTAMP:
			v.val = sk->rcvtstamp;
			break;

		case SO_RCVTIMEO:
			lv = sizeof(struct timeval);
			if (sk->rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
				v.tm.tv_sec = 0;
				v.tm.tv_usec = 0;
			} else {
				v.tm.tv_sec = sk->rcvtimeo/HZ;
				v.tm.tv_usec = ((sk->rcvtimeo%HZ)*1000000)/HZ;
			}
			break;

		case SO_SNDTIMEO:
			lv = sizeof(struct timeval);
			if (sk->sndtimeo == MAX_SCHEDULE_TIMEOUT) {
				v.tm.tv_sec = 0;
				v.tm.tv_usec = 0;
			} else {
				v.tm.tv_sec = sk->sndtimeo/HZ;
				v.tm.tv_usec = ((sk->sndtimeo%HZ)*1000000)/HZ;
			}
			break;

		case SO_RCVLOWAT:
			v.val = sk->rcvlowat;
			break;

		case SO_SNDLOWAT:
			v.val = 1;
			break;

		case SO_PASSCRED:
			v.val = sock->passcred;
			break;

		case SO_PEERCRED:
			if (len > sizeof(sk->peercred))
				len = sizeof(sk->peercred);
			if (copy_to_user(optval, &sk->peercred, len))
				return -EFAULT;
			goto lenout;

		case SO_PEERNAME:
		{
			char address[128];

			if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
				return -ENOTCONN;
			if (lv < len)
				return -EINVAL;
			if(copy_to_user((void*)optval, address, len))
				return -EFAULT;
			goto lenout;
		}

		/* Dubious BSD thing... Probably nobody even uses it, but
		 * the UNIX standard wants it for whatever reason... -DaveM
		 */
		case SO_ACCEPTCONN:
			v.val = (sk->state == TCP_LISTEN);
			break;

		default:
			return(-ENOPROTOOPT);
	}
	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
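
/*
 * Usage sketch (illustrative): a caller fetching SO_LINGER with an
 * oversized buffer,
 *
 *	char buf[64];
 *	int len = sizeof(buf);
 *	getsockopt(fd, SOL_SOCKET, SO_LINGER, buf, &len);
 *
 * has len clamped down to lv == sizeof(struct linger) above, with the
 * clamped length written back through optlen.  A caller asking for
 * fewer bytes than lv simply gets a truncated copy, not an error.
 */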

static kmem_cache_t *sk_cachep;

/*
 *	All socket objects are allocated here. This is for future
 *	usage.
 */

struct sock *sk_alloc(int family, int priority, int zero_it)
{
	struct sock *sk = kmem_cache_alloc(sk_cachep, priority);

	if(sk && zero_it) {
		memset(sk, 0, sizeof(struct sock));
		sk->family = family;
		sock_lock_init(sk);
	}

	return sk;
}

void sk_free(struct sock *sk)
{
#ifdef CONFIG_FILTER
	struct sk_filter *filter;
#endif

	if (sk->destruct)
		sk->destruct(sk);

#ifdef CONFIG_FILTER
	filter = sk->filter;
	if (filter) {
		sk_filter_release(sk, filter);
		sk->filter = NULL;
	}
#endif

	if (atomic_read(&sk->omem_alloc))
		printk(KERN_DEBUG "sk_free: optmem leakage (%d bytes) detected.\n",
		       atomic_read(&sk->omem_alloc));

	kmem_cache_free(sk_cachep, sk);
}
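
/*
 * Note (illustrative): callers normally do not invoke sk_free()
 * directly.  They drop their reference with sock_put(), which calls
 * sk_free() once sk->refcnt reaches zero:
 *
 *	sock_hold(sk);		take an extra reference
 *	...
 *	sock_put(sk);		drop it; the final put frees the sock
 */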

void __init sk_init(void)
{
	sk_cachep = kmem_cache_create("sock", sizeof(struct sock), 0,
				      SLAB_HWCACHE_ALIGN, 0, 0);
	if (!sk_cachep)
		printk(KERN_CRIT "sk_init: Cannot create sock SLAB cache!");

	if (num_physpages <= 4096) {
		/* <= 16MB of RAM (with 4K pages): shrink the defaults. */
		sysctl_wmem_max = 32767;
		sysctl_rmem_max = 32767;
		sysctl_wmem_default = 32767;
		sysctl_rmem_default = 32767;
	} else if (num_physpages >= 131072) {
		/* >= 512MB (with 4K pages): allow larger maximums. */
		sysctl_wmem_max = 131071;
		sysctl_rmem_max = 131071;
	}
}

/*
 *	Simple resource managers for sockets.
 */

/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	/* In case it might be waiting for more memory. */
	atomic_sub(skb->truesize, &sk->wmem_alloc);
	if (!sk->use_write_queue)
		sk->write_space(sk);
	sock_put(sk);
}

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->rmem_alloc);
}

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority)
{
	if (force || atomic_read(&sk->wmem_alloc) < sk->sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority)
{
	if (force || atomic_read(&sk->rmem_alloc) < sk->rcvbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_r(skb, sk);
			return skb;
		}
	}
	return NULL;
}
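
/*
 * Usage sketch (illustrative): a protocol building an outgoing packet
 * might charge it to the send buffer like this:
 *
 *	struct sk_buff *skb = sock_wmalloc(sk, len, 0, GFP_KERNEL);
 *	if (skb == NULL)
 *		return -ENOBUFS;
 *
 * With force == 0 the allocation is refused once sk->wmem_alloc has
 * reached sk->sndbuf; force == 1 bypasses that limit for traffic that
 * must not fail.  The returned skb is already charged to the socket
 * via skb_set_owner_w(), and sock_wfree() undoes the charge when the
 * skb is freed.
 */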

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, int priority)
{
	if ((unsigned)size <= sysctl_optmem_max &&
	    atomic_read(&sk->omem_alloc)+size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->omem_alloc);
	}
	return NULL;
}

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->omem_alloc);
}
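
/*
 * Usage sketch (illustrative): option memory must be freed with the
 * same size it was charged with, since only the caller tracks it:
 *
 *	struct ip_options *opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *	if (opt == NULL)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, optlen);
 *
 * Passing a different size to sock_kfree_s() would corrupt the
 * sk->omem_alloc accounting that sk_free() checks for leaks.
 */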

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DECLARE_WAITQUEUE(wait, current);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
	add_wait_queue(sk->sleep, &wait);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->socket->flags);
		set_current_state(TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->wmem_alloc) < sk->sndbuf)
			break;
		if (sk->shutdown & SEND_SHUTDOWN)
			break;
		if (sk->err)
			break;
		timeo = schedule_timeout(timeo);
	}
	__set_current_state(TASK_RUNNING);
	remove_wait_queue(sk->sleep, &wait);
	return timeo;
}

/*
 *	Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock, int *errcode)
{
	struct sk_buff *skb;
	long timeo;
	int err;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->wmem_alloc) < sk->sndbuf) {
			skb = alloc_skb(header_len, sk->allocation);
			if (skb) {
				int npages;
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;
					skb_frag_t *frag;

					page = alloc_pages(sk->allocation, 0);
					if (!page) {
						err = -ENOBUFS;
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					frag = &skb_shinfo(skb)->frags[i];
					frag->page = page;
					frag->page_offset = 0;
					frag->size = (data_len >= PAGE_SIZE ?
						      PAGE_SIZE :
						      data_len);
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);
		set_bit(SOCK_NOSPACE, &sk->socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
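
/*
 * Usage sketch (illustrative): a UDP-style sender typically uses the
 * linear wrapper and lets it block until send space frees up:
 *
 *	skb = sock_alloc_send_skb(sk, length + hh_len + 15,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (skb == NULL)
 *		goto out;
 *
 * (length, hh_len, msg and err are the caller's locals here.)  On
 * failure *errcode holds -EPIPE, -ENOBUFS, -EAGAIN or the value from
 * sock_intr_errno(), so the caller can simply propagate err.
 */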

void __lock_sock(struct sock *sk)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue_exclusive(&sk->lock.wq, &wait);
	for(;;) {
		current->state = TASK_UNINTERRUPTIBLE;
		spin_unlock_bh(&sk->lock.slock);
		schedule();
		spin_lock_bh(&sk->lock.slock);
		if(!sk->lock.users)
			break;
	}
	current->state = TASK_RUNNING;
	remove_wait_queue(&sk->lock.wq, &wait);
}

void __release_sock(struct sock *sk)
{
	struct sk_buff *skb = sk->backlog.head;

	do {
		sk->backlog.head = sk->backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			skb->next = NULL;
			sk->backlog_rcv(sk, skb);
			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while((skb = sk->backlog.head) != NULL);
}
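
/*
 * Sketch of the locking pattern these helpers back up (illustrative):
 *
 *	lock_sock(sk);		user context takes ownership; packets
 *				arriving in softirq context now queue
 *				to sk->backlog instead of being handled
 *	... modify socket state ...
 *	release_sock(sk);	runs __release_sock() to replay the
 *				backlog through sk->backlog_rcv()
 *
 * __lock_sock() is the slow path lock_sock() falls into when another
 * user-context holder already owns the socket.
 */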

/*
 *	Generic socket manager library. Most simpler socket families
 *	use this to manage their socket lists. At some point we should
 *	hash these. By making this generic we get the lot hashed for free.
 *
 *	It is broken by design. All the protocols using it must be fixed. --ANK
 */

rwlock_t net_big_sklist_lock = RW_LOCK_UNLOCKED;

void sklist_remove_socket(struct sock **list, struct sock *sk)
{
	struct sock *s;

	write_lock_bh(&net_big_sklist_lock);

	while ((s = *list) != NULL) {
		if (s == sk) {
			*list = s->next;
			break;
		}
		list = &s->next;
	}

	write_unlock_bh(&net_big_sklist_lock);
	if (s)
		sock_put(s);
}

void sklist_insert_socket(struct sock **list, struct sock *sk)
{
	write_lock_bh(&net_big_sklist_lock);
	sk->next = *list;
	*list = sk;
	sock_hold(sk);
	write_unlock_bh(&net_big_sklist_lock);
}

/*
 *	This is only called from user mode. Thus it protects itself against
 *	interrupt users but doesn't worry about being called during work.
 *	Once it is removed from the queue no interrupt or bottom half will
 *	touch it and we are (fairly 8-) ) safe.
 */

void sklist_destroy_socket(struct sock **list, struct sock *sk);

/*
 *	Handler for deferred kills.
 */

static void sklist_destroy_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	sklist_destroy_socket(NULL, sk);
}

/*
 *	Destroy a socket. We pass NULL for a list if we know the
 *	socket is not on a list.
 */

void sklist_destroy_socket(struct sock **list, struct sock *sk)
{
	if(list)
		sklist_remove_socket(list, sk);

	skb_queue_purge(&sk->receive_queue);

	if(atomic_read(&sk->wmem_alloc) == 0 &&
	   atomic_read(&sk->rmem_alloc) == 0 &&
	   sk->dead)
	{
		sock_put(sk);
	}
	else
	{
		/*
		 *	Someone is using our buffers still.. defer
		 */
		init_timer(&sk->timer);
		sk->timer.expires = jiffies + SOCK_DESTROY_TIME;
		sk->timer.function = sklist_destroy_timer;
		sk->timer.data = (unsigned long)sk;
		add_timer(&sk->timer);
	}
}

/*
 *	Set of default routines for initialising struct proto_ops when
 *	the protocol does not support a particular function. In certain
 *	cases where it makes no sense for a protocol to have a "do nothing"
 *	function, some default processing is provided.
 */

int sock_no_release(struct socket *sock)
{
	return 0;
}

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		       char *optval, int optlen)
{
	return -EOPNOTSUPP;
}

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		       char *optval, int *optlen)
{
	return -EOPNOTSUPP;
}

/*
 * Note: if you add something that sleeps here then change sock_fcntl()
 *       to do proper fd locking.
 */
int sock_no_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch(cmd)
	{
		case F_SETOWN:
			/*
			 * This is a little restrictive, but it's the only
			 * way to make sure that you can't send a sigurg to
			 * another process.
			 */
			if (current->pgrp != -arg &&
			    current->pid != arg &&
			    !capable(CAP_KILL)) return(-EPERM);
			sk->proc = arg;
			return(0);
		case F_GETOWN:
			return(sk->proc);
		default:
			return(-EINVAL);
	}
}

int sock_no_sendmsg(struct socket *sock, struct msghdr *m, int flags,
		    struct scm_cookie *scm)
{
	return -EOPNOTSUPP;
}

int sock_no_recvmsg(struct socket *sock, struct msghdr *m, int len, int flags,
		    struct scm_cookie *scm)
{
	return -EOPNOTSUPP;
}

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg;
	struct iovec iov;
	mm_segment_t old_fs;
	char *kaddr;

	kaddr = kmap(page);

	msg.msg_name = NULL;
	msg.msg_namelen = 0;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = NULL;
	msg.msg_controllen = 0;
	msg.msg_flags = flags;

	iov.iov_base = kaddr + offset;
	iov.iov_len = size;

	old_fs = get_fs();
	set_fs(KERNEL_DS);
	res = sock_sendmsg(sock, &msg, size);
	set_fs(old_fs);

	kunmap(page);
	return res;
}
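
/*
 * Usage sketch (illustrative, hypothetical family): a minimal protocol
 * wires the sock_no_* stubs into its proto_ops for everything it does
 * not implement, e.g.:
 *
 *	static struct proto_ops example_ops = {
 *		family:		PF_EXAMPLE,
 *		release:	example_release,
 *		bind:		sock_no_bind,
 *		connect:	sock_no_connect,
 *		socketpair:	sock_no_socketpair,
 *		accept:		sock_no_accept,
 *		getname:	sock_no_getname,
 *		poll:		sock_no_poll,
 *		ioctl:		sock_no_ioctl,
 *		listen:		sock_no_listen,
 *		shutdown:	sock_no_shutdown,
 *		setsockopt:	sock_no_setsockopt,
 *		getsockopt:	sock_no_getsockopt,
 *		sendmsg:	example_sendmsg,
 *		recvmsg:	example_recvmsg,
 *		mmap:		sock_no_mmap,
 *		sendpage:	sock_no_sendpage,
 *	};
 *
 * PF_EXAMPLE and the example_* handlers are placeholders, not real
 * kernel symbols.
 */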

/*
 *	Default Socket Callbacks
 */

void sock_def_wakeup(struct sock *sk)
{
	read_lock(&sk->callback_lock);
	if (sk->sleep && waitqueue_active(sk->sleep))
		wake_up_interruptible_all(sk->sleep);
	read_unlock(&sk->callback_lock);
}

void sock_def_error_report(struct sock *sk)
{
	read_lock(&sk->callback_lock);
	if (sk->sleep && waitqueue_active(sk->sleep))
		wake_up_interruptible(sk->sleep);
	sk_wake_async(sk, 0, POLL_ERR);
	read_unlock(&sk->callback_lock);
}

void sock_def_readable(struct sock *sk, int len)
{
	read_lock(&sk->callback_lock);
	if (sk->sleep && waitqueue_active(sk->sleep))
		wake_up_interruptible(sk->sleep);
	sk_wake_async(sk, 1, POLL_IN);
	read_unlock(&sk->callback_lock);
}
sock_def_write_space(struct sock * sk)1156 void sock_def_write_space(struct sock *sk)
1157 {
1158 read_lock(&sk->callback_lock);
1159
1160 /* Do not wake up a writer until he can make "significant"
1161 * progress. --DaveM
1162 */
1163 if((atomic_read(&sk->wmem_alloc) << 1) <= sk->sndbuf) {
1164 if (sk->sleep && waitqueue_active(sk->sleep))
1165 wake_up_interruptible(sk->sleep);
1166
1167 /* Should agree with poll, otherwise some programs break */
1168 if (sock_writeable(sk))
1169 sk_wake_async(sk, 2, POLL_OUT);
1170 }
1171
1172 read_unlock(&sk->callback_lock);
1173 }
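
/*
 * Worked example (illustrative): with sk->sndbuf == 65536, a blocked
 * writer is only woken once sk->wmem_alloc has drained to 32768 bytes
 * or less (wmem_alloc << 1 <= sndbuf), i.e. at least half the send
 * buffer is free.  Waking earlier would let the writer thrash on
 * every freed skb without room for significant progress.
 */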

void sock_def_destruct(struct sock *sk)
{
	if (sk->protinfo.destruct_hook)
		kfree(sk->protinfo.destruct_hook);
}

void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->receive_queue);
	skb_queue_head_init(&sk->write_queue);
	skb_queue_head_init(&sk->error_queue);

	init_timer(&sk->timer);

	sk->allocation	= GFP_KERNEL;
	sk->rcvbuf	= sysctl_rmem_default;
	sk->sndbuf	= sysctl_wmem_default;
	sk->state	= TCP_CLOSE;
	sk->zapped	= 1;
	sk->socket	= sock;

	if(sock)
	{
		sk->type	= sock->type;
		sk->sleep	= &sock->wait;
		sock->sk	= sk;
	} else
		sk->sleep	= NULL;

	sk->dst_lock		= RW_LOCK_UNLOCKED;
	sk->callback_lock	= RW_LOCK_UNLOCKED;

	sk->state_change	= sock_def_wakeup;
	sk->data_ready		= sock_def_readable;
	sk->write_space		= sock_def_write_space;
	sk->error_report	= sock_def_error_report;
	sk->destruct		= sock_def_destruct;

	sk->peercred.pid	= 0;
	sk->peercred.uid	= -1;
	sk->peercred.gid	= -1;
	sk->rcvlowat		= 1;
	sk->rcvtimeo		= MAX_SCHEDULE_TIMEOUT;
	sk->sndtimeo		= MAX_SCHEDULE_TIMEOUT;

	atomic_set(&sk->refcnt, 1);
}
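
/*
 * Usage sketch (illustrative, hypothetical protocol): a family's
 * create() routine typically pairs sk_alloc() with sock_init_data()
 * and then overrides whichever defaults it needs:
 *
 *	static int example_create(struct socket *sock, int protocol)
 *	{
 *		struct sock *sk = sk_alloc(PF_EXAMPLE, GFP_KERNEL, 1);
 *		if (sk == NULL)
 *			return -ENOMEM;
 *		sock->ops = &example_ops;
 *		sock_init_data(sock, sk);
 *		sk->protocol = protocol;
 *		sk->destruct = example_destruct;  (replaces the default)
 *		return 0;
 *	}
 *
 * PF_EXAMPLE, example_ops and example_destruct are placeholders, not
 * real kernel symbols.
 */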