1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Definitions for the AF_INET socket handler.
7  *
8  * Version:	@(#)sock.h	1.0.4	05/13/93
9  *
10  * Authors:	Ross Biro, <bir7@leland.Stanford.Edu>
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Corey Minyard <wf-rch!minyard@relay.EU.net>
13  *		Florian La Roche <flla@stud.uni-sb.de>
14  *
15  * Fixes:
16  *		Alan Cox	:	Volatiles in skbuff pointers. See
17  *					skbuff comments. May be overdone,
18  *					better to prove they can be removed
19  *					than the reverse.
20  *		Alan Cox	:	Added a zapped field for tcp to note
21  *					a socket is reset and must stay shut up
22  *		Alan Cox	:	New fields for options
23  *	Pauline Middelink	:	identd support
24  *		Alan Cox	:	Eliminate low level recv/recvfrom
25  *		David S. Miller	:	New socket lookup architecture.
26  *              Steve Whitehouse:       Default routines for sock_ops
27  *
28  *		This program is free software; you can redistribute it and/or
29  *		modify it under the terms of the GNU General Public License
30  *		as published by the Free Software Foundation; either version
31  *		2 of the License, or (at your option) any later version.
32  */
33 #ifndef _SOCK_H
34 #define _SOCK_H
35 
36 #include <linux/config.h>
37 #include <linux/timer.h>
38 #include <linux/cache.h>
39 #include <linux/in.h>		/* struct sockaddr_in */
40 
41 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
42 #include <linux/in6.h>		/* struct sockaddr_in6 */
43 #include <linux/ipv6.h>		/* dest_cache, inet6_options */
44 #include <linux/icmpv6.h>
45 #include <net/if_inet6.h>	/* struct ipv6_mc_socklist */
46 #endif
47 
48 #if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE)
49 #include <linux/icmp.h>
50 #endif
51 #include <linux/tcp.h>		/* struct tcphdr */
52 #if defined(CONFIG_IP_SCTP) || defined (CONFIG_IP_SCTP_MODULE)
53 #include <net/sctp/structs.h>	/* struct sctp_opt */
54 #endif
55 
56 #include <linux/netdevice.h>
57 #include <linux/skbuff.h>	/* struct sk_buff */
58 #include <net/protocol.h>		/* struct inet_protocol */
59 #if defined(CONFIG_X25) || defined(CONFIG_X25_MODULE)
60 #include <net/x25.h>
61 #endif
62 #if defined(CONFIG_WAN_ROUTER) || defined(CONFIG_WAN_ROUTER_MODULE)
63 #include <linux/if_wanpipe.h>
64 #endif
65 
66 #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
67 #include <net/ax25.h>
68 #if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
69 #include <net/netrom.h>
70 #endif
71 #if defined(CONFIG_ROSE) || defined(CONFIG_ROSE_MODULE)
72 #include <net/rose.h>
73 #endif
74 #endif
75 
76 #if defined(CONFIG_PPPOE) || defined(CONFIG_PPPOE_MODULE)
77 #include <linux/if_pppox.h>
78 #include <linux/ppp_channel.h>   /* struct ppp_channel */
79 #endif
80 
81 #if defined(CONFIG_IPX) || defined(CONFIG_IPX_MODULE)
82 #if defined(CONFIG_SPX) || defined(CONFIG_SPX_MODULE)
83 #include <net/spx.h>
84 #else
85 #include <net/ipx.h>
86 #endif /* CONFIG_SPX */
87 #endif /* CONFIG_IPX */
88 
89 #if defined(CONFIG_ATALK) || defined(CONFIG_ATALK_MODULE)
90 #include <linux/atalk.h>
91 #endif
92 
93 #if defined(CONFIG_DECNET) || defined(CONFIG_DECNET_MODULE)
94 #include <net/dn.h>
95 #endif
96 
97 #if defined(CONFIG_IRDA) || defined(CONFIG_IRDA_MODULE)
98 #include <net/irda/irda.h>
99 #endif
100 
101 #if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE)
102 struct atm_vcc;
103 #endif
104 
105 #ifdef CONFIG_FILTER
106 #include <linux/filter.h>
107 #endif
108 
109 #include <asm/atomic.h>
110 #include <net/dst.h>
111 
112 
113 /* The AF_UNIX specific socket options */
114 struct unix_opt {
115 	struct unix_address	*addr;
116 	struct dentry *		dentry;
117 	struct vfsmount *	mnt;
118 	struct semaphore	readsem;
119 	struct sock *		other;
120 	struct sock **		list;
121 	struct sock *		gc_tree;
122 	atomic_t		inflight;
123 	rwlock_t		lock;
124 	wait_queue_head_t	peer_wait;
125 };
126 
127 
128 /* Once the IPX ncpd patches are in these are going into protinfo. */
129 #if defined(CONFIG_IPX) || defined(CONFIG_IPX_MODULE)
130 struct ipx_opt {
131 	ipx_address		dest_addr;
132 	ipx_interface		*intrfc;
133 	unsigned short		port;
134 #ifdef CONFIG_IPX_INTERN
135 	unsigned char           node[IPX_NODE_LEN];
136 #endif
137 	unsigned short		type;
138 /*
139  * To handle special ncp connection-handling sockets for mars_nwe,
140  * the connection number must be stored in the socket.
141  */
142 	unsigned short		ipx_ncp_conn;
143 };
144 #endif
145 
146 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
147 struct ipv6_pinfo {
148 	struct in6_addr 	saddr;
149 	struct in6_addr 	rcv_saddr;
150 	struct in6_addr		daddr;
151 	struct in6_addr		*daddr_cache;
152 
153 	__u32			flow_label;
154 	__u32			frag_size;
155 	int			hop_limit;
156 	int			mcast_hops;
157 	int			mcast_oif;
158 
159 	/* pktoption flags */
160 	union {
161 		struct {
162 			__u8	srcrt:2,
163 			        rxinfo:1,
164 				rxhlim:1,
165 				hopopts:1,
166 				dstopts:1,
167                                 authhdr:1,
168                                 rxflow:1;
169 		} bits;
170 		__u8		all;
171 	} rxopt;
172 
173 	/* sockopt flags */
174 	__u8			mc_loop:1,
175 	                        recverr:1,
176 	                        sndflow:1,
177 	                        pmtudisc:2,
178 				ipv6only:1;
179 
180 	struct ipv6_mc_socklist	*ipv6_mc_list;
181 	struct ipv6_ac_socklist	*ipv6_ac_list;
182 	struct ipv6_fl_socklist *ipv6_fl_list;
183 	__u32			dst_cookie;
184 
185 	struct ipv6_txoptions	*opt;
186 	struct sk_buff		*pktoptions;
187 };
188 
189 struct raw6_opt {
190 	__u32			checksum;	/* perform checksum */
191 	__u32			offset;		/* checksum offset  */
192 
193 	struct icmp6_filter	filter;
194 };
195 
196 #define __ipv6_only_sock(sk)	((sk)->net_pinfo.af_inet6.ipv6only)
197 #define ipv6_only_sock(sk)	((sk)->family == PF_INET6 && \
198 				 (sk)->net_pinfo.af_inet6.ipv6only)
199 #else
200 #define __ipv6_only_sock(sk)	0
201 #define ipv6_only_sock(sk)	0
202 #endif /* IPV6 */
203 
204 #if defined(CONFIG_INET) || defined(CONFIG_INET_MODULE)
205 struct raw_opt {
206 	struct icmp_filter	filter;
207 };
208 #endif
209 
210 #if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE)
211 struct inet_opt
212 {
213 	int			ttl;			/* TTL setting */
214 	int			tos;			/* TOS */
215 	unsigned	   	cmsg_flags;
216 	struct ip_options	*opt;
217 	unsigned char		hdrincl;		/* Include headers ? */
218 	__u8			mc_ttl;			/* Multicasting TTL */
219 	__u8			mc_loop;		/* Loopback */
220 	unsigned		recverr : 1,
221 				freebind : 1;
222 	__u16			id;			/* ID counter for DF pkts */
223 	__u8			pmtudisc;
224 	int			mc_index;		/* Multicast device index */
225 	__u32			mc_addr;
226 	struct ip_mc_socklist	*mc_list;		/* Group array */
227 };
228 #endif
229 
230 #if defined(CONFIG_PPPOE) || defined (CONFIG_PPPOE_MODULE)
231 struct pppoe_opt
232 {
233 	struct net_device      *dev;	  /* device associated with socket*/
234 	struct pppoe_addr	pa;	  /* what this socket is bound to*/
235 	struct sockaddr_pppox	relay;	  /* what socket data will be
236 					     relayed to (PPPoE relaying) */
237 };
238 
239 struct pppox_opt
240 {
241 	struct ppp_channel	chan;
242 	struct sock		*sk;
243 	struct pppox_opt	*next;	  /* for hash table */
244 	union {
245 		struct pppoe_opt pppoe;
246 	} proto;
247 };
248 #define pppoe_dev	proto.pppoe.dev
249 #define pppoe_pa	proto.pppoe.pa
250 #define pppoe_relay	proto.pppoe.relay
251 #endif
252 
253 /* This defines a selective acknowledgement block. */
254 struct tcp_sack_block {
255 	__u32	start_seq;
256 	__u32	end_seq;
257 };
258 
259 enum tcp_congestion_algo {
260  	TCP_RENO=0,
261  	TCP_VEGAS,
262  	TCP_WESTWOOD,
263  	TCP_BIC,
264 };
265 
266 struct tcp_opt {
267 	int	tcp_header_len;	/* Bytes of tcp header to send		*/
268 
269 /*
270  *	Header prediction flags
271  *	0x5?10 << 16 + snd_wnd in net byte order
272  */
273 	__u32	pred_flags;
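	/*
	 * Editor's sketch (not part of the original header): the fast path
	 * is switched on by packing the expected header into pred_flags,
	 * roughly as __tcp_fast_path_on() in net/tcp.h does:
	 *
	 *	pred_flags = htonl((tcp_header_len << 26) |
	 *			   ntohl(TCP_FLAG_ACK) |
	 *			   snd_wnd);
	 */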
274 
275 /*
276  *	RFC793 variables by their proper names. This means you can
277  *	read the code and the spec side by side (and laugh ...)
278  *	See RFC793 and RFC1122. The RFC writes these in capitals.
279  */
280  	__u32	rcv_nxt;	/* What we want to receive next 	*/
281  	__u32	snd_nxt;	/* Next sequence we send		*/
282 
283  	__u32	snd_una;	/* First byte we want an ack for	*/
284  	__u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
285 	__u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
286 	__u32	lsndtime;	/* timestamp of last sent data packet (for restart window) */
287 
288 	/* Delayed ACK control data */
289 	struct {
290 		__u8	pending;	/* ACK is pending */
291 		__u8	quick;		/* Scheduled number of quick acks	*/
292 		__u8	pingpong;	/* The session is interactive		*/
293 		__u8	blocked;	/* Delayed ACK was blocked by socket lock*/
294 		__u32	ato;		/* Predicted tick of soft clock		*/
295 		unsigned long timeout;	/* Currently scheduled timeout		*/
296 		__u32	lrcvtime;	/* timestamp of last received data packet*/
297 		__u16	last_seg_size;	/* Size of last incoming segment	*/
298 		__u16	rcv_mss;	/* MSS used for delayed ACK decisions	*/
299 	} ack;
300 
301 	/* Data for direct copy to user */
302 	struct {
303 		struct sk_buff_head	prequeue;
304 		struct task_struct	*task;
305 		struct iovec		*iov;
306 		int			memory;
307 		int			len;
308 	} ucopy;
309 
310 	__u32	snd_wl1;	/* Sequence for window update		*/
311 	__u32	snd_wnd;	/* The window we expect to receive	*/
312 	__u32	max_window;	/* Maximal window ever seen from peer	*/
313 	__u32	pmtu_cookie;	/* Last pmtu seen by socket		*/
314 	__u16	mss_cache;	/* Cached effective mss, not including SACKS */
315 	__u16	mss_clamp;	/* Maximal mss, negotiated at connection setup */
316 	__u16	ext_header_len;	/* Network protocol overhead (IP/IPv6 options) */
317 	__u8	ca_state;	/* State of fast-retransmit machine 	*/
318 	__u8	retransmits;	/* Number of unrecovered RTO timeouts.	*/
319 
320 	__u8	reordering;	/* Packet reordering metric.		*/
321 	__u8	queue_shrunk;	/* Write queue has been shrunk recently.*/
322 	__u8	defer_accept;	/* User waits for some data after accept() */
323 
324 /* RTT measurement */
325 	__u8	backoff;	/* backoff				*/
326 	__u32	srtt;		/* smoothed round trip time << 3	*/
327 	__u32	mdev;		/* medium deviation			*/
328 	__u32	mdev_max;	/* maximal mdev for the last rtt period	*/
329 	__u32	rttvar;		/* smoothed mdev_max			*/
330 	__u32	rtt_seq;	/* sequence number to update rttvar	*/
331 	__u32	rto;		/* retransmit timeout			*/
332 
333 	__u32	packets_out;	/* Packets which are "in flight"	*/
334 	__u32	left_out;	/* Packets which have left the network	*/
335 	__u32	retrans_out;	/* Retransmitted packets out		*/
336 
337 
338 /*
339  *	Slow start and congestion control (see also Nagle, and Karn & Partridge)
340  */
341  	__u32	snd_ssthresh;	/* Slow start size threshold		*/
342  	__u32	snd_cwnd;	/* Sending congestion window		*/
343  	__u16	snd_cwnd_cnt;	/* Linear increase counter		*/
344 	__u16	snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
345 	__u32	snd_cwnd_used;
346 	__u32	snd_cwnd_stamp;
347 
348 	/* Two commonly used timers in both sender and receiver paths. */
349 	unsigned long		timeout;
350  	struct timer_list	retransmit_timer;	/* Resend (no ack)	*/
351  	struct timer_list	delack_timer;		/* Ack delay 		*/
352 
353 	struct sk_buff_head	out_of_order_queue; /* Out of order segments go here */
354 
355 	struct tcp_func		*af_specific;	/* Operations which are AF_INET{4,6} specific	*/
356 	struct sk_buff		*send_head;	/* Front of stuff to transmit			*/
357 	struct page		*sndmsg_page;	/* Cached page for sendmsg			*/
358 	u32			sndmsg_off;	/* Cached offset for sendmsg			*/
359 
360  	__u32	rcv_wnd;	/* Current receiver window		*/
361 	__u32	rcv_wup;	/* rcv_nxt on last window update sent	*/
362 	__u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
363 	__u32	pushed_seq;	/* Last pushed seq, required to talk to Windows */
364 	__u32	copied_seq;	/* Head of yet unread data		*/
365 /*
366  *      Options received (usually on last packet, some only on SYN packets).
367  */
368 	char	tstamp_ok,	/* TIMESTAMP seen on SYN packet		*/
369 		wscale_ok,	/* Wscale seen on SYN packet		*/
370 		sack_ok;	/* SACK seen on SYN packet		*/
371 	char	saw_tstamp;	/* Saw TIMESTAMP on last packet		*/
372         __u8	snd_wscale;	/* Window scaling received from sender	*/
373         __u8	rcv_wscale;	/* Window scaling to send to receiver	*/
374 	__u8	nonagle;	/* Disable Nagle algorithm?             */
375 	__u8	keepalive_probes; /* num of allowed keep alive probes	*/
376 
377 /*	PAWS/RTTM data	*/
378         __u32	rcv_tsval;	/* Time stamp value             	*/
379         __u32	rcv_tsecr;	/* Time stamp echo reply        	*/
380         __u32	ts_recent;	/* Time stamp to echo next		*/
381         long	ts_recent_stamp;/* Time we stored ts_recent (for aging) */
382 
383 /*	SACKs data	*/
384 	__u16	user_mss;  	/* mss requested by user in ioctl */
385 	__u8	dsack;		/* D-SACK is scheduled			*/
386 	__u8	eff_sacks;	/* Size of SACK array to send with next packet */
387 	struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
388 	struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/
389 
390 	__u32	window_clamp;	/* Maximal window to advertise		*/
391 	__u32	rcv_ssthresh;	/* Current window clamp			*/
392 	__u8	probes_out;	/* unanswered 0 window probes		*/
393 	__u8	num_sacks;	/* Number of SACK blocks		*/
394 	__u16	advmss;		/* Advertised MSS			*/
395 
396 	__u8	syn_retries;	/* num of allowed syn retries */
397 	__u8	ecn_flags;	/* ECN status bits.			*/
398 	__u16	prior_ssthresh; /* ssthresh saved at recovery start	*/
399 	__u32	lost_out;	/* Lost packets				*/
400 	__u32	sacked_out;	/* SACK'd packets			*/
401 	__u32	fackets_out;	/* FACK'd packets			*/
402 	__u32	high_seq;	/* snd_nxt at onset of congestion	*/
403 
404 	__u32	retrans_stamp;	/* Timestamp of the last retransmit,
405 				 * also used in SYN-SENT to remember stamp of
406 				 * the first SYN. */
407 	__u32	undo_marker;	/* tracking retrans started here. */
408 	int	undo_retrans;	/* number of undoable retransmissions. */
409 	__u32	urg_seq;	/* Seq of received urgent pointer */
410 	__u16	urg_data;	/* Saved octet of OOB data and control flags */
411 	__u8	pending;	/* Scheduled timer event	*/
412 	__u8	urg_mode;	/* In urgent mode		*/
413 	__u32	snd_up;		/* Urgent pointer		*/
414 
415 	/* The syn_wait_lock is necessary only to avoid tcp_get_info having
416 	 * to grab the main lock sock while browsing the listening hash
417 	 * (otherwise it's deadlock prone).
418 	 * This lock is acquired in read mode only from tcp_get_info() and
419 	 * it's acquired in write mode _only_ from code that is actively
420 	 * changing the syn_wait_queue. All readers that are holding
421 	 * the master sock lock don't need to grab this lock in read mode
422 	 * too as the syn_wait_queue writes are always protected from
423 	 * the main sock lock.
424 	 */
425 	rwlock_t		syn_wait_lock;
426 	struct tcp_listen_opt	*listen_opt;
427 
428 	/* FIFO of established children */
429 	struct open_request	*accept_queue;
430 	struct open_request	*accept_queue_tail;
431 
432 	int			write_pending;	/* A write to socket waits to start. */
433 
434 	unsigned int		keepalive_time;	  /* time before keep alive takes place */
435 	unsigned int		keepalive_intvl;  /* time interval between keep alive probes */
436 	int			linger2;
437 
438 	__u8			adv_cong;    /* Using Vegas, Westwood, or BIC */
439 	__u8                    frto_counter; /* Number of new acks after RTO */
440 	__u32                   frto_highmark; /* snd_nxt when RTO occurred */
441 
442 	unsigned long last_synq_overflow;
443 
444 /* Receiver side RTT estimation */
445 	struct {
446 		__u32	rtt;
447 		__u32	seq;
448 		__u32	time;
449 	} rcv_rtt_est;
450 
451 /* Receiver queue space */
452 	struct {
453 		int	space;
454 		__u32	seq;
455 		__u32	time;
456 	} rcvq_space;
457 
458 /* TCP Westwood structure */
459         struct {
460                 __u32    bw_ns_est;        /* first bandwidth estimation..not too smoothed 8) */
461                 __u32    bw_est;           /* bandwidth estimate */
462                 __u32    rtt_win_sx;       /* here starts a new evaluation... */
463                 __u32    bk;
464                 __u32    snd_una;          /* used for evaluating the number of acked bytes */
465                 __u32    cumul_ack;
466                 __u32    accounted;
467                 __u32    rtt;
468                 __u32    rtt_min;          /* minimum observed RTT */
469         } westwood;
470 
471 /* Vegas variables */
472 	struct {
473 		__u32	beg_snd_nxt;	/* right edge during last RTT */
474 		__u32	beg_snd_una;	/* left edge  during last RTT */
475 		__u32	beg_snd_cwnd;	/* saves the size of the cwnd */
476 		__u8	doing_vegas_now;/* if true, do vegas for this RTT */
477 		__u16	cntRTT;		/* # of RTTs measured within last RTT */
478 		__u32	minRTT;		/* min of RTTs measured within last RTT (in usec) */
479 		__u32	baseRTT;	/* the min of all Vegas RTT measurements seen (in usec) */
480 	} vegas;
481 
482 	/* BI TCP Parameters */
483 	struct {
484 		__u32	cnt;		/* increase cwnd by 1 after this number of ACKs */
485 		__u32 	last_max_cwnd;	/* last maximum snd_cwnd */
486 		__u32	last_cwnd;	/* the last snd_cwnd */
487 		__u32   last_stamp;     /* time when updated last_cwnd */
488 	} bictcp;
489 };
490 
491 
492 /*
493  * This structure really needs to be cleaned up.
494  * Most of it is for TCP, and not used by any of
495  * the other protocols.
496  */
497 
498 /*
499  * The idea is to start moving to a newer struct gradually
500  *
501  * IMHO the newer struct should have the following format:
502  *
503  *	struct sock {
504  *		sockmem [mem, proto, callbacks]
505  *
506  *		union or struct {
507  *			ax25;
508  *		} ll_pinfo;
509  *
510  *		union {
511  *			ipv4;
512  *			ipv6;
513  *			ipx;
514  *			netrom;
515  *			rose;
516  * 			x25;
517  *		} net_pinfo;
518  *
519  *		union {
520  *			tcp;
521  *			udp;
522  *			spx;
523  *			netrom;
524  *		} tp_pinfo;
525  *
526  *	}
527  *
528  * The idea failed because the IPv6 transition assumes dual IP/IPv6 sockets.
529  * So net_pinfo is really IPv6-only, and protinfo unifies all the other
530  * private areas.
531  */
532 
533 /* Define this to get the sk->debug debugging facility. */
534 #define SOCK_DEBUGGING
535 #ifdef SOCK_DEBUGGING
536 #define SOCK_DEBUG(sk, msg...) do { if((sk) && ((sk)->debug)) printk(KERN_DEBUG msg); } while (0)
537 #else
538 #define SOCK_DEBUG(sk, msg...) do { } while (0)
539 #endif
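
/*
 * Usage sketch (editor's illustration, not in the original header):
 *
 *	SOCK_DEBUG(sk, "state change %d -> %d\n", old_state, sk->state);
 *
 * prints via printk(KERN_DEBUG ...) only when SOCK_DEBUGGING is defined
 * and sk->debug has been set (normally via the SO_DEBUG socket option);
 * without SOCK_DEBUGGING it compiles to nothing.
 */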
540 
541 /* This is the per-socket lock.  The spinlock provides a synchronization
542  * between user contexts and software interrupt processing, whereas the
543  * mini-semaphore synchronizes multiple users amongst themselves.
544  */
545 typedef struct {
546 	spinlock_t		slock;
547 	unsigned int		users;
548 	wait_queue_head_t	wq;
549 } socket_lock_t;
550 
551 #define sock_lock_init(__sk) \
552 do {	spin_lock_init(&((__sk)->lock.slock)); \
553 	(__sk)->lock.users = 0; \
554 	init_waitqueue_head(&((__sk)->lock.wq)); \
555 } while(0)
556 
557 struct sock {
558 	/* Socket demultiplex comparisons on incoming packets. */
559 	__u32			daddr;		/* Foreign IPv4 addr			*/
560 	__u32			rcv_saddr;	/* Bound local IPv4 addr		*/
561 	__u16			dport;		/* Destination port			*/
562 	unsigned short		num;		/* Local port				*/
563 	int			bound_dev_if;	/* Bound device index if != 0		*/
564 
565 	/* Main hash linkage for various protocol lookup tables. */
566 	struct sock		*next;
567 	struct sock		**pprev;
568 	struct sock		*bind_next;
569 	struct sock		**bind_pprev;
570 
571 	volatile unsigned char	state,		/* Connection state			*/
572 				zapped;		/* In ax25 & ipx means not linked	*/
573 	__u16			sport;		/* Source port				*/
574 
575 	unsigned short		family;		/* Address family			*/
576 	unsigned char		reuse;		/* SO_REUSEADDR setting			*/
577 	unsigned char		shutdown;
578 	atomic_t		refcnt;		/* Reference count			*/
579 
580 	socket_lock_t		lock;		/* Synchronizer...			*/
581 	int			rcvbuf;		/* Size of receive buffer in bytes	*/
582 
583 	wait_queue_head_t	*sleep;		/* Sock wait queue			*/
584 	struct dst_entry	*dst_cache;	/* Destination cache			*/
585 	rwlock_t		dst_lock;
586 	atomic_t		rmem_alloc;	/* Receive queue bytes committed	*/
587 	struct sk_buff_head	receive_queue;	/* Incoming packets			*/
588 	atomic_t		wmem_alloc;	/* Transmit queue bytes committed	*/
589 	struct sk_buff_head	write_queue;	/* Packet sending queue			*/
590 	atomic_t		omem_alloc;	/* "o" is "option" or "other" */
591 	int			wmem_queued;	/* Persistent queue size */
592 	int			forward_alloc;	/* Space allocated forward. */
593 	__u32			saddr;		/* Sending source			*/
594 	unsigned int		allocation;	/* Allocation mode			*/
595 	int			sndbuf;		/* Size of send buffer in bytes		*/
596 	struct sock		*prev;
597 
598 	/* Not all are volatile, but some are, so we might as well say they all are.
599 	 * XXX Make this a flag word -DaveM
600 	 */
601 	volatile char		dead,
602 				done,
603 				urginline,
604 				keepopen,
605 				linger,
606 				destroy,
607 				no_check,
608 				broadcast,
609 				bsdism;
610 	unsigned char		debug;
611 	unsigned char		rcvtstamp;
612 	unsigned char		use_write_queue;
613 	unsigned char		userlocks;
614 	/* Hole of 3 bytes. Try to pack. */
615 	int			route_caps;
616 	int			proc;
617 	unsigned long	        lingertime;
618 
619 	int			hashent;
620 	struct sock		*pair;
621 
622 	/* The backlog queue is special, it is always used with
623 	 * the per-socket spinlock held and requires low latency
624 	 * access.  Therefore we special case its implementation.
625 	 */
626 	struct {
627 		struct sk_buff *head;
628 		struct sk_buff *tail;
629 	} backlog;
630 
631 	rwlock_t		callback_lock;
632 
633 	/* Error queue, rarely used. */
634 	struct sk_buff_head	error_queue;
635 
636 	struct proto		*prot;
637 
638 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
639 	union {
640 		struct ipv6_pinfo	af_inet6;
641 	} net_pinfo;
642 #endif
643 
644 	union {
645 		struct tcp_opt		af_tcp;
646 #if defined(CONFIG_IP_SCTP) || defined (CONFIG_IP_SCTP_MODULE)
647 		struct sctp_opt		af_sctp;
648 #endif
649 #if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE)
650 		struct raw_opt		tp_raw4;
651 #endif
652 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
653 		struct raw6_opt		tp_raw;
654 #endif /* CONFIG_IPV6 */
655 #if defined(CONFIG_SPX) || defined (CONFIG_SPX_MODULE)
656 		struct spx_opt		af_spx;
657 #endif /* CONFIG_SPX */
658 
659 	} tp_pinfo;
660 
661 	int			err, err_soft;	/* Soft holds errors that don't
662 						   cause failure but are the cause
663 						   of a persistent failure not just
664 						   'timed out' */
665 	unsigned short		ack_backlog;
666 	unsigned short		max_ack_backlog;
667 	__u32			priority;
668 	unsigned short		type;
669 	unsigned char		localroute;	/* Route locally only */
670 	unsigned char		protocol;
671 	struct ucred		peercred;
672 	int			rcvlowat;
673 	long			rcvtimeo;
674 	long			sndtimeo;
675 
676 #ifdef CONFIG_FILTER
677 	/* Socket Filtering Instructions */
678 	struct sk_filter      	*filter;
679 #endif /* CONFIG_FILTER */
680 
681 	/* This is where all the private (optional) areas that don't
682 	 * overlap will eventually live.
683 	 */
684 	union {
685 		void *destruct_hook;
686 	  	struct unix_opt	af_unix;
687 #if defined(CONFIG_INET) || defined (CONFIG_INET_MODULE)
688 		struct inet_opt af_inet;
689 #endif
690 #if defined(CONFIG_ATALK) || defined(CONFIG_ATALK_MODULE)
691 		struct atalk_sock	af_at;
692 #endif
693 #if defined(CONFIG_IPX) || defined(CONFIG_IPX_MODULE)
694 		struct ipx_opt		af_ipx;
695 #endif
696 #if defined (CONFIG_DECNET) || defined(CONFIG_DECNET_MODULE)
697 		struct dn_scp           dn;
698 #endif
699 #if defined (CONFIG_PACKET) || defined(CONFIG_PACKET_MODULE)
700 		struct packet_opt	*af_packet;
701 #endif
702 #if defined(CONFIG_X25) || defined(CONFIG_X25_MODULE)
703 		x25_cb			*x25;
704 #endif
705 #if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
706 		ax25_cb			*ax25;
707 #endif
708 #if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE)
709 		nr_cb			*nr;
710 #endif
711 #if defined(CONFIG_ROSE) || defined(CONFIG_ROSE_MODULE)
712 		rose_cb			*rose;
713 #endif
714 #if defined(CONFIG_PPPOE) || defined(CONFIG_PPPOE_MODULE)
715 		struct pppox_opt	*pppox;
716 #endif
717 		struct netlink_opt	*af_netlink;
718 #if defined(CONFIG_ECONET) || defined(CONFIG_ECONET_MODULE)
719 		struct econet_opt	*af_econet;
720 #endif
721 #if defined(CONFIG_ATM) || defined(CONFIG_ATM_MODULE)
722 		struct atm_vcc		*af_atm;
723 #endif
724 #if defined(CONFIG_IRDA) || defined(CONFIG_IRDA_MODULE)
725 		struct irda_sock        *irda;
726 #endif
727 #if defined(CONFIG_WAN_ROUTER) || defined(CONFIG_WAN_ROUTER_MODULE)
728                struct wanpipe_opt      *af_wanpipe;
729 #endif
730 	} protinfo;
731 
732 
733 	/* This part is used for the timeout functions. */
734 	struct timer_list	timer;		/* This is the sock cleanup timer. */
735 	struct timeval		stamp;
736 
737 	/* Identd and reporting IO signals */
738 	struct socket		*socket;
739 
740 	/* RPC layer private data */
741 	void			*user_data;
742 
743 	/* Callbacks */
744 	void			(*state_change)(struct sock *sk);
745 	void			(*data_ready)(struct sock *sk,int bytes);
746 	void			(*write_space)(struct sock *sk);
747 	void			(*error_report)(struct sock *sk);
748 
749   	int			(*backlog_rcv) (struct sock *sk,
750 						struct sk_buff *skb);
751 	void                    (*destruct)(struct sock *sk);
752 };
753 
754 /* The per-socket spinlock must be held here. */
755 #define sk_add_backlog(__sk, __skb)			\
756 do {	if((__sk)->backlog.tail == NULL) {		\
757 		(__sk)->backlog.head =			\
758 		     (__sk)->backlog.tail = (__skb);	\
759 	} else {					\
760 		((__sk)->backlog.tail)->next = (__skb);	\
761 		(__sk)->backlog.tail = (__skb);		\
762 	}						\
763 	(__skb)->next = NULL;				\
764 } while(0)
765 
766 /* IP protocol blocks we attach to sockets.
767  * socket layer -> transport layer interface
768  * transport -> network interface is defined by struct inet_protocol
769  */
770 struct proto {
771 	void			(*close)(struct sock *sk,
772 					long timeout);
773 	int			(*connect)(struct sock *sk,
774 				        struct sockaddr *uaddr,
775 					int addr_len);
776 	int			(*disconnect)(struct sock *sk, int flags);
777 
778 	struct sock *		(*accept) (struct sock *sk, int flags, int *err);
779 
780 	int			(*ioctl)(struct sock *sk, int cmd,
781 					 unsigned long arg);
782 	int			(*init)(struct sock *sk);
783 	int			(*destroy)(struct sock *sk);
784 	void			(*shutdown)(struct sock *sk, int how);
785 	int			(*setsockopt)(struct sock *sk, int level,
786 					int optname, char *optval, int optlen);
787 	int			(*getsockopt)(struct sock *sk, int level,
788 					int optname, char *optval,
789 					int *option);
790 	int			(*sendmsg)(struct sock *sk, struct msghdr *msg,
791 					   int len);
792 	int			(*recvmsg)(struct sock *sk, struct msghdr *msg,
793 					int len, int noblock, int flags,
794 					int *addr_len);
795 	int			(*bind)(struct sock *sk,
796 					struct sockaddr *uaddr, int addr_len);
797 
798 	int			(*backlog_rcv) (struct sock *sk,
799 						struct sk_buff *skb);
800 
801 	/* Keeping track of sk's, looking them up, and port selection methods. */
802 	void			(*hash)(struct sock *sk);
803 	void			(*unhash)(struct sock *sk);
804 	int			(*get_port)(struct sock *sk, unsigned short snum);
805 
806 	char			name[32];
807 
808 	struct {
809 		int inuse;
810 		u8  __pad[SMP_CACHE_BYTES - sizeof(int)];
811 	} stats[NR_CPUS];
812 };
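
/*
 * Editor's sketch (hypothetical names, not in the original header): a
 * protocol fills in a struct proto and points sk->prot at it; slots it
 * does not implement are typically left NULL.  Something like:
 *
 *	static struct proto example_prot = {
 *		name:		"EXAMPLE",
 *		close:		example_close,
 *		connect:	example_connect,
 *		sendmsg:	example_sendmsg,
 *		recvmsg:	example_recvmsg,
 *		backlog_rcv:	example_backlog_rcv,
 *		hash:		example_hash,
 *		unhash:		example_unhash,
 *		get_port:	example_get_port,
 *	};
 *
 * Real instances such as tcp_prot live in the per-protocol source files.
 */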
813 
814 /* Called with local bh disabled */
815 static __inline__ void sock_prot_inc_use(struct proto *prot)
816 {
817 	prot->stats[smp_processor_id()].inuse++;
818 }
819 
820 static __inline__ void sock_prot_dec_use(struct proto *prot)
821 {
822 	prot->stats[smp_processor_id()].inuse--;
823 }
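
/*
 * Usage sketch (editor's illustration, hypothetical lock/table names): a
 * protocol's ->hash() handler, running with local BHs already disabled,
 * links the sock and bumps the per-CPU counter:
 *
 *	write_lock(&example_hash_lock);
 *	... insert sk into the lookup table ...
 *	sock_prot_inc_use(sk->prot);
 *	write_unlock(&example_hash_lock);
 *
 * ->unhash() does the reverse with sock_prot_dec_use().
 */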
824 
825 /* About 10 seconds */
826 #define SOCK_DESTROY_TIME (10*HZ)
827 
828 /* Sockets 0-1023 can't be bound to unless you are superuser */
829 #define PROT_SOCK	1024
830 
831 #define SHUTDOWN_MASK	3
832 #define RCV_SHUTDOWN	1
833 #define SEND_SHUTDOWN	2
834 
835 #define SOCK_SNDBUF_LOCK	1
836 #define SOCK_RCVBUF_LOCK	2
837 #define SOCK_BINDADDR_LOCK	4
838 #define SOCK_BINDPORT_LOCK	8
839 
840 
841 /* Used by processes to "lock" a socket state, so that
842  * interrupts and bottom half handlers won't change it
843  * from under us. It essentially blocks any incoming
844  * packets, so that we won't get any new data or any
845  * packets that change the state of the socket.
846  *
847  * While locked, BH processing will add new packets to
848  * the backlog queue.  This queue is processed by the
849  * owner of the socket lock right before it is released.
850  *
851  * Since ~2.3.5 it is also exclusive sleep lock serializing
852  * accesses from user process context.
853  */
854 extern void __lock_sock(struct sock *sk);
855 extern void __release_sock(struct sock *sk);
856 #define lock_sock(__sk) \
857 do {	spin_lock_bh(&((__sk)->lock.slock)); \
858 	if ((__sk)->lock.users != 0) \
859 		__lock_sock(__sk); \
860 	(__sk)->lock.users = 1; \
861 	spin_unlock_bh(&((__sk)->lock.slock)); \
862 } while(0)
863 
864 #define release_sock(__sk) \
865 do {	spin_lock_bh(&((__sk)->lock.slock)); \
866 	if ((__sk)->backlog.tail != NULL) \
867 		__release_sock(__sk); \
868 	(__sk)->lock.users = 0; \
869         if (waitqueue_active(&((__sk)->lock.wq))) wake_up(&((__sk)->lock.wq)); \
870 	spin_unlock_bh(&((__sk)->lock.slock)); \
871 } while(0)
872 
873 /* BH context may only use the following locking interface. */
874 #define bh_lock_sock(__sk)	spin_lock(&((__sk)->lock.slock))
875 #define bh_unlock_sock(__sk)	spin_unlock(&((__sk)->lock.slock))
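
/*
 * Editor's sketch (not in the original header): how a protocol's softirq
 * receive path is expected to combine these primitives with the backlog
 * queue (see sk_add_backlog() above).  The function itself is hypothetical.
 */
static inline int example_bh_receive(struct sock *sk, struct sk_buff *skb)
{
	int ret = 0;

	bh_lock_sock(sk);
	if (sk->lock.users == 0)
		ret = sk->backlog_rcv(sk, skb);	/* socket not owned by a process */
	else
		sk_add_backlog(sk, skb);	/* drained by release_sock() */
	bh_unlock_sock(sk);
	return ret;
}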
876 
877 extern struct sock *		sk_alloc(int family, int priority, int zero_it);
878 extern void			sk_free(struct sock *sk);
879 
880 extern struct sk_buff		*sock_wmalloc(struct sock *sk,
881 					      unsigned long size, int force,
882 					      int priority);
883 extern struct sk_buff		*sock_rmalloc(struct sock *sk,
884 					      unsigned long size, int force,
885 					      int priority);
886 extern void			sock_wfree(struct sk_buff *skb);
887 extern void			sock_rfree(struct sk_buff *skb);
888 
889 extern int			sock_setsockopt(struct socket *sock, int level,
890 						int op, char *optval,
891 						int optlen);
892 
893 extern int			sock_getsockopt(struct socket *sock, int level,
894 						int op, char *optval,
895 						int *optlen);
896 extern struct sk_buff 		*sock_alloc_send_skb(struct sock *sk,
897 						     unsigned long size,
898 						     int noblock,
899 						     int *errcode);
900 extern struct sk_buff 		*sock_alloc_send_pskb(struct sock *sk,
901 						      unsigned long header_len,
902 						      unsigned long data_len,
903 						      int noblock,
904 						      int *errcode);
905 extern void *sock_kmalloc(struct sock *sk, int size, int priority);
906 extern void sock_kfree_s(struct sock *sk, void *mem, int size);
907 
908 /*
909  * Functions to fill in entries in struct proto_ops when a protocol
910  * does not implement a particular function.
911  */
912 extern int                      sock_no_release(struct socket *);
913 extern int                      sock_no_bind(struct socket *,
914 					     struct sockaddr *, int);
915 extern int                      sock_no_connect(struct socket *,
916 						struct sockaddr *, int, int);
917 extern int                      sock_no_socketpair(struct socket *,
918 						   struct socket *);
919 extern int                      sock_no_accept(struct socket *,
920 					       struct socket *, int);
921 extern int                      sock_no_getname(struct socket *,
922 						struct sockaddr *, int *, int);
923 extern unsigned int             sock_no_poll(struct file *, struct socket *,
924 					     struct poll_table_struct *);
925 extern int                      sock_no_ioctl(struct socket *, unsigned int,
926 					      unsigned long);
927 extern int			sock_no_listen(struct socket *, int);
928 extern int                      sock_no_shutdown(struct socket *, int);
929 extern int			sock_no_getsockopt(struct socket *, int , int,
930 						   char *, int *);
931 extern int			sock_no_setsockopt(struct socket *, int, int,
932 						   char *, int);
933 extern int 			sock_no_fcntl(struct socket *,
934 					      unsigned int, unsigned long);
935 extern int                      sock_no_sendmsg(struct socket *,
936 						struct msghdr *, int,
937 						struct scm_cookie *);
938 extern int                      sock_no_recvmsg(struct socket *,
939 						struct msghdr *, int, int,
940 						struct scm_cookie *);
941 extern int			sock_no_mmap(struct file *file,
942 					     struct socket *sock,
943 					     struct vm_area_struct *vma);
944 extern ssize_t			sock_no_sendpage(struct socket *sock,
945 						struct page *page,
946 						int offset, size_t size,
947 						int flags);
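
/*
 * Usage sketch (editor's illustration, hypothetical ops table): a protocol
 * that has no notion of, say, socketpair() or mmap() simply plugs the
 * defaults above into its struct proto_ops:
 *
 *	socketpair:	sock_no_socketpair,
 *	ioctl:		sock_no_ioctl,
 *	mmap:		sock_no_mmap,
 *	sendpage:	sock_no_sendpage,
 *
 * Most of the defaults simply return an error such as -EOPNOTSUPP.
 */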
948 
949 /*
950  *	Default socket callbacks and setup code
951  */
952 
953 extern void sock_def_destruct(struct sock *);
954 
955 /* Initialise core socket variables */
956 extern void sock_init_data(struct socket *sock, struct sock *sk);
957 
958 extern void sklist_remove_socket(struct sock **list, struct sock *sk);
959 extern void sklist_insert_socket(struct sock **list, struct sock *sk);
960 extern void sklist_destroy_socket(struct sock **list, struct sock *sk);
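
/*
 * Editor's sketch (hypothetical family/callback names): the usual create
 * path for an address family allocates the sock and wires up the defaults:
 *
 *	sk = sk_alloc(PF_EXAMPLE, GFP_KERNEL, 1);
 *	if (sk == NULL)
 *		return -ENOMEM;
 *	sock_init_data(sock, sk);	   initialises queues and callbacks
 *	sk->destruct = example_destruct;   or sock_def_destruct
 */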
961 
962 #ifdef CONFIG_FILTER
963 
964 /**
965  *	sk_filter - run a packet through a socket filter
966  *	@sk: sock associated with &sk_buff
967  *	@skb: buffer to filter
968  *	@needlock: set to 1 if the sock is not locked by caller.
969  *
970  * Run the filter code and then cut skb->data to the correct size returned by
971  * sk_run_filter. If pkt_len is 0 we toss the packet. If skb->len is smaller
972  * than pkt_len we keep the whole skb->data. This is the socket level
973  * wrapper to sk_run_filter. It returns 0 if the packet should
974  * be accepted or -EPERM if the packet should be tossed.
975  */
976 
977 static inline int sk_filter(struct sock *sk, struct sk_buff *skb, int needlock)
978 {
979 	int err = 0;
980 
981 	if (sk->filter) {
982 		struct sk_filter *filter;
983 
984 		if (needlock)
985 			bh_lock_sock(sk);
986 
987 		filter = sk->filter;
988 		if (filter) {
989 			int pkt_len = sk_run_filter(skb, filter->insns,
990 						    filter->len);
991 			if (!pkt_len)
992 				err = -EPERM;
993 			else
994 				skb_trim(skb, pkt_len);
995 		}
996 
997 		if (needlock)
998 			bh_unlock_sock(sk);
999 	}
1000 	return err;
1001 }
1002 
1003 /**
1004  *	sk_filter_release: Release a socket filter
1005  *	@sk: socket
1006  *	@fp: filter to remove
1007  *
1008  *	Remove a filter from a socket and release its resources.
1009  */
1010 
1011 static inline void sk_filter_release(struct sock *sk, struct sk_filter *fp)
1012 {
1013 	unsigned int size = sk_filter_len(fp);
1014 
1015 	atomic_sub(size, &sk->omem_alloc);
1016 
1017 	if (atomic_dec_and_test(&fp->refcnt))
1018 		kfree(fp);
1019 }
1020 
1021 static inline void sk_filter_charge(struct sock *sk, struct sk_filter *fp)
1022 {
1023 	atomic_inc(&fp->refcnt);
1024 	atomic_add(sk_filter_len(fp), &sk->omem_alloc);
1025 }
1026 
1027 #else
1028 
1029 static inline int sk_filter(struct sock *sk, struct sk_buff *skb, int needlock)
1030 {
1031 	return 0;
1032 }
1033 
1034 #endif /* CONFIG_FILTER */
1035 
1036 /*
1037  * Socket reference counting postulates.
1038  *
1039  * * Each user of a socket SHOULD hold a reference count.
1040  * * Each access point to a socket (a hash table bucket, a reference from a
1041  *   list, a running timer, an skb in flight) MUST hold a reference count.
1042  * * When the reference count hits 0, it will never increase again.
1043  * * When the reference count hits 0, it means that no references from
1044  *   outside exist to this socket and the current process on the current CPU
1045  *   is the last user and may/should destroy this socket.
1046  * * sk_free is called from any context: process, BH, IRQ. When
1047  *   it is called, the socket has no references from outside -> sk_free
1048  *   may release descendant resources allocated by the socket, but
1049  *   by the time it is called, the socket is NOT referenced by any
1050  *   hash tables, lists etc.
1051  * * Packets, delivered from outside (from network or from another process)
1052  *   and enqueued on receive/error queues SHOULD NOT grab a reference count
1053  *   while they sit in a queue. Otherwise packets will leak into a hole when
1054  *   a socket is looked up by one CPU and unhashing is done by another CPU.
1055  *   This is true for udp/raw, netlink (leak to receive and error queues) and
1056  *   tcp (leak to backlog). Packet sockets do all their processing inside
1057  *   BR_NETPROTO_LOCK, so they do not have this race condition. UNIX sockets
1058  *   use a separate SMP lock, so they are safe as well.
1059  */
1060 
1061 /* Grab socket reference count. This operation is valid only
1062    when sk is ALREADY grabbed, e.g. it has been found in a hash table
1063    or a list and the lookup is made under a lock preventing hash table
1064    modifications.
1065  */
1066 
1067 static inline void sock_hold(struct sock *sk)
1068 {
1069 	atomic_inc(&sk->refcnt);
1070 }
1071 
1072 /* Ungrab socket in the context, which assumes that socket refcnt
1073    cannot hit zero, f.e. it is true in context of any socketcall.
1074  */
1075 static inline void __sock_put(struct sock *sk)
1076 {
1077 	atomic_dec(&sk->refcnt);
1078 }
1079 
1080 /* Ungrab socket and destroy it, if it was the last reference. */
1081 static inline void sock_put(struct sock *sk)
1082 {
1083 	if (atomic_dec_and_test(&sk->refcnt))
1084 		sk_free(sk);
1085 }
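
/*
 * Editor's sketch (hypothetical hash table names): a typical lookup takes
 * the reference while the bucket lock keeps the sock reachable, and drops
 * it when done:
 *
 *	read_lock(&example_hash_lock);
 *	for (sk = example_hash[h]; sk; sk = sk->next)
 *		if (sk->num == hnum)
 *			break;
 *	if (sk)
 *		sock_hold(sk);		valid: sk is pinned by the table + lock
 *	read_unlock(&example_hash_lock);
 *	...
 *	if (sk)
 *		sock_put(sk);		may free the sock on the last reference
 */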
1086 
1087 /* Detach socket from process context.
1088  * Announce socket dead, detach it from wait queue and inode.
1089  * Note that the parent inode holds a reference count on this struct sock;
1090  * we do not release it in this function, because the protocol
1091  * probably wants some additional cleanup or even to continue
1092  * working with this socket (TCP).
1093  */
1094 static inline void sock_orphan(struct sock *sk)
1095 {
1096 	write_lock_bh(&sk->callback_lock);
1097 	sk->dead = 1;
1098 	sk->socket = NULL;
1099 	sk->sleep = NULL;
1100 	write_unlock_bh(&sk->callback_lock);
1101 }
1102 
1103 static inline void sock_graft(struct sock *sk, struct socket *parent)
1104 {
1105 	write_lock_bh(&sk->callback_lock);
1106 	sk->sleep = &parent->wait;
1107 	parent->sk = sk;
1108 	sk->socket = parent;
1109 	write_unlock_bh(&sk->callback_lock);
1110 }
1111 
1112 static inline int sock_i_uid(struct sock *sk)
1113 {
1114 	int uid;
1115 
1116 	read_lock(&sk->callback_lock);
1117 	uid = sk->socket ? sk->socket->inode->i_uid : 0;
1118 	read_unlock(&sk->callback_lock);
1119 	return uid;
1120 }
1121 
1122 static inline unsigned long sock_i_ino(struct sock *sk)
1123 {
1124 	unsigned long ino;
1125 
1126 	read_lock(&sk->callback_lock);
1127 	ino = sk->socket ? sk->socket->inode->i_ino : 0;
1128 	read_unlock(&sk->callback_lock);
1129 	return ino;
1130 }
1131 
1132 static inline struct dst_entry *
1133 __sk_dst_get(struct sock *sk)
1134 {
1135 	return sk->dst_cache;
1136 }
1137 
1138 static inline struct dst_entry *
1139 sk_dst_get(struct sock *sk)
1140 {
1141 	struct dst_entry *dst;
1142 
1143 	read_lock(&sk->dst_lock);
1144 	dst = sk->dst_cache;
1145 	if (dst)
1146 		dst_hold(dst);
1147 	read_unlock(&sk->dst_lock);
1148 	return dst;
1149 }
1150 
1151 static inline void
1152 __sk_dst_set(struct sock *sk, struct dst_entry *dst)
1153 {
1154 	struct dst_entry *old_dst;
1155 
1156 	old_dst = sk->dst_cache;
1157 	sk->dst_cache = dst;
1158 	dst_release(old_dst);
1159 }
1160 
1161 static inline void
1162 sk_dst_set(struct sock *sk, struct dst_entry *dst)
1163 {
1164 	write_lock(&sk->dst_lock);
1165 	__sk_dst_set(sk, dst);
1166 	write_unlock(&sk->dst_lock);
1167 }
1168 
1169 static inline void
1170 __sk_dst_reset(struct sock *sk)
1171 {
1172 	struct dst_entry *old_dst;
1173 
1174 	old_dst = sk->dst_cache;
1175 	sk->dst_cache = NULL;
1176 	dst_release(old_dst);
1177 }
1178 
1179 static inline void
1180 sk_dst_reset(struct sock *sk)
1181 {
1182 	write_lock(&sk->dst_lock);
1183 	__sk_dst_reset(sk);
1184 	write_unlock(&sk->dst_lock);
1185 }
1186 
1187 static inline struct dst_entry *
1188 __sk_dst_check(struct sock *sk, u32 cookie)
1189 {
1190 	struct dst_entry *dst = sk->dst_cache;
1191 
1192 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
1193 		sk->dst_cache = NULL;
1194 		return NULL;
1195 	}
1196 
1197 	return dst;
1198 }
1199 
1200 static inline struct dst_entry *
1201 sk_dst_check(struct sock *sk, u32 cookie)
1202 {
1203 	struct dst_entry *dst = sk_dst_get(sk);
1204 
1205 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
1206 		sk_dst_reset(sk);
1207 		return NULL;
1208 	}
1209 
1210 	return dst;
1211 }
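
/*
 * Editor's sketch (not in the original header): sk_dst_get() returns a
 * held reference which the caller must drop with dst_release(); the
 * hypothetical helper below only illustrates that pairing.
 */
static inline int example_sk_dst_is_cached(struct sock *sk)
{
	struct dst_entry *dst = sk_dst_get(sk);
	int cached = (dst != NULL);

	if (dst)
		dst_release(dst);	/* balance the dst_hold() taken in sk_dst_get() */
	return cached;
}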
1212 
1213 
1214 /*
1215  * 	Queue a received datagram if it will fit. Stream and sequenced
1216  *	protocols can't normally use this as they need to fit buffers in
1217  *	and play with them.
1218  *
1219  * 	Inlined as it's very short and called for pretty much every
1220  *	packet ever received.
1221  */
1222 
1223 static inline void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1224 {
1225 	sock_hold(sk);
1226 	skb->sk = sk;
1227 	skb->destructor = sock_wfree;
1228 	atomic_add(skb->truesize, &sk->wmem_alloc);
1229 }
1230 
1231 static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
1232 {
1233 	skb->sk = sk;
1234 	skb->destructor = sock_rfree;
1235 	atomic_add(skb->truesize, &sk->rmem_alloc);
1236 }
1237 
1238 static inline int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
1239 {
1240 	int err = 0;
1241 	int skb_len;
1242 
1243 	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
1244 	   the number of warnings when compiling with -W --ANK
1245 	 */
1246 	if (atomic_read(&sk->rmem_alloc) + skb->truesize >= (unsigned)sk->rcvbuf) {
1247 		err = -ENOMEM;
1248 		goto out;
1249 	}
1250 
1251 	/* It would be deadlock, if sock_queue_rcv_skb is used
1252 	   with socket lock! We assume that users of this
1253 	   function are lock free.
1254 	*/
1255 	err = sk_filter(sk, skb, 1);
1256 	if (err)
1257 		goto out;
1258 
1259 	skb->dev = NULL;
1260 	skb_set_owner_r(skb, sk);
1261 
1262 	/* Cache the SKB length before we tack it onto the receive
1263 	 * queue.  Once it is added it no longer belongs to us and
1264 	 * may be freed by other threads of control pulling packets
1265 	 * from the queue.
1266 	 */
1267 	skb_len = skb->len;
1268 
1269 	skb_queue_tail(&sk->receive_queue, skb);
1270 	if (!sk->dead)
1271 		sk->data_ready(sk,skb_len);
1272 out:
1273 	return err;
1274 }
1275 
1276 static inline int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb)
1277 {
1278 	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
1279 	   the number of warnings when compiling with -W --ANK
1280 	 */
1281 	if (atomic_read(&sk->rmem_alloc) + skb->truesize >= (unsigned)sk->rcvbuf)
1282 		return -ENOMEM;
1283 	skb_set_owner_r(skb, sk);
1284 	skb_queue_tail(&sk->error_queue,skb);
1285 	if (!sk->dead)
1286 		sk->data_ready(sk,skb->len);
1287 	return 0;
1288 }
1289 
1290 /*
1291  *	Recover an error report and clear atomically
1292  */
1293 
1294 static inline int sock_error(struct sock *sk)
1295 {
1296 	int err=xchg(&sk->err,0);
1297 	return -err;
1298 }
1299 
1300 static inline unsigned long sock_wspace(struct sock *sk)
1301 {
1302 	int amt = 0;
1303 
1304 	if (!(sk->shutdown & SEND_SHUTDOWN)) {
1305 		amt = sk->sndbuf - atomic_read(&sk->wmem_alloc);
1306 		if (amt < 0)
1307 			amt = 0;
1308 	}
1309 	return amt;
1310 }
1311 
1312 static inline void sk_wake_async(struct sock *sk, int how, int band)
1313 {
1314 	if (sk->socket && sk->socket->fasync_list)
1315 		sock_wake_async(sk->socket, how, band);
1316 }
1317 
1318 #define SOCK_MIN_SNDBUF 2048
1319 #define SOCK_MIN_RCVBUF 256
1320 
1321 /*
1322  *	Default write policy as shown to user space via poll/select/SIGIO
1323  */
1324 static inline int sock_writeable(struct sock *sk)
1325 {
1326 	return atomic_read(&sk->wmem_alloc) < (sk->sndbuf / 2);
1327 }
1328 
1329 static inline int gfp_any(void)
1330 {
1331 	return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
1332 }
1333 
1334 static inline long sock_rcvtimeo(struct sock *sk, int noblock)
1335 {
1336 	return noblock ? 0 : sk->rcvtimeo;
1337 }
1338 
1339 static inline long sock_sndtimeo(struct sock *sk, int noblock)
1340 {
1341 	return noblock ? 0 : sk->sndtimeo;
1342 }
1343 
1344 static inline int sock_rcvlowat(struct sock *sk, int waitall, int len)
1345 {
1346 	return (waitall ? len : min_t(int, sk->rcvlowat, len)) ? : 1;
1347 }
1348 
1349 /* Alas, socket operations with a timeout are not restartable.
1350  * Compare this to poll().
1351  */
1352 static inline int sock_intr_errno(long timeo)
1353 {
1354 	return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR;
1355 }
1356 
1357 static __inline__ void
1358 sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb)
1359 {
1360 	if (sk->rcvtstamp)
1361 		put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP, sizeof(skb->stamp), &skb->stamp);
1362 	else
1363 		sk->stamp = skb->stamp;
1364 }
1365 
1366 /*
1367  *	Enable debug/info messages
1368  */
1369 
1370 #if 0
1371 #define NETDEBUG(x)	do { } while (0)
1372 #else
1373 #define NETDEBUG(x)	do { x; } while (0)
1374 #endif
1375 
1376 /*
1377  * Macros for sleeping on a socket. Use them like this:
1378  *
1379  * SOCK_SLEEP_PRE(sk)
1380  * if (condition)
1381  * 	schedule();
1382  * SOCK_SLEEP_POST(sk)
1383  *
1384  */
1385 
1386 #define SOCK_SLEEP_PRE(sk) 	{ struct task_struct *tsk = current; \
1387 				DECLARE_WAITQUEUE(wait, tsk); \
1388 				tsk->state = TASK_INTERRUPTIBLE; \
1389 				add_wait_queue((sk)->sleep, &wait); \
1390 				release_sock(sk);
1391 
1392 #define SOCK_SLEEP_POST(sk)	tsk->state = TASK_RUNNING; \
1393 				remove_wait_queue((sk)->sleep, &wait); \
1394 				lock_sock(sk); \
1395 				}
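
/*
 * Editor's note: SOCK_SLEEP_PRE opens a block (and declares tsk and wait)
 * that SOCK_SLEEP_POST closes, so the two must always be used as a pair
 * within one function, exactly as in the example above.
 */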
1396 
1397 extern __u32 sysctl_wmem_max;
1398 extern __u32 sysctl_rmem_max;
1399 
1400 #endif	/* _SOCK_H */
1401