/*
 *	SUCS NET3:
 *
 *	Generic datagram handling routines. These are generic for all
 *	protocols. Possibly a generic IP version on top of these would
 *	make sense. Not tonight however 8-).
 *	This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and NetROM
 *	layer all have identical poll code and mostly identical recvmsg()
 *	code. So we share it here. The poll was shared before but buried
 *	in udp.c so I moved it.
 *
 *	Authors:	Alan Cox <alan@redhat.com>. (datagram_poll() from old udp.c code)
 *
 *	Fixes:
 *		Alan Cox	:	NULL return from skb_peek_copy() understood
 *		Alan Cox	:	Rewrote skb_read_datagram to avoid the
 *					skb_peek_copy stuff.
 *		Alan Cox	:	Added support for SOCK_SEQPACKET. IPX can no
 *					longer use the SO_TYPE hack but AX.25 now
 *					works right, and SPX is feasible.
 *		Alan Cox	:	Fixed write poll of non IP protocol crash.
 *		Florian La Roche:	Changed for my new skbuff handling.
 *		Darryl Miles	:	Fixed non-blocking SOCK_SEQPACKET.
 *		Linus Torvalds	:	BSD semantic fixes.
 *		Alan Cox	:	Datagram iovec handling
 *		Darryl Miles	:	Fixed non-blocking SOCK_STREAM.
 *		Alan Cox	:	POSIXisms
 *		Pete Wyckoff	:	Unconnected accept() fix.
 *
 */

#include <linux/types.h>
#include <linux/kernel.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/poll.h>
#include <linux/highmem.h>

#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/checksum.h>

/*
 *	Is a socket 'connection oriented' ?
 */

static inline int connection_based(struct sock *sk)
{
	return (sk->type == SOCK_SEQPACKET || sk->type == SOCK_STREAM);
}


/*
 *	Wait for a packet..
 */

static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
{
	int error;

	DECLARE_WAITQUEUE(wait, current);

	__set_current_state(TASK_INTERRUPTIBLE);
	add_wait_queue_exclusive(sk->sleep, &wait);

	/* Socket errors? */
	error = sock_error(sk);
	if (error)
		goto out_err;

	if (!skb_queue_empty(&sk->receive_queue))
		goto ready;

	/* Socket shut down? */
	if (sk->shutdown & RCV_SHUTDOWN)
		goto out_noerr;

	/* Sequenced packets can come disconnected. If so we report the problem */
	error = -ENOTCONN;
	if (connection_based(sk) &&
	    !(sk->state == TCP_ESTABLISHED || sk->state == TCP_LISTEN))
		goto out_err;

	/* handle signals */
	if (signal_pending(current))
		goto interrupted;

	*timeo_p = schedule_timeout(*timeo_p);

ready:
	current->state = TASK_RUNNING;
	remove_wait_queue(sk->sleep, &wait);
	return 0;

interrupted:
	error = sock_intr_errno(*timeo_p);
out_err:
	*err = error;
out:
	current->state = TASK_RUNNING;
	remove_wait_queue(sk->sleep, &wait);
	return error;
out_noerr:
	*err = 0;
	error = 1;
	goto out;
}

/*
 *	Get a datagram skbuff, understands the peeking, nonblocking wakeups
 *	and possible races. This replaces identical code in packet, raw and
 *	udp, as well as IPX, AX.25 and Appletalk. It also finally fixes the
 *	long-standing peek and read race for datagram sockets. If you alter
 *	this routine remember it must be re-entrant.
 *
 *	This function will lock the socket if a skb is returned, so the caller
 *	needs to unlock the socket in that case (usually by calling
 *	skb_free_datagram).
 *
 *	* It does not lock socket since today. This function is
 *	* free of race conditions. This measure should/can improve
 *	* significantly datagram socket latencies at high loads,
 *	* when data copying to user space takes lots of time.
 *	* (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet
 *	* 8) Great win.)
 *	*				--ANK (980729)
 *
 *	The order of the tests when we find no data waiting is specified
 *	quite explicitly by POSIX 1003.1g, don't change them without having
 *	the standard around please.
 */

struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, int noblock, int *err)
{
	int error;
	struct sk_buff *skb;
	long timeo;

	/* Caller is allowed not to check sk->err before skb_recv_datagram() */
	error = sock_error(sk);
	if (error)
		goto no_packet;

	timeo = sock_rcvtimeo(sk, noblock);

	do {
		/* Again only user level code calls this function, so nothing
		   interrupt level will suddenly eat the receive_queue.

		   Look at current nfs client by the way...
		   However, this function was correct in any case. 8)
		 */
		if (flags & MSG_PEEK) {
			unsigned long cpu_flags;

			spin_lock_irqsave(&sk->receive_queue.lock, cpu_flags);
			skb = skb_peek(&sk->receive_queue);
			if (skb != NULL)
				atomic_inc(&skb->users);
			spin_unlock_irqrestore(&sk->receive_queue.lock, cpu_flags);
		} else
			skb = skb_dequeue(&sk->receive_queue);

		if (skb)
			return skb;

		/* User doesn't want to wait */
		error = -EAGAIN;
		if (!timeo)
			goto no_packet;

	} while (wait_for_packet(sk, err, &timeo) == 0);

	return NULL;

no_packet:
	*err = error;
	return NULL;
}

void skb_free_datagram(struct sock *sk, struct sk_buff *skb)
{
	kfree_skb(skb);
}
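
/*
 *	A minimal sketch of the typical caller pattern (illustrative only, not
 *	part of this file): a hypothetical datagram protocol's recvmsg() would
 *	obtain an skb with skb_recv_datagram(), copy it to the user iovec with
 *	skb_copy_datagram_iovec(), and release it with skb_free_datagram().
 *	"example_recvmsg" is a made-up name, not a real symbol.
 *
 *	static int example_recvmsg(struct sock *sk, struct msghdr *msg, int len,
 *				   int noblock, int flags, int *addr_len)
 *	{
 *		int copied, err;
 *		int hdrlen = 0;			// protocol header length to skip (0 here)
 *		struct sk_buff *skb;
 *
 *		skb = skb_recv_datagram(sk, flags, noblock, &err);
 *		if (skb == NULL)
 *			return err;
 *
 *		copied = skb->len - hdrlen;
 *		if (copied > len) {
 *			copied = len;		// truncate to the user buffer
 *			msg->msg_flags |= MSG_TRUNC;
 *		}
 *
 *		err = skb_copy_datagram_iovec(skb, hdrlen, msg->msg_iov, copied);
 *		skb_free_datagram(sk, skb);	// always give the skb back
 *		return err ? err : copied;
 *	}
 */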

/*
 *	Copy a datagram to a linear buffer.
 */

int skb_copy_datagram(const struct sk_buff *skb, int offset, char *to, int size)
{
	struct iovec iov = { to, size };

	return skb_copy_datagram_iovec(skb, offset, &iov, size);
}

/*
 *	Copy a datagram to an iovec.
 *	Note: the iovec is modified during the copy.
 */
int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
			    struct iovec *to, int len)
{
	int i, copy;
	int start = skb->len - skb->data_len;

	/* Copy header. */
	if ((copy = start - offset) > 0) {
		if (copy > len)
			copy = len;
		if (memcpy_toiovec(to, skb->data + offset, copy))
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
	}

	/* Copy paged appendix. Hmm... why does this look so complicated? */
	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		BUG_TRAP(start <= offset + len);

		end = start + skb_shinfo(skb)->frags[i].size;
		if ((copy = end - offset) > 0) {
			int err;
			u8 *vaddr;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			struct page *page = frag->page;

			if (copy > len)
				copy = len;
			vaddr = kmap(page);
			err = memcpy_toiovec(to, vaddr + frag->page_offset +
					     offset - start, copy);
			kunmap(page);
			if (err)
				goto fault;
			if (!(len -= copy))
				return 0;
			offset += copy;
		}
		start = end;
	}

	if (skb_shinfo(skb)->frag_list) {
		struct sk_buff *list;

		for (list = skb_shinfo(skb)->frag_list; list; list = list->next) {
			int end;

			BUG_TRAP(start <= offset + len);

			end = start + list->len;
			if ((copy = end - offset) > 0) {
				if (copy > len)
					copy = len;
				if (skb_copy_datagram_iovec(list, offset - start, to, copy))
					goto fault;
				if ((len -= copy) == 0)
					return 0;
				offset += copy;
			}
			start = end;
		}
	}
	if (len == 0)
		return 0;

fault:
	return -EFAULT;
}
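
/*
 *	Worked example of the copy walk above (added for clarity, numbers are
 *	hypothetical): suppose skb->len = 500 and skb->data_len = 400, so the
 *	linear header holds 100 bytes and two page frags hold 200 bytes each.
 *	For a call with offset = 250 and len = 100:
 *
 *	  - header:  start = 100, copy = start - offset = -150, nothing copied;
 *	  - frag 0:  start = 100, end = 300, copy = 300 - 250 = 50 bytes taken
 *	             from frag offset (offset - start) = 150; len drops to 50,
 *	             offset advances to 300;
 *	  - frag 1:  start = 300, end = 500, copy = 200 clipped to len = 50,
 *	             taken from frag offset 0; len reaches 0 and we return 0.
 *
 *	The same start/end windowing is then repeated for any frag_list skbs.
 */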

int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, u8 *to,
			       int len, unsigned int *csump)
{
	int i, copy;
	int start = skb->len - skb->data_len;
	int pos = 0;

	/* Copy header. */
	if ((copy = start - offset) > 0) {
		int err = 0;
		if (copy > len)
			copy = len;
		*csump = csum_and_copy_to_user(skb->data + offset, to, copy,
					       *csump, &err);
		if (err)
			goto fault;
		if ((len -= copy) == 0)
			return 0;
		offset += copy;
		to += copy;
		pos = copy;
	}

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		int end;

		BUG_TRAP(start <= offset + len);

		end = start + skb_shinfo(skb)->frags[i].size;
		if ((copy = end - offset) > 0) {
			unsigned int csum2;
			int err = 0;
			u8 *vaddr;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			struct page *page = frag->page;

			if (copy > len)
				copy = len;
			vaddr = kmap(page);
			csum2 = csum_and_copy_to_user(vaddr + frag->page_offset +
						      offset - start, to, copy, 0, &err);
			kunmap(page);
			if (err)
				goto fault;
			*csump = csum_block_add(*csump, csum2, pos);
			if (!(len -= copy))
				return 0;
			offset += copy;
			to += copy;
			pos += copy;
		}
		start = end;
	}

	if (skb_shinfo(skb)->frag_list) {
		struct sk_buff *list;

		for (list = skb_shinfo(skb)->frag_list; list; list = list->next) {
			int end;

			BUG_TRAP(start <= offset + len);

			end = start + list->len;
			if ((copy = end - offset) > 0) {
				unsigned int csum2 = 0;
				if (copy > len)
					copy = len;
				if (skb_copy_and_csum_datagram(list, offset - start,
							       to, copy, &csum2))
					goto fault;
				*csump = csum_block_add(*csump, csum2, pos);
				if ((len -= copy) == 0)
					return 0;
				offset += copy;
				to += copy;
				pos += copy;
			}
			start = end;
		}
	}
	if (len == 0)
		return 0;

fault:
	return -EFAULT;
}

/*
 *	Copy and checksum skb to user iovec. Caller _must_ check that
 *	skb will fit to this iovec.
 *
 *	Returns: 0       - success.
 *		 -EINVAL - checksum failure.
 *		 -EFAULT - fault during copy. Beware, in this case iovec
 *			   can be modified!
 */

int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, int hlen,
				     struct iovec *iov)
{
	unsigned int csum;
	int chunk = skb->len - hlen;

	/* Skip filled elements. Pretty silly, look at memcpy_toiovec, though 8) */
	while (iov->iov_len == 0)
		iov++;

	if (iov->iov_len < chunk) {
		if ((unsigned short)csum_fold(skb_checksum(skb, 0, chunk + hlen, skb->csum)))
			goto csum_error;
		if (skb_copy_datagram_iovec(skb, hlen, iov, chunk))
			goto fault;
	} else {
		csum = csum_partial(skb->data, hlen, skb->csum);
		if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base, chunk, &csum))
			goto fault;
		if ((unsigned short)csum_fold(csum))
			goto csum_error;
		iov->iov_len -= chunk;
		iov->iov_base += chunk;
	}
	return 0;

csum_error:
	return -EINVAL;

fault:
	return -EFAULT;
}
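
/*
 *	A sketch of how a caller typically chooses between the plain and the
 *	checksumming copy (illustrative only; UDP's recvmsg does roughly this):
 *	when hardware has already verified the checksum, the cheap copy is
 *	enough, otherwise the checksum is folded into the copy pass so the
 *	packet is only traversed once. "hdrlen" stands for the transport
 *	header length and "csum_copy_err" for whatever error label the caller
 *	uses.
 *
 *	if (skb->ip_summed == CHECKSUM_UNNECESSARY)
 *		err = skb_copy_datagram_iovec(skb, hdrlen, msg->msg_iov, copied);
 *	else {
 *		err = skb_copy_and_csum_datagram_iovec(skb, hdrlen, msg->msg_iov);
 *		if (err == -EINVAL)		// checksum failed on the copy pass
 *			goto csum_copy_err;
 *	}
 */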


/*
 *	Datagram poll: Again totally generic. This also handles
 *	sequenced packet sockets providing the socket receive queue
 *	is only ever holding data ready to receive.
 *
 *	Note: when you _don't_ use this routine for this protocol,
 *	and you use a different write policy from sock_writeable()
 *	then please supply your own write_space callback.
 */

unsigned int datagram_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned int mask;

	poll_wait(file, sk->sleep, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->err || !skb_queue_empty(&sk->error_queue))
		mask |= POLLERR;
	if (sk->shutdown == SHUTDOWN_MASK)
		mask |= POLLHUP;

	/* readable? */
	if (!skb_queue_empty(&sk->receive_queue) || (sk->shutdown & RCV_SHUTDOWN))
		mask |= POLLIN | POLLRDNORM;

	/* Connection-based sockets need to check for termination and startup */
	if (connection_based(sk)) {
		if (sk->state == TCP_CLOSE)
			mask |= POLLHUP;
		/* connection hasn't started yet? */
		if (sk->state == TCP_SYN_SENT)
			return mask;
	}

	/* writable? */
	if (sock_writeable(sk))
		mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
	else
		set_bit(SOCK_ASYNC_NOSPACE, &sk->socket->flags);

	return mask;
}
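
/*
 *	A minimal sketch of how a protocol wires this in (illustrative only):
 *	the poll member of its struct proto_ops simply points at
 *	datagram_poll, as the datagram families mentioned in the header
 *	comment do. "example_dgram_ops", PF_EXAMPLE and the other handlers
 *	named here are placeholders, not real symbols.
 *
 *	struct proto_ops example_dgram_ops = {
 *		family:		PF_EXAMPLE,
 *		release:	example_release,
 *		poll:		datagram_poll,	// shared datagram poll routine
 *		recvmsg:	example_recvmsg,
 *		sendmsg:	example_sendmsg,
 *	};
 */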