/*
 * Network block device - make block devices work over TCP
 *
 * Note that you cannot swap over this thing, yet. Seems to work but
 * deadlocks sometimes - you cannot swap over TCP in general.
 *
 * Copyright 1997-2000 Pavel Machek <pavel@ucw.cz>
 * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
 *
 * (part of code stolen from loop.c)
 *
 * 97-3-25 compiled 0-th version, not yet tested it
 *   (it did not work, BTW) (later that day) HEY! it works!
 *   (bit later) hmm, not that much... 2:00am next day:
 *   yes, it works, but it gives something like 50kB/sec
 * 97-4-01 complete rewrite to make it possible for many requests at
 *   once to be processed
 * 97-4-11 Making protocol independent of endianness etc.
 * 97-9-13 Cosmetic changes
 * 98-5-13 Attempt to make 64-bit-clean on 64-bit machines
 * 99-1-11 Attempt to make 64-bit-clean on 32-bit machines <ankry@mif.pg.gda.pl>
 * 01-2-27 Fix to store proper blockcount for kernel (calculated using
 *   BLOCK_SIZE_BITS, not device blocksize) <aga@permonline.ru>
 * 01-3-11 Make nbd work with new Linux block layer code. It now supports
 *   plugging like all the other block devices. Also added in MSG_MORE to
 *   reduce number of partial TCP segments sent. <steve@chygwyn.com>
 * 01-12-6 Fix deadlock condition by making queue locks independent of
 *   the transmit lock. <steve@chygwyn.com>
 * 02-10-11 Allow hung xmit to be aborted via SIGKILL & various fixes.
 *   <Paul.Clements@SteelEye.com> <James.Bottomley@SteelEye.com>
 *
 * possible FIXME: make set_sock / set_blksize / set_size / do_it one syscall
 * why not: would need verify_area and friends, would share yet another
 *          structure with userland
 */
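
/*
 * How it fits together (sketch based on the code below):
 *
 *   do_nbd_request()   the block layer calls us with io_request_lock held;
 *                      each request is queued on lo->queue_head and its
 *                      header (struct nbd_request, 28 bytes on the wire) is
 *                      sent down the socket by nbd_send_req().  The struct
 *                      request pointer itself is copied into the header's
 *                      handle field.
 *   NBD_DO_IT ioctl    the userspace client parks a thread in nbd_do_it(),
 *                      which loops in nbd_read_stat() receiving replies;
 *                      each reply echoes the handle back, which
 *                      nbd_find_request() uses to locate the original
 *                      request before nbd_end_request() completes it.
 */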

#define PARANOIA
#include <linux/major.h>

#include <linux/module.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/ioctl.h>
#include <net/sock.h>

#include <linux/devfs_fs_kernel.h>

#include <asm/uaccess.h>
#include <asm/types.h>

#define MAJOR_NR NBD_MAJOR
#include <linux/nbd.h>

#define LO_MAGIC 0x68797548

static int nbd_blksizes[MAX_NBD];
static int nbd_blksize_bits[MAX_NBD];
static int nbd_sizes[MAX_NBD];
static u64 nbd_bytesizes[MAX_NBD];

static struct nbd_device nbd_dev[MAX_NBD];
static devfs_handle_t devfs_handle;

#define DEBUG( s )
/* #define DEBUG( s ) printk( s ) */

#ifdef PARANOIA
static int requests_in;
static int requests_out;
#endif

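/*
 * Complete a finished request: hand every buffer head back to the block
 * layer (up to date unless errors were recorded) and release the request.
 * Takes io_request_lock itself, so the caller must not hold it.
 */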
static void
nbd_end_request(struct request *req)
{
        struct buffer_head *bh;
        unsigned nsect;
        unsigned long flags;
        int uptodate = (req->errors == 0) ? 1 : 0;

#ifdef PARANOIA
        requests_out++;
#endif
        spin_lock_irqsave(&io_request_lock, flags);
        while ((bh = req->bh) != NULL) {
                nsect = bh->b_size >> 9;
                blk_finished_io(nsect);
                req->bh = bh->b_reqnext;
                bh->b_reqnext = NULL;
                bh->b_end_io(bh, uptodate);
        }
        blkdev_release_request(req);
        spin_unlock_irqrestore(&io_request_lock, flags);
}

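/* Open just bumps the per-device reference count. */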
static int nbd_open(struct inode *inode, struct file *file)
{
        int dev;

        if (!inode)
                return -EINVAL;
        dev = MINOR(inode->i_rdev);
        if (dev >= MAX_NBD)
                return -ENODEV;

        nbd_dev[dev].refcnt++;
        return 0;
}

/*
 * Send or receive packet.  Blocks every signal except SIGKILL for the
 * duration, so a wedged transfer can still be killed but is not
 * interrupted by anything milder.
 */
static int nbd_xmit(int send, struct socket *sock, char *buf, int size, int msg_flags)
{
        mm_segment_t oldfs;
        int result;
        struct msghdr msg;
        struct iovec iov;
        unsigned long flags;
        sigset_t oldset;

        oldfs = get_fs();
        set_fs(get_ds());

        /* Allow interception of SIGKILL only
         * Don't allow other signals to interrupt the transmission */
        spin_lock_irqsave(&current->sigmask_lock, flags);
        oldset = current->blocked;
        sigfillset(&current->blocked);
        sigdelsetmask(&current->blocked, sigmask(SIGKILL));
        recalc_sigpending(current);
        spin_unlock_irqrestore(&current->sigmask_lock, flags);

        do {
                sock->sk->allocation = GFP_NOIO;
                iov.iov_base = buf;
                iov.iov_len = size;
                msg.msg_name = NULL;
                msg.msg_namelen = 0;
                msg.msg_iov = &iov;
                msg.msg_iovlen = 1;
                msg.msg_control = NULL;
                msg.msg_controllen = 0;
                msg.msg_flags = msg_flags | MSG_NOSIGNAL;

                if (send)
                        result = sock_sendmsg(sock, &msg, size);
                else
                        result = sock_recvmsg(sock, &msg, size, 0);

                if (signal_pending(current)) {
                        siginfo_t info;
                        spin_lock_irqsave(&current->sigmask_lock, flags);
                        printk(KERN_WARNING "NBD (pid %d: %s) got signal %d\n",
                                current->pid, current->comm,
                                dequeue_signal(&current->blocked, &info));
                        spin_unlock_irqrestore(&current->sigmask_lock, flags);
                        result = -EINTR;
                        break;
                }

                if (result <= 0) {
#ifdef PARANOIA
                        printk(KERN_ERR "NBD: %s - sock=%ld at buf=%ld, size=%d returned %d.\n",
                                send ? "send" : "receive", (long) sock, (long) buf, size, result);
#endif
                        break;
                }
                size -= result;
                buf += result;
        } while (size > 0);

        spin_lock_irqsave(&current->sigmask_lock, flags);
        current->blocked = oldset;
        recalc_sigpending(current);
        spin_unlock_irqrestore(&current->sigmask_lock, flags);

        set_fs(oldfs);
        return result;
}

#define FAIL( s ) { printk( KERN_ERR "NBD: " s " (result %d)\n", result ); goto error_out; }

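/*
 * Ship one request to the server: the fixed-size header, then, for a
 * write, every buffer in the chain (MSG_MORE on all but the last so the
 * stack can coalesce segments).  Serialised against other senders and
 * against socket teardown by lo->tx_lock.  On failure the error is
 * recorded in req->errors for the caller to notice.
 */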
void nbd_send_req(struct nbd_device *lo, struct request *req)
{
        int result = -1;
        struct nbd_request request;
        unsigned long size = req->nr_sectors << 9;
        struct socket *sock = lo->sock;

        DEBUG("NBD: sending control, ");
        request.magic = htonl(NBD_REQUEST_MAGIC);
        request.type = htonl(req->cmd);
        request.from = cpu_to_be64((u64) req->sector << 9);
        request.len = htonl(size);
        memcpy(request.handle, &req, sizeof(req));

        down(&lo->tx_lock);

        if (!sock || !lo->sock) {
                FAIL("Attempted sendmsg to closed socket.");
        }

        result = nbd_xmit(1, sock, (char *) &request, sizeof(request),
                          req->cmd == WRITE ? MSG_MORE : 0);
        if (result <= 0)
                FAIL("Sendmsg failed for control.");

        if (req->cmd == WRITE) {
                struct buffer_head *bh = req->bh;
                DEBUG("data, ");
                do {
                        result = nbd_xmit(1, sock, bh->b_data, bh->b_size,
                                          bh->b_reqnext == NULL ? 0 : MSG_MORE);
                        if (result <= 0)
                                FAIL("Send data failed.");
                        bh = bh->b_reqnext;
                } while (bh);
        }
        up(&lo->tx_lock);
        return;

error_out:
        up(&lo->tx_lock);
        req->errors++;
}

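/*
 * Walk the device's queue looking for the request whose address matches
 * the handle echoed back by the server; unlink and return it, or NULL if
 * the reply matches nothing we sent.
 */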
static struct request *nbd_find_request(struct nbd_device *lo, char *handle)
{
        struct request *req;
        struct list_head *tmp;
        struct request *xreq;

        memcpy(&xreq, handle, sizeof(xreq));

        spin_lock(&lo->queue_lock);
        list_for_each(tmp, &lo->queue_head) {
                req = list_entry(tmp, struct request, queue);
                if (req != xreq)
                        continue;
                list_del(&req->queue);
                spin_unlock(&lo->queue_lock);
                return req;
        }
        spin_unlock(&lo->queue_lock);
        return NULL;
}

#define HARDFAIL( s ) { printk( KERN_ERR "NBD: " s " (result %d)\n", result ); lo->harderror = result; return NULL; }
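/*
 * Read one reply off the socket: receive the header, match it to a
 * pending request via the handle, and for a READ pull the payload into
 * the request's buffers.  A hard failure stashes the error in
 * lo->harderror so the NBD_DO_IT ioctl can report it.
 */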
struct request *nbd_read_stat(struct nbd_device *lo)
/* NULL returned = something went wrong, inform userspace */
{
        int result;
        struct nbd_reply reply;
        struct request *req;

        DEBUG("reading control, ");
        reply.magic = 0;
        result = nbd_xmit(0, lo->sock, (char *) &reply, sizeof(reply), MSG_WAITALL);
        if (result <= 0)
                HARDFAIL("Recv control failed.");
        req = nbd_find_request(lo, reply.handle);
        if (req == NULL)
                HARDFAIL("Unexpected reply.");

        DEBUG("ok, ");
        if (ntohl(reply.magic) != NBD_REPLY_MAGIC)
                HARDFAIL("Not enough magic.");
        if (ntohl(reply.error))
                FAIL("Other side returned error.");
        if (req->cmd == READ) {
                struct buffer_head *bh = req->bh;
                DEBUG("data, ");
                do {
                        result = nbd_xmit(0, lo->sock, bh->b_data, bh->b_size, MSG_WAITALL);
                        if (result <= 0)
                                HARDFAIL("Recv data failed.");
                        bh = bh->b_reqnext;
                } while (bh);
        }
        DEBUG("done.\n");
        return req;

/* Can we get here? Yes, if other side returns error */
error_out:
        req->errors++;
        return req;
}

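/*
 * Receive loop, run from process context by the NBD_DO_IT ioctl.  Keeps
 * completing replies until the socket dies or a corrupted reply shows up.
 */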
void nbd_do_it(struct nbd_device *lo)
{
        struct request *req;

        while (1) {
                req = nbd_read_stat(lo);

                if (!req) {
                        printk(KERN_ALERT "req should never be null\n");
                        goto out;
                }
#ifdef PARANOIA
                if (lo != &nbd_dev[MINOR(req->rq_dev)]) {
                        printk(KERN_ALERT "NBD: request corrupted!\n");
                        continue;
                }
                if (lo->magic != LO_MAGIC) {
                        printk(KERN_ALERT "NBD: nbd_dev[] corrupted: Not enough magic\n");
                        goto out;
                }
#endif

                nbd_end_request(req);
        }
out:
        return;
}

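/*
 * Fail every request still waiting for a reply.  Called once the socket
 * is gone, so nothing can complete these normally any more.
 */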
void nbd_clear_que(struct nbd_device *lo)
{
        struct request *req;

#ifdef PARANOIA
        if (lo->magic != LO_MAGIC) {
                printk(KERN_ERR "NBD: nbd_dev[] corrupted: Not enough magic when clearing!\n");
                return;
        }
#endif
        do {
                req = NULL;
                spin_lock(&lo->queue_lock);
                if (!list_empty(&lo->queue_head)) {
                        req = list_entry(lo->queue_head.next, struct request, queue);
                        list_del(&req->queue);
                }
                spin_unlock(&lo->queue_lock);
                if (req) {
                        req->errors++;
                        nbd_end_request(req);
                }
        } while (req);
}

/*
 * We always wait for the result of a write, for now.  It would be nice to
 * make it optional in the future, e.g.:
 * if ((req->cmd == WRITE) && (lo->flags & NBD_WRITE_NOCHK))
 *         { printk("Warning: Ignoring result!\n"); nbd_end_request(req); }
 */

#undef FAIL
#define FAIL( s ) { printk( KERN_ERR "NBD, minor %d: " s "\n", dev ); goto error_out; }

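/*
 * Request function, called by the block layer with io_request_lock held.
 * Each request is moved from the global queue onto lo->queue_head (where
 * the reply will find it) and then transmitted.  io_request_lock is
 * dropped around the actual send, since nbd_xmit can sleep.
 */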
static void do_nbd_request(request_queue_t *q)
{
        struct request *req;
        int dev = 0;
        struct nbd_device *lo;

        while (!QUEUE_EMPTY) {
                req = CURRENT;
#ifdef PARANOIA
                if (!req)
                        FAIL("Queue not empty but no request?");
#endif
                dev = MINOR(req->rq_dev);
#ifdef PARANOIA
                if (dev >= MAX_NBD)
                        FAIL("Minor too big."); /* Probably cannot happen */
#endif
                lo = &nbd_dev[dev];
                if (!lo->file)
                        FAIL("Request when not ready.");
                if ((req->cmd == WRITE) && (lo->flags & NBD_READ_ONLY))
                        FAIL("Write on read-only");
#ifdef PARANOIA
                if (lo->magic != LO_MAGIC)
                        FAIL("nbd[] is not magical!");
                requests_in++;
#endif
                req->errors = 0;
                blkdev_dequeue_request(req);
                spin_unlock_irq(&io_request_lock);

                spin_lock(&lo->queue_lock);
                if (!lo->file) {
                        spin_unlock(&lo->queue_lock);
                        printk(KERN_ERR "nbd: failed between accept and semaphore, file lost\n");
                        req->errors++;
                        nbd_end_request(req);
                        spin_lock_irq(&io_request_lock);
                        continue;
                }

                list_add_tail(&req->queue, &lo->queue_head);
                spin_unlock(&lo->queue_lock);

                nbd_send_req(lo, req);
                if (req->errors) {
                        printk(KERN_ERR "nbd: nbd_send_req failed\n");
                        spin_lock(&lo->queue_lock);
                        list_del(&req->queue);
                        spin_unlock(&lo->queue_lock);
                        nbd_end_request(req);
                        spin_lock_irq(&io_request_lock);
                        continue;
                }

                spin_lock_irq(&io_request_lock);
                continue;

        error_out:
                req->errors++;
                blkdev_dequeue_request(req);
                spin_unlock(&io_request_lock);
                nbd_end_request(req);
                spin_lock(&io_request_lock);
        }
        return;
}

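/*
 * Control interface.  The size queries are innocent; everything that can
 * attach sockets, tear down queues or start the receiver demands
 * CAP_SYS_ADMIN.  A minimal userspace client would, in outline (sketch
 * only, error handling omitted, device path depends on the setup):
 *
 *      int nbd = open("/dev/nbd0", O_RDWR);
 *      ioctl(nbd, NBD_SET_BLKSIZE, 1024);
 *      ioctl(nbd, NBD_SET_SIZE_BLOCKS, blocks);
 *      ioctl(nbd, NBD_SET_SOCK, connected_tcp_fd);
 *      ioctl(nbd, NBD_DO_IT);            // blocks until disconnect
 */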
static int nbd_ioctl(struct inode *inode, struct file *file,
                     unsigned int cmd, unsigned long arg)
{
        struct nbd_device *lo;
        int dev, error, temp;
        struct request sreq;

        if (!inode)
                return -EINVAL;
        dev = MINOR(inode->i_rdev);
        if (dev >= MAX_NBD)
                return -ENODEV;

        lo = &nbd_dev[dev];

        /* these are innocent, but.... */
        switch (cmd) {
        case BLKGETSIZE:
                return put_user(nbd_bytesizes[dev] >> 9, (unsigned long *) arg);
        case BLKGETSIZE64:
                return put_user((u64) nbd_bytesizes[dev], (u64 *) arg);
        }

        /* ... anyone capable of any of the below ioctls can do *real bad*
           things */
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        switch (cmd) {
        case NBD_DISCONNECT:
                printk("NBD_DISCONNECT\n");
                sreq.cmd = 2; /* shutdown command */
                if (!lo->sock)
                        return -EINVAL;
                nbd_send_req(lo, &sreq);
                return 0;

        case NBD_CLEAR_SOCK:
                error = 0;
                down(&lo->tx_lock);
                lo->sock = NULL;
                up(&lo->tx_lock);
                spin_lock(&lo->queue_lock);
                file = lo->file;
                lo->file = NULL;
                spin_unlock(&lo->queue_lock);
                nbd_clear_que(lo);
                spin_lock(&lo->queue_lock);
                if (!list_empty(&lo->queue_head)) {
                        printk(KERN_ERR "nbd: disconnect: some requests are in progress -> please try again.\n");
                        error = -EBUSY;
                }
                spin_unlock(&lo->queue_lock);
                if (file)
                        fput(file);
                return error;
        case NBD_SET_SOCK:
                if (lo->file)
                        return -EBUSY;
                error = -EINVAL;
                file = fget(arg);
                if (file) {
                        inode = file->f_dentry->d_inode;
                        /* N.B. Should verify that it's a socket */
                        lo->file = file;
                        lo->sock = &inode->u.socket_i;
                        error = 0;
                }
                return error;
        case NBD_SET_BLKSIZE:
                if ((arg & (arg - 1)) || (arg < 512) || (arg > PAGE_SIZE))
                        return -EINVAL;
                nbd_blksizes[dev] = arg;
                temp = arg >> 9;
                nbd_blksize_bits[dev] = 9;
                while (temp > 1) {
                        nbd_blksize_bits[dev]++;
                        temp >>= 1;
                }
                nbd_bytesizes[dev] &= ~(nbd_blksizes[dev] - 1);
                nbd_sizes[dev] = nbd_bytesizes[dev] >> BLOCK_SIZE_BITS;
                return 0;
        case NBD_SET_SIZE:
                nbd_bytesizes[dev] = arg & ~(nbd_blksizes[dev] - 1);
                nbd_sizes[dev] = nbd_bytesizes[dev] >> BLOCK_SIZE_BITS;
                return 0;
        case NBD_SET_SIZE_BLOCKS:
                nbd_bytesizes[dev] = ((u64) arg) << nbd_blksize_bits[dev];
                nbd_sizes[dev] = nbd_bytesizes[dev] >> BLOCK_SIZE_BITS;
                return 0;
        case NBD_DO_IT:
                if (!lo->file)
                        return -EINVAL;
                nbd_do_it(lo);
                /* on return tidy up in case we have a signal */
                /* Forcibly shutdown the socket causing all listeners
                 * to error
                 *
                 * FIXME: This code is duplicated from sys_shutdown, but
                 * there should be a more generic interface rather than
                 * calling socket ops directly here */
                down(&lo->tx_lock);
                if (lo->sock) {
                        printk(KERN_WARNING "nbd: shutting down socket\n");
                        lo->sock->ops->shutdown(lo->sock,
                                SEND_SHUTDOWN | RCV_SHUTDOWN);
                        lo->sock = NULL;
                }
                up(&lo->tx_lock);
                spin_lock(&lo->queue_lock);
                file = lo->file;
                lo->file = NULL;
                spin_unlock(&lo->queue_lock);
                nbd_clear_que(lo);
                printk(KERN_WARNING "nbd: queue cleared\n");
                if (file)
                        fput(file);
                return lo->harderror;
        case NBD_CLEAR_QUE:
                down(&lo->tx_lock);
                if (lo->sock) {
                        up(&lo->tx_lock);
                        return 0; /* probably should be error, but that would
                                   * break "nbd-client -d", so just return 0 */
                }
                up(&lo->tx_lock);
                nbd_clear_que(lo);
                return 0;
#ifdef PARANOIA
        case NBD_PRINT_DEBUG:
                printk(KERN_INFO "NBD device %d: next = %p, prev = %p. Global: in %d, out %d\n",
                        dev, lo->queue_head.next, lo->queue_head.prev,
                        requests_in, requests_out);
                return 0;
#endif
        }
        return -EINVAL;
}

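/* Release drops the reference taken in nbd_open. */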
static int nbd_release(struct inode *inode, struct file *file)
{
        struct nbd_device *lo;
        int dev;

        if (!inode)
                return -ENODEV;
        dev = MINOR(inode->i_rdev);
        if (dev >= MAX_NBD)
                return -ENODEV;
        lo = &nbd_dev[dev];
        if (lo->refcnt <= 0)
                printk(KERN_ALERT "nbd_release: refcount(%d) <= 0\n", lo->refcnt);
        lo->refcnt--;
        /* N.B. Doesn't lo->file need an fput?? */
        return 0;
}

static struct block_device_operations nbd_fops =
{
        owner:          THIS_MODULE,
        open:           nbd_open,
        release:        nbd_release,
        ioctl:          nbd_ioctl,
};

/*
 * And here should be modules and kernel interface
 *  (Just smiley confuses emacs :-)
 */

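/*
 * Module init: sanity-check the on-wire header size, grab the major,
 * wire up the request queue and initialise every per-device slot with a
 * 1KB block size and a 2TB default capacity.
 */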
static int __init nbd_init(void)
{
        int i;

        if (sizeof(struct nbd_request) != 28) {
                printk(KERN_CRIT "Sizeof nbd_request needs to be 28 in order to work!\n");
                return -EIO;
        }

        if (register_blkdev(MAJOR_NR, "nbd", &nbd_fops)) {
                printk(KERN_ERR "Unable to get major number %d for NBD\n",
                        MAJOR_NR);
                return -EIO;
        }
#ifdef MODULE
        printk("nbd: registered device at major %d\n", MAJOR_NR);
#endif
        blksize_size[MAJOR_NR] = nbd_blksizes;
        blk_size[MAJOR_NR] = nbd_sizes;
        blk_init_queue(BLK_DEFAULT_QUEUE(MAJOR_NR), do_nbd_request);
        blk_queue_headactive(BLK_DEFAULT_QUEUE(MAJOR_NR), 0);
        for (i = 0; i < MAX_NBD; i++) {
                nbd_dev[i].refcnt = 0;
                nbd_dev[i].file = NULL;
                nbd_dev[i].magic = LO_MAGIC;
                nbd_dev[i].flags = 0;
                spin_lock_init(&nbd_dev[i].queue_lock);
                INIT_LIST_HEAD(&nbd_dev[i].queue_head);
                init_MUTEX(&nbd_dev[i].tx_lock);
                nbd_blksizes[i] = 1024;
                nbd_blksize_bits[i] = 10;
                nbd_bytesizes[i] = ((u64) 0x7ffffc00) << 10; /* 2TB */
                nbd_sizes[i] = nbd_bytesizes[i] >> BLOCK_SIZE_BITS;
                register_disk(NULL, MKDEV(MAJOR_NR, i), 1, &nbd_fops,
                                nbd_bytesizes[i] >> 9);
        }
        devfs_handle = devfs_mk_dir(NULL, "nbd", NULL);
        devfs_register_series(devfs_handle, "%u", MAX_NBD,
                        DEVFS_FL_DEFAULT, MAJOR_NR, 0,
                        S_IFBLK | S_IRUSR | S_IWUSR,
                        &nbd_fops, NULL);

        return 0;
}

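/* Module exit: undo everything nbd_init set up. */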
static void __exit nbd_cleanup(void)
{
        devfs_unregister(devfs_handle);
        blk_cleanup_queue(BLK_DEFAULT_QUEUE(MAJOR_NR));

        if (unregister_blkdev(MAJOR_NR, "nbd") != 0)
                printk("nbd: cleanup_module failed\n");
        else
                printk("nbd: module cleaned up.\n");
}

module_init(nbd_init);
module_exit(nbd_cleanup);

MODULE_DESCRIPTION("Network Block Device");
MODULE_LICENSE("GPL");