/*
 * Network block device - make block devices work over TCP
 *
 * Note that you can not swap over this thing, yet. Seems to work but
 * deadlocks sometimes - you can not swap over TCP in general.
 *
 * Copyright 1997-2000 Pavel Machek
 * Parts copyright 2001 Steven Whitehouse
 *
 * (part of code stolen from loop.c)
 *
 * 97-3-25 compiled 0-th version, not yet tested it
 *   (it did not work, BTW) (later that day) HEY! it works!
 *   (bit later) hmm, not that much... 2:00am next day:
 *   yes, it works, but it gives something like 50kB/sec
 * 97-4-01 complete rewrite to make it possible for many requests at
 *   once to be processed
 * 97-4-11 Making protocol independent of endianness etc.
 * 97-9-13 Cosmetic changes
 * 98-5-13 Attempt to make 64-bit-clean on 64-bit machines
 * 99-1-11 Attempt to make 64-bit-clean on 32-bit machines
 * 01-2-27 Fix to store proper blockcount for kernel (calculated using
 *   BLOCK_SIZE_BITS, not device blocksize)
 * 01-3-11 Make nbd work with new Linux block layer code. It now supports
 *   plugging like all the other block devices. Also added in MSG_MORE to
 *   reduce number of partial TCP segments sent.
 * 01-12-6 Fix deadlock condition by making queue locks independent of
 *   the transmit lock.
 * 02-10-11 Allow hung xmit to be aborted via SIGKILL & various fixes.
* * * possible FIXME: make set_sock / set_blksize / set_size / do_it one syscall * why not: would need verify_area and friends, would share yet another * structure with userland */ #define PARANOIA #include #include #include #include #include #include #include #include #include #include #include #include #include #define MAJOR_NR NBD_MAJOR #include #define LO_MAGIC 0x68797548 static int nbd_blksizes[MAX_NBD]; static int nbd_blksize_bits[MAX_NBD]; static int nbd_sizes[MAX_NBD]; static u64 nbd_bytesizes[MAX_NBD]; static struct nbd_device nbd_dev[MAX_NBD]; static devfs_handle_t devfs_handle; #define DEBUG( s ) /* #define DEBUG( s ) printk( s ) */ #ifdef PARANOIA static int requests_in; static int requests_out; #endif static void nbd_end_request(struct request *req) { struct buffer_head *bh; unsigned nsect; unsigned long flags; int uptodate = (req->errors == 0) ? 1 : 0; #ifdef PARANOIA requests_out++; #endif spin_lock_irqsave(&io_request_lock, flags); while((bh = req->bh) != NULL) { nsect = bh->b_size >> 9; blk_finished_io(nsect); req->bh = bh->b_reqnext; bh->b_reqnext = NULL; bh->b_end_io(bh, uptodate); } blkdev_release_request(req); spin_unlock_irqrestore(&io_request_lock, flags); } static int nbd_open(struct inode *inode, struct file *file) { int dev; if (!inode) return -EINVAL; dev = MINOR(inode->i_rdev); if (dev >= MAX_NBD) return -ENODEV; nbd_dev[dev].refcnt++; return 0; } /* * Send or receive packet. 
*/ static int nbd_xmit(int send, struct socket *sock, char *buf, int size, int msg_flags) { mm_segment_t oldfs; int result; struct msghdr msg; struct iovec iov; unsigned long flags; sigset_t oldset; oldfs = get_fs(); set_fs(get_ds()); /* Allow interception of SIGKILL only * Don't allow other signals to interrupt the transmission */ spin_lock_irqsave(¤t->sigmask_lock, flags); oldset = current->blocked; sigfillset(¤t->blocked); sigdelsetmask(¤t->blocked, sigmask(SIGKILL)); recalc_sigpending(current); spin_unlock_irqrestore(¤t->sigmask_lock, flags); do { sock->sk->allocation = GFP_NOIO; iov.iov_base = buf; iov.iov_len = size; msg.msg_name = NULL; msg.msg_namelen = 0; msg.msg_iov = &iov; msg.msg_iovlen = 1; msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_namelen = 0; msg.msg_flags = msg_flags | MSG_NOSIGNAL; if (send) result = sock_sendmsg(sock, &msg, size); else result = sock_recvmsg(sock, &msg, size, 0); if (signal_pending(current)) { siginfo_t info; spin_lock_irqsave(¤t->sigmask_lock, flags); printk(KERN_WARNING "NBD (pid %d: %s) got signal %d\n", current->pid, current->comm, dequeue_signal(¤t->blocked, &info)); spin_unlock_irqrestore(¤t->sigmask_lock, flags); result = -EINTR; break; } if (result <= 0) { #ifdef PARANOIA printk(KERN_ERR "NBD: %s - sock=%ld at buf=%ld, size=%d returned %d.\n", send ? 
"send" : "receive", (long) sock, (long) buf, size, result); #endif break; } size -= result; buf += result; } while (size > 0); spin_lock_irqsave(¤t->sigmask_lock, flags); current->blocked = oldset; recalc_sigpending(current); spin_unlock_irqrestore(¤t->sigmask_lock, flags); set_fs(oldfs); return result; } #define FAIL( s ) { printk( KERN_ERR "NBD: " s "(result %d)\n", result ); goto error_out; } void nbd_send_req(struct nbd_device *lo, struct request *req) { int result = -1; struct nbd_request request; unsigned long size = req->nr_sectors << 9; struct socket *sock = lo->sock; DEBUG("NBD: sending control, "); request.magic = htonl(NBD_REQUEST_MAGIC); request.type = htonl(req->cmd); request.from = cpu_to_be64( (u64) req->sector << 9); request.len = htonl(size); memcpy(request.handle, &req, sizeof(req)); down(&lo->tx_lock); if (!sock || !lo->sock) { FAIL("Attempted sendmsg to closed socket\n"); } result = nbd_xmit(1, sock, (char *) &request, sizeof(request), req->cmd == WRITE ? MSG_MORE : 0); if (result <= 0) FAIL("Sendmsg failed for control."); if (req->cmd == WRITE) { struct buffer_head *bh = req->bh; DEBUG("data, "); do { result = nbd_xmit(1, sock, bh->b_data, bh->b_size, bh->b_reqnext == NULL ? 
0 : MSG_MORE); if (result <= 0) FAIL("Send data failed."); bh = bh->b_reqnext; } while(bh); } up(&lo->tx_lock); return; error_out: up(&lo->tx_lock); req->errors++; } static struct request *nbd_find_request(struct nbd_device *lo, char *handle) { struct request *req; struct list_head *tmp; struct request *xreq; memcpy(&xreq, handle, sizeof(xreq)); spin_lock(&lo->queue_lock); list_for_each(tmp, &lo->queue_head) { req = list_entry(tmp, struct request, queue); if (req != xreq) continue; list_del(&req->queue); spin_unlock(&lo->queue_lock); return req; } spin_unlock(&lo->queue_lock); return NULL; } #define HARDFAIL( s ) { printk( KERN_ERR "NBD: " s "(result %d)\n", result ); lo->harderror = result; return NULL; } struct request *nbd_read_stat(struct nbd_device *lo) /* NULL returned = something went wrong, inform userspace */ { int result; struct nbd_reply reply; struct request *req; DEBUG("reading control, "); reply.magic = 0; result = nbd_xmit(0, lo->sock, (char *) &reply, sizeof(reply), MSG_WAITALL); if (result <= 0) HARDFAIL("Recv control failed."); req = nbd_find_request(lo, reply.handle); if (req == NULL) HARDFAIL("Unexpected reply"); DEBUG("ok, "); if (ntohl(reply.magic) != NBD_REPLY_MAGIC) HARDFAIL("Not enough magic."); if (ntohl(reply.error)) FAIL("Other side returned error."); if (req->cmd == READ) { struct buffer_head *bh = req->bh; DEBUG("data, "); do { result = nbd_xmit(0, lo->sock, bh->b_data, bh->b_size, MSG_WAITALL); if (result <= 0) HARDFAIL("Recv data failed."); bh = bh->b_reqnext; } while(bh); } DEBUG("done.\n"); return req; /* Can we get here? 
Yes, if other side returns error */ error_out: req->errors++; return req; } void nbd_do_it(struct nbd_device *lo) { struct request *req; while (1) { req = nbd_read_stat(lo); if (!req) { printk(KERN_ALERT "req should never be null\n" ); goto out; } #ifdef PARANOIA if (lo != &nbd_dev[MINOR(req->rq_dev)]) { printk(KERN_ALERT "NBD: request corrupted!\n"); continue; } if (lo->magic != LO_MAGIC) { printk(KERN_ALERT "NBD: nbd_dev[] corrupted: Not enough magic\n"); goto out; } #endif nbd_end_request(req); } out: return; } void nbd_clear_que(struct nbd_device *lo) { struct request *req; #ifdef PARANOIA if (lo->magic != LO_MAGIC) { printk(KERN_ERR "NBD: nbd_dev[] corrupted: Not enough magic when clearing!\n"); return; } #endif do { req = NULL; spin_lock(&lo->queue_lock); if (!list_empty(&lo->queue_head)) { req = list_entry(lo->queue_head.next, struct request, queue); list_del(&req->queue); } spin_unlock(&lo->queue_lock); if (req) { req->errors++; nbd_end_request(req); } } while(req); } /* * We always wait for result of write, for now. 
It would be nice to make it optional * in future * if ((req->cmd == WRITE) && (lo->flags & NBD_WRITE_NOCHK)) * { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); } */ #undef FAIL #define FAIL( s ) { printk( KERN_ERR "NBD, minor %d: " s "\n", dev ); goto error_out; } static void do_nbd_request(request_queue_t * q) { struct request *req; int dev = 0; struct nbd_device *lo; while (!QUEUE_EMPTY) { req = CURRENT; #ifdef PARANOIA if (!req) FAIL("que not empty but no request?"); #endif dev = MINOR(req->rq_dev); #ifdef PARANOIA if (dev >= MAX_NBD) FAIL("Minor too big."); /* Probably can not happen */ #endif lo = &nbd_dev[dev]; if (!lo->file) FAIL("Request when not-ready."); if ((req->cmd == WRITE) && (lo->flags & NBD_READ_ONLY)) FAIL("Write on read-only"); #ifdef PARANOIA if (lo->magic != LO_MAGIC) FAIL("nbd[] is not magical!"); requests_in++; #endif req->errors = 0; blkdev_dequeue_request(req); spin_unlock_irq(&io_request_lock); spin_lock(&lo->queue_lock); if (!lo->file) { spin_unlock(&lo->queue_lock); printk(KERN_ERR "nbd: failed between accept and semaphore, file lost\n"); req->errors++; nbd_end_request(req); spin_lock_irq(&io_request_lock); continue; } list_add_tail(&req->queue, &lo->queue_head); spin_unlock(&lo->queue_lock); nbd_send_req(lo, req); if (req->errors) { printk(KERN_ERR "nbd: nbd_send_req failed\n"); spin_lock(&lo->queue_lock); list_del(&req->queue); spin_unlock(&lo->queue_lock); nbd_end_request(req); spin_lock_irq(&io_request_lock); continue; } spin_lock_irq(&io_request_lock); continue; error_out: req->errors++; blkdev_dequeue_request(req); spin_unlock(&io_request_lock); nbd_end_request(req); spin_lock(&io_request_lock); } return; } static int nbd_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { struct nbd_device *lo; int dev, error, temp; struct request sreq ; if (!inode) return -EINVAL; dev = MINOR(inode->i_rdev); if (dev >= MAX_NBD) return -ENODEV; lo = &nbd_dev[dev]; /* these are innocent, but.... 
*/ switch (cmd) { case BLKGETSIZE: return put_user(nbd_bytesizes[dev] >> 9, (unsigned long *) arg); case BLKGETSIZE64: return put_user((u64)nbd_bytesizes[dev], (u64 *) arg); } /* ... anyone capable of any of the below ioctls can do *real bad* things */ if (!capable(CAP_SYS_ADMIN)) return -EPERM; switch (cmd) { case NBD_DISCONNECT: printk("NBD_DISCONNECT\n"); sreq.cmd=2 ; /* shutdown command */ if (!lo->sock) return -EINVAL; nbd_send_req(lo, &sreq); return 0 ; case NBD_CLEAR_SOCK: error = 0; down(&lo->tx_lock); lo->sock = NULL; up(&lo->tx_lock); spin_lock(&lo->queue_lock); file = lo->file; lo->file = NULL; spin_unlock(&lo->queue_lock); nbd_clear_que(lo); spin_lock(&lo->queue_lock); if (!list_empty(&lo->queue_head)) { printk(KERN_ERR "nbd: disconnect: some requests are in progress -> please try again.\n"); error = -EBUSY; } spin_unlock(&lo->queue_lock); if (file) fput(file); return error; case NBD_SET_SOCK: if (lo->file) return -EBUSY; error = -EINVAL; file = fget(arg); if (file) { inode = file->f_dentry->d_inode; /* N.B. 
Should verify that it's a socket */ lo->file = file; lo->sock = &inode->u.socket_i; error = 0; } return error; case NBD_SET_BLKSIZE: if ((arg & (arg-1)) || (arg < 512) || (arg > PAGE_SIZE)) return -EINVAL; nbd_blksizes[dev] = arg; temp = arg >> 9; nbd_blksize_bits[dev] = 9; while (temp > 1) { nbd_blksize_bits[dev]++; temp >>= 1; } nbd_bytesizes[dev] &= ~(nbd_blksizes[dev]-1); nbd_sizes[dev] = nbd_bytesizes[dev] >> BLOCK_SIZE_BITS; return 0; case NBD_SET_SIZE: nbd_bytesizes[dev] = arg & ~(nbd_blksizes[dev]-1); nbd_sizes[dev] = nbd_bytesizes[dev] >> BLOCK_SIZE_BITS; return 0; case NBD_SET_SIZE_BLOCKS: nbd_bytesizes[dev] = ((u64) arg) << nbd_blksize_bits[dev]; nbd_sizes[dev] = nbd_bytesizes[dev] >> BLOCK_SIZE_BITS; return 0; case NBD_DO_IT: if (!lo->file) return -EINVAL; nbd_do_it(lo); /* on return tidy up in case we have a signal */ /* Forcibly shutdown the socket causing all listeners * to error * * FIXME: This code is duplicated from sys_shutdown, but * there should be a more generic interface rather than * calling socket ops directly here */ down(&lo->tx_lock); if (lo->sock) { printk(KERN_WARNING "nbd: shutting down socket\n"); lo->sock->ops->shutdown(lo->sock, SEND_SHUTDOWN|RCV_SHUTDOWN); lo->sock = NULL; } up(&lo->tx_lock); spin_lock(&lo->queue_lock); file = lo->file; lo->file = NULL; spin_unlock(&lo->queue_lock); nbd_clear_que(lo); printk(KERN_WARNING "nbd: queue cleared\n"); if (file) fput(file); return lo->harderror; case NBD_CLEAR_QUE: down(&lo->tx_lock); if (lo->sock) { up(&lo->tx_lock); return 0; /* probably should be error, but that would * break "nbd-client -d", so just return 0 */ } up(&lo->tx_lock); nbd_clear_que(lo); return 0; #ifdef PARANOIA case NBD_PRINT_DEBUG: printk(KERN_INFO "NBD device %d: next = %p, prev = %p. 
Global: in %d, out %d\n", dev, lo->queue_head.next, lo->queue_head.prev, requests_in, requests_out); return 0; #endif } return -EINVAL; } static int nbd_release(struct inode *inode, struct file *file) { struct nbd_device *lo; int dev; if (!inode) return -ENODEV; dev = MINOR(inode->i_rdev); if (dev >= MAX_NBD) return -ENODEV; lo = &nbd_dev[dev]; if (lo->refcnt <= 0) printk(KERN_ALERT "nbd_release: refcount(%d) <= 0\n", lo->refcnt); lo->refcnt--; /* N.B. Doesn't lo->file need an fput?? */ return 0; } static struct block_device_operations nbd_fops = { owner: THIS_MODULE, open: nbd_open, release: nbd_release, ioctl: nbd_ioctl, }; /* * And here should be modules and kernel interface * (Just smiley confuses emacs :-) */ static int __init nbd_init(void) { int i; if (sizeof(struct nbd_request) != 28) { printk(KERN_CRIT "Sizeof nbd_request needs to be 28 in order to work!\n" ); return -EIO; } if (register_blkdev(MAJOR_NR, "nbd", &nbd_fops)) { printk("Unable to get major number %d for NBD\n", MAJOR_NR); return -EIO; } #ifdef MODULE printk("nbd: registered device at major %d\n", MAJOR_NR); #endif blksize_size[MAJOR_NR] = nbd_blksizes; blk_size[MAJOR_NR] = nbd_sizes; blk_init_queue(BLK_DEFAULT_QUEUE(MAJOR_NR), do_nbd_request); blk_queue_headactive(BLK_DEFAULT_QUEUE(MAJOR_NR), 0); for (i = 0; i < MAX_NBD; i++) { nbd_dev[i].refcnt = 0; nbd_dev[i].file = NULL; nbd_dev[i].magic = LO_MAGIC; nbd_dev[i].flags = 0; spin_lock_init(&nbd_dev[i].queue_lock); INIT_LIST_HEAD(&nbd_dev[i].queue_head); init_MUTEX(&nbd_dev[i].tx_lock); nbd_blksizes[i] = 1024; nbd_blksize_bits[i] = 10; nbd_bytesizes[i] = ((u64)0x7ffffc00) << 10; /* 2TB */ nbd_sizes[i] = nbd_bytesizes[i] >> BLOCK_SIZE_BITS; register_disk(NULL, MKDEV(MAJOR_NR,i), 1, &nbd_fops, nbd_bytesizes[i]>>9); } devfs_handle = devfs_mk_dir (NULL, "nbd", NULL); devfs_register_series (devfs_handle, "%u", MAX_NBD, DEVFS_FL_DEFAULT, MAJOR_NR, 0, S_IFBLK | S_IRUSR | S_IWUSR, &nbd_fops, NULL); return 0; } static void __exit nbd_cleanup(void) { 
devfs_unregister (devfs_handle); blk_cleanup_queue(BLK_DEFAULT_QUEUE(MAJOR_NR)); if (unregister_blkdev(MAJOR_NR, "nbd") != 0) printk("nbd: cleanup_module failed\n"); else printk("nbd: module cleaned up.\n"); } module_init(nbd_init); module_exit(nbd_cleanup); MODULE_DESCRIPTION("Network Block Device"); MODULE_LICENSE("GPL");