1 /*
2 * linux/drivers/block/ll_rw_blk.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
6 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
7 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
8 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July 2000
9 */
10
11 /*
12 * This handles all read/write requests to block devices
13 */
14 #include <linux/sched.h>
15 #include <linux/kernel.h>
16 #include <linux/kernel_stat.h>
17 #include <linux/errno.h>
18 #include <linux/string.h>
19 #include <linux/config.h>
20 #include <linux/locks.h>
21 #include <linux/mm.h>
22 #include <linux/swap.h>
23 #include <linux/init.h>
24 #include <linux/smp_lock.h>
25 #include <linux/completion.h>
26 #include <linux/bootmem.h>
27
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <linux/blk.h>
31 #include <linux/highmem.h>
32 #include <linux/slab.h>
33 #include <linux/module.h>
34
35 /*
36 * MAC Floppy IWM hooks
37 */
38
39 #ifdef CONFIG_MAC_FLOPPY_IWM
40 extern int mac_floppy_init(void);
41 #endif
42
43 /*
44 * For the allocated request tables
45 */
46 static kmem_cache_t *request_cachep;
47
48 /*
49 * The "disk" task queue is used to start the actual requests
50 * after a plug
51 */
52 DECLARE_TASK_QUEUE(tq_disk);
53
54 /*
55 * Protect the request list against multiple users..
56 *
57 * With this spinlock the Linux block IO subsystem is 100% SMP threaded
58 * from the IRQ event side, and almost 100% SMP threaded from the syscall
59 * side (we still have to protect against block device array operations,
60 * and the do_request() side is still unsafe; the kernel lock currently
61 * protects that part).
62 *
63 * There is a fair chance that things will work just OK if these functions
64 * are called with no global kernel lock held ...
65 */
66 spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED;
67
68 /* This specifies how many sectors to read ahead on the disk. */
69
70 int read_ahead[MAX_BLKDEV];
71
72 /* blk_dev_struct is:
73 * *request_fn
74 * *current_request
75 */
76 struct blk_dev_struct blk_dev[MAX_BLKDEV]; /* initialized by blk_dev_init() */
77
78 /*
79 * blk_size contains the size of all block-devices in units of 1024-byte
80 * blocks:
81 *
82 * blk_size[MAJOR][MINOR]
83 *
84 * if (!blk_size[MAJOR]) then no minor size checking is done.
85 */
86 int * blk_size[MAX_BLKDEV];
87
88 /*
89 * blksize_size contains the size of all block-devices:
90 *
91 * blksize_size[MAJOR][MINOR]
92 *
93 * if (!blksize_size[MAJOR]) then 1024 bytes is assumed.
94 */
95 int * blksize_size[MAX_BLKDEV];
96
97 /*
98 * hardsect_size contains the size of the hardware sector of a device.
99 *
100 * hardsect_size[MAJOR][MINOR]
101 *
102 * if (!hardsect_size[MAJOR])
103 * then 512 bytes is assumed.
104 * else
105 * sector_size is hardsect_size[MAJOR][MINOR]
106 * This is currently set by some scsi devices and read by the msdos fs driver.
107 * Other uses may appear later.
108 */
109 int * hardsect_size[MAX_BLKDEV];
110
111 /*
112 * The following tunes the read-ahead algorithm in mm/filemap.c
113 */
114 int * max_readahead[MAX_BLKDEV];
115
116 /*
117 * Max number of sectors per request
118 */
119 int * max_sectors[MAX_BLKDEV];
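/*
 * Illustration only (not taken from this file): a sketch of how a low-level
 * driver typically publishes the per-major tables above at init time.
 * MY_MAJOR, MY_MINORS and the my_* arrays are hypothetical names, and the
 * values are merely plausible defaults; a table left NULL falls back to the
 * defaults described in the comments above.
 *
 *	static int my_blksizes[MY_MINORS];	// bytes per block, e.g. 1024
 *	static int my_hardsects[MY_MINORS];	// bytes per hardware sector, e.g. 512
 *	static int my_sizes[MY_MINORS];		// device size in 1024-byte units
 *
 *	blksize_size[MY_MAJOR]  = my_blksizes;
 *	hardsect_size[MY_MAJOR] = my_hardsects;
 *	blk_size[MY_MAJOR]      = my_sizes;
 *	read_ahead[MY_MAJOR]    = 8;		// sectors of read-ahead
 */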
120
121 unsigned long blk_max_low_pfn, blk_max_pfn;
122 int blk_nohighio = 0;
123
124 int block_dump = 0;
125
126 static struct timer_list writeback_timer;
127
128 static inline int get_max_sectors(kdev_t dev)
129 {
130 if (!max_sectors[MAJOR(dev)])
131 return MAX_SECTORS;
132 return max_sectors[MAJOR(dev)][MINOR(dev)];
133 }
134
135 static inline request_queue_t *__blk_get_queue(kdev_t dev)
136 {
137 struct blk_dev_struct *bdev = blk_dev + MAJOR(dev);
138
139 if (bdev->queue)
140 return bdev->queue(dev);
141 else
142 return &blk_dev[MAJOR(dev)].request_queue;
143 }
144
145 request_queue_t *blk_get_queue(kdev_t dev)
146 {
147 return __blk_get_queue(dev);
148 }
149
150 static int __blk_cleanup_queue(struct request_list *list)
151 {
152 struct list_head *head = &list->free;
153 struct request *rq;
154 int i = 0;
155
156 while (!list_empty(head)) {
157 rq = list_entry(head->next, struct request, queue);
158 list_del(&rq->queue);
159 kmem_cache_free(request_cachep, rq);
160 i++;
161 };
162
163 if (i != list->count)
164 printk("request list leak!\n");
165
166 list->count = 0;
167 return i;
168 }
169
170 /**
171 * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
172 * @q: the request queue to be released
173 *
174 * Description:
175 * blk_cleanup_queue is the pair to blk_init_queue(). It should
176 * be called when a request queue is being released; typically
177 * when a block device is being de-registered. Currently, its
178 * primary task is to free all the &struct request structures that
179 * were allocated to the queue.
180 * Caveat:
181 * Hopefully the low level driver will have finished any
182 * outstanding requests first...
183 **/
184 void blk_cleanup_queue(request_queue_t * q)
185 {
186 int count = q->nr_requests;
187
188 count -= __blk_cleanup_queue(&q->rq);
189
190 if (count)
191 printk("blk_cleanup_queue: leaked requests (%d)\n", count);
192 if (atomic_read(&q->nr_sectors))
193 printk("blk_cleanup_queue: leaked sectors (%d)\n", atomic_read(&q->nr_sectors));
194
195 memset(q, 0, sizeof(*q));
196 }
197
198 /**
199 * blk_queue_headactive - indicate whether head of request queue may be active
200 * @q: The queue which this applies to.
201 * @active: A flag indicating whether the head of the queue is active.
202 *
203 * Description:
204 * The driver for a block device may choose to leave the currently active
205 * request on the request queue, removing it only when it has completed.
206 * The queue handling routines assume this by default for safety reasons
207 * and will not involve the head of the request queue in any merging or
208 * reordering of requests when the queue is unplugged (and thus may be
209 * working on this particular request).
210 *
211 * If a driver removes requests from the queue before processing them, then
212 * it may indicate that it does so, thereby allowing the head of the queue
213 * to be involved in merging and reordering. This is done by calling
214 * blk_queue_headactive() with an @active flag of %0.
215 *
216 * If a driver processes several requests at once, it must remove them (or
217 * at least all but one of them) from the request queue.
218 *
219 * When a queue is plugged the head will be assumed to be inactive.
220 **/
221
222 void blk_queue_headactive(request_queue_t * q, int active)
223 {
224 q->head_active = active;
225 }
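/*
 * Usage sketch (assumed, not taken from a real driver): a driver that
 * dequeues each request before starting the hardware can tell the block
 * layer that the head of the queue is fair game for merging:
 *
 *	blk_init_queue(q, my_request_fn);	// my_request_fn is hypothetical
 *	blk_queue_headactive(q, 0);		// we always dequeue before use
 *
 * Such a driver must pair this with blkdev_dequeue_request() in its request
 * function before handing the request to the hardware.
 */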
226
227 /**
228 * blk_queue_throttle_sectors - indicates you will call sector throttling funcs
229 * @q: The queue which this applies to.
230 * @active: A flag indicating whether you want sector throttling on
231 *
232 * Description:
233 * The sector throttling code allows us to put a limit on the number of
234 * sectors pending io to the disk at a given time. Passing @active as nonzero
235 * indicates you will call blk_started_sectors and blk_finished_sectors in
236 * addition to calling blk_started_io and blk_finished_io in order to
237 * keep track of the number of sectors in flight.
238 **/
239
240 void blk_queue_throttle_sectors(request_queue_t * q, int active)
241 {
242 q->can_throttle = active;
243 }
244
245 /**
246 * blk_queue_make_request - define an alternate make_request function for a device
247 * @q: the request queue for the device to be affected
248 * @mfn: the alternate make_request function
249 *
250 * Description:
251 * The normal way for &struct buffer_heads to be passed to a device
252 * driver is for them to be collected into requests on a request
253 * queue, and then to allow the device driver to select requests
254 * off that queue when it is ready. This works well for many block
255 * devices. However some block devices (typically virtual devices
256 * such as md or lvm) do not benefit from the processing on the
257 * request queue, and are served best by having the requests passed
258 * directly to them. This can be achieved by providing a function
259 * to blk_queue_make_request().
260 *
261 * Caveat:
262 * The driver that does this *must* be able to deal appropriately
263 * with buffers in "high memory", either by calling bh_kmap() to get
264 * a kernel mapping, or by calling create_bounce() to create a
265 * buffer in normal memory.
266 **/
267
268 void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
269 {
270 q->make_request_fn = mfn;
271 }
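/*
 * Sketch of a stacking driver (in the spirit of md or lvm) that bypasses the
 * request queue entirely.  The my_* names and MY_MAJOR are hypothetical.
 * Per the loop in generic_make_request(), returning nonzero asks for the
 * (possibly remapped) buffer head to be resubmitted to the new b_rdev, while
 * returning 0 means this driver has consumed (or errored) the buffer itself.
 *
 *	static int my_make_request(request_queue_t *q, int rw,
 *				   struct buffer_head *bh)
 *	{
 *		bh->b_rdev = my_map_dev(bh);		// remap to a lower device
 *		bh->b_rsector += my_map_offset(bh);	// adjust the sector
 *		return 1;				// resubmit to b_rdev
 *	}
 *
 *	blk_queue_make_request(BLK_DEFAULT_QUEUE(MY_MAJOR), my_make_request);
 */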
272
273 /**
274 * blk_queue_bounce_limit - set bounce buffer limit for queue
275 * @q: the request queue for the device
276 * @dma_addr: bus address limit
277 *
278 * Description:
279 * Different hardware can have different requirements as to what pages
280 * it can do I/O directly to. A low level driver can call
281 * blk_queue_bounce_limit to have lower memory pages allocated as bounce
282 * buffers for doing I/O to pages residing above @dma_addr. By default
283 * the block layer sets this to the highest numbered "low" memory page.
284 **/
285 void blk_queue_bounce_limit(request_queue_t *q, u64 dma_addr)
286 {
287 unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT;
288
289 q->bounce_pfn = bounce_pfn;
290 }
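/*
 * Illustration only: a hypothetical controller that can address just the
 * first 16MB of memory (an ISA-style limit) would restrict I/O pages like
 * this, so the block layer bounces anything above that boundary:
 *
 *	blk_queue_bounce_limit(q, (u64) 0x00ffffff);
 *
 * By default blk_init_queue() applies BLK_BOUNCE_HIGH, i.e. only highmem
 * pages get bounced.
 */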
291
292
293 /*
294 * can we merge the two segments, or do we need to start a new one?
295 */
296 static inline int __blk_seg_merge_ok(struct buffer_head *bh, struct buffer_head *nxt)
297 {
298 /*
299 * if bh and nxt are contiguous and don't cross a 4g boundary, it's ok
300 */
301 if (BH_CONTIG(bh, nxt) && BH_PHYS_4G(bh, nxt))
302 return 1;
303
304 return 0;
305 }
306
307 int blk_seg_merge_ok(struct buffer_head *bh, struct buffer_head *nxt)
308 {
309 return __blk_seg_merge_ok(bh, nxt);
310 }
311
312 static inline int ll_new_segment(request_queue_t *q, struct request *req, int max_segments)
313 {
314 if (req->nr_segments < max_segments) {
315 req->nr_segments++;
316 return 1;
317 }
318 return 0;
319 }
320
321 static int ll_back_merge_fn(request_queue_t *q, struct request *req,
322 struct buffer_head *bh, int max_segments)
323 {
324 if (__blk_seg_merge_ok(req->bhtail, bh))
325 return 1;
326
327 return ll_new_segment(q, req, max_segments);
328 }
329
330 static int ll_front_merge_fn(request_queue_t *q, struct request *req,
331 struct buffer_head *bh, int max_segments)
332 {
333 if (__blk_seg_merge_ok(bh, req->bh))
334 return 1;
335
336 return ll_new_segment(q, req, max_segments);
337 }
338
339 static int ll_merge_requests_fn(request_queue_t *q, struct request *req,
340 struct request *next, int max_segments)
341 {
342 int total_segments = req->nr_segments + next->nr_segments;
343
344 if (__blk_seg_merge_ok(req->bhtail, next->bh))
345 total_segments--;
346
347 if (total_segments > max_segments)
348 return 0;
349
350 req->nr_segments = total_segments;
351 return 1;
352 }
353
354 /*
355 * "plug" the device if there are no outstanding requests: this will
356 * force the transfer to start only after we have put all the requests
357 * on the list.
358 *
359 * This is called with interrupts off and no requests on the queue.
360 * (and with the request spinlock acquired)
361 */
362 static void generic_plug_device(request_queue_t *q, kdev_t dev)
363 {
364 /*
365 * no need to replug device
366 */
367 if (!list_empty(&q->queue_head) || q->plugged)
368 return;
369
370 q->plugged = 1;
371 queue_task(&q->plug_tq, &tq_disk);
372 }
373
374 /*
375 * remove the plug and let it rip..
376 */
377 static inline void __generic_unplug_device(request_queue_t *q)
378 {
379 if (q->plugged) {
380 q->plugged = 0;
381 if (!list_empty(&q->queue_head))
382 q->request_fn(q);
383 }
384 }
385
386 void generic_unplug_device(void *data)
387 {
388 request_queue_t *q = (request_queue_t *) data;
389 unsigned long flags;
390
391 spin_lock_irqsave(&io_request_lock, flags);
392 __generic_unplug_device(q);
393 spin_unlock_irqrestore(&io_request_lock, flags);
394 }
395
396 /** blk_grow_request_list
397 * @q: The &request_queue_t
398 * @nr_requests: how many requests are desired
 * @max_queue_sectors: how many sectors may be in flight on the queue
399 *
400 * More free requests are added to the queue's free lists, bringing
401 * the total number of requests to @nr_requests.
402 *
403 * The requests are added equally to the request queue's read
404 * and write freelists.
405 *
406 * This function can sleep.
407 *
408 * Returns the (new) number of requests which the queue has available.
409 */
410 int blk_grow_request_list(request_queue_t *q, int nr_requests, int max_queue_sectors)
411 {
412 unsigned long flags;
413 /* Several broken drivers assume that this function doesn't sleep;
414 * this causes system hangs during boot.
415 * As a temporary fix, make the function non-blocking.
416 */
417 spin_lock_irqsave(&io_request_lock, flags);
418 while (q->nr_requests < nr_requests) {
419 struct request *rq;
420
421 rq = kmem_cache_alloc(request_cachep, SLAB_ATOMIC);
422 if (rq == NULL)
423 break;
424 memset(rq, 0, sizeof(*rq));
425 rq->rq_status = RQ_INACTIVE;
426 list_add(&rq->queue, &q->rq.free);
427 q->rq.count++;
428
429 q->nr_requests++;
430 }
431
432 /*
433 * Wakeup waiters after both one quarter of the
434 * max-in-flight queue and one quarter of the requests
435 * are available again.
436 */
437
438 q->batch_requests = q->nr_requests / 4;
439 if (q->batch_requests > 32)
440 q->batch_requests = 32;
441 q->batch_sectors = max_queue_sectors / 4;
442
443 q->max_queue_sectors = max_queue_sectors;
444
445 BUG_ON(!q->batch_sectors);
446 atomic_set(&q->nr_sectors, 0);
447
448 spin_unlock_irqrestore(&io_request_lock, flags);
449 return q->nr_requests;
450 }
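/*
 * Worked example of the batching arithmetic above (numbers are illustrative,
 * not the build-time defaults): growing a queue to 128 requests with
 * max_queue_sectors of 4096 gives batch_requests = min(128 / 4, 32) = 32 and
 * batch_sectors = 4096 / 4 = 1024, so sleepers in __get_request_wait() are
 * only woken once a reasonable amount of room has opened up again.
 */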
451
452 static void blk_init_free_list(request_queue_t *q)
453 {
454 struct sysinfo si;
455 int megs; /* Total memory, in megabytes */
456 int nr_requests, max_queue_sectors = MAX_QUEUE_SECTORS;
457
458 INIT_LIST_HEAD(&q->rq.free);
459 q->rq.count = 0;
460 q->rq.pending[READ] = q->rq.pending[WRITE] = 0;
461 q->nr_requests = 0;
462
463 si_meminfo(&si);
464 megs = si.totalram >> (20 - PAGE_SHIFT);
465 nr_requests = MAX_NR_REQUESTS;
466 if (megs < 30) {
467 nr_requests /= 2;
468 max_queue_sectors /= 2;
469 }
470 /* notice early if anybody screwed the defaults */
471 BUG_ON(!nr_requests);
472 BUG_ON(!max_queue_sectors);
473
474 blk_grow_request_list(q, nr_requests, max_queue_sectors);
475
476 init_waitqueue_head(&q->wait_for_requests);
477
478 spin_lock_init(&q->queue_lock);
479 }
480
481 static int __make_request(request_queue_t * q, int rw, struct buffer_head * bh);
482
483 /**
484 * blk_init_queue - prepare a request queue for use with a block device
485 * @q: The &request_queue_t to be initialised
486 * @rfn: The function to be called to process requests that have been
487 * placed on the queue.
488 *
489 * Description:
490 * If a block device wishes to use the standard request handling procedures,
491 * which sorts requests and coalesces adjacent requests, then it must
492 * call blk_init_queue(). The function @rfn will be called when there
493 * are requests on the queue that need to be processed. If the device
494 * supports plugging, then @rfn may not be called immediately when requests
495 * are available on the queue, but may be called at some time later instead.
496 * Plugged queues are generally unplugged when a buffer belonging to one
497 * of the requests on the queue is needed, or due to memory pressure.
498 *
499 * @rfn is not required, or even expected, to remove all requests off the
500 * queue, but only as many as it can handle at a time. If it does leave
501 * requests on the queue, it is responsible for arranging that the requests
502 * get dealt with eventually.
503 *
504 * A global spin lock $io_request_lock must be held while manipulating the
505 * requests on the request queue.
506 *
507 * The request on the head of the queue is by default assumed to be
508 * potentially active, and it is not considered for re-ordering or merging
509 * whenever the given queue is unplugged. This behaviour can be changed with
510 * blk_queue_headactive().
511 *
512 * Note:
513 * blk_init_queue() must be paired with a blk_cleanup_queue() call
514 * when the block device is deactivated (such as at module unload).
515 **/
516 void blk_init_queue(request_queue_t * q, request_fn_proc * rfn)
517 {
518 INIT_LIST_HEAD(&q->queue_head);
519 elevator_init(&q->elevator, ELEVATOR_LINUS);
520 blk_init_free_list(q);
521 q->request_fn = rfn;
522 q->back_merge_fn = ll_back_merge_fn;
523 q->front_merge_fn = ll_front_merge_fn;
524 q->merge_requests_fn = ll_merge_requests_fn;
525 q->make_request_fn = __make_request;
526 q->plug_tq.sync = 0;
527 q->plug_tq.routine = &generic_unplug_device;
528 q->plug_tq.data = q;
529 q->plugged = 0;
530 q->can_throttle = 0;
531
532 /*
533 * These booleans describe the queue properties. We set the
534 * default (and most common) values here. Other drivers can
535 * use the appropriate functions to alter the queue properties
536 * as appropriate.
537 */
538 q->plug_device_fn = generic_plug_device;
539 q->head_active = 1;
540
541 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
542 }
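/*
 * Pairing sketch (my_request_fn and MY_MAJOR are hypothetical): a
 * conventional driver initialises the default queue for its major number
 * with a request function at load time and releases it again at unload,
 * as the note above requires.
 *
 *	// module init:
 *	blk_init_queue(BLK_DEFAULT_QUEUE(MY_MAJOR), my_request_fn);
 *	blk_queue_headactive(BLK_DEFAULT_QUEUE(MY_MAJOR), 1);	// the default anyway
 *
 *	// module exit:
 *	blk_cleanup_queue(BLK_DEFAULT_QUEUE(MY_MAJOR));
 */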
543
544 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queue);
545 /*
546 * Get a free request. io_request_lock must be held and interrupts
547 * disabled on the way in. Returns NULL if there are no free requests.
548 */
549 static struct request *get_request(request_queue_t *q, int rw)
550 {
551 struct request *rq = NULL;
552 struct request_list *rl = &q->rq;
553
554 if (blk_oversized_queue(q)) {
555 int rlim = q->nr_requests >> 5;
556
557 if (rlim < 4)
558 rlim = 4;
559
560 /*
561 * if its a write, or we have more than a handful of reads
562 * pending, bail out
563 */
564 if ((rw == WRITE) || (rw == READ && rl->pending[READ] > rlim))
565 return NULL;
566 if (blk_oversized_queue_reads(q))
567 return NULL;
568 }
569
570 if (!list_empty(&rl->free)) {
571 rq = blkdev_free_rq(&rl->free);
572 list_del(&rq->queue);
573 rl->count--;
574 rl->pending[rw]++;
575 rq->rq_status = RQ_ACTIVE;
576 rq->cmd = rw;
577 rq->special = NULL;
578 rq->io_account = 0;
579 rq->q = q;
580 }
581
582 return rq;
583 }
584
585 /*
586 * Here's the request allocation design, low latency version:
587 *
588 * 1: Blocking on request exhaustion is a key part of I/O throttling.
589 *
590 * 2: We want to be `fair' to all requesters. We must avoid starvation, and
591 * attempt to ensure that all requesters sleep for a similar duration. Hence
592 * no stealing requests when there are other processes waiting.
593 *
594 * There used to be more here, attempting to allow a process to send in a
595 * number of requests once it has woken up. But, there's no way to
596 * tell if a process has just been woken up, or if it is a new process
597 * coming in to steal requests from the waiters. So, we give up and force
598 * everyone to wait fairly.
599 *
600 * So here's what we do:
601 *
602 * a) A READA requester fails if free_requests < batch_requests
603 *
604 * We don't want READA requests to prevent sleepers from ever
605 * waking. Note that READA is used extremely rarely - a few
606 * filesystems use it for directory readahead.
607 *
608 * When a process wants a new request:
609 *
610 * b) If free_requests == 0, the requester sleeps in FIFO manner, and
611 * the queue full condition is set. The full condition is not
612 * cleared until there are no longer any waiters. Once the full
613 * condition is set, all new io must wait, hopefully for a very
614 * short period of time.
615 *
616 * When a request is released:
617 *
618 * c) If free_requests < batch_requests, do nothing.
619 *
620 * d) If free_requests >= batch_requests, wake up a single waiter.
621 *
622 * As each waiter gets a request, it wakes another waiter. We do this
623 * to prevent a race where an unplug might get run before a request makes
624 * its way onto the queue. The result is a cascade of wakeups, so delaying
625 * the initial wakeup until we've got batch_requests available helps avoid
626 * wakeups where there aren't any requests available yet.
627 */
628
629 static struct request *__get_request_wait(request_queue_t *q, int rw)
630 {
631 register struct request *rq;
632 DECLARE_WAITQUEUE(wait, current);
633
634 add_wait_queue_exclusive(&q->wait_for_requests, &wait);
635
636 do {
637 set_current_state(TASK_UNINTERRUPTIBLE);
638 spin_lock_irq(&io_request_lock);
639 if (blk_oversized_queue(q) || q->rq.count == 0) {
640 __generic_unplug_device(q);
641 spin_unlock_irq(&io_request_lock);
642 schedule();
643 spin_lock_irq(&io_request_lock);
644 }
645 rq = get_request(q, rw);
646 spin_unlock_irq(&io_request_lock);
647 } while (rq == NULL);
648 remove_wait_queue(&q->wait_for_requests, &wait);
649 current->state = TASK_RUNNING;
650
651 return rq;
652 }
653
654 static void get_request_wait_wakeup(request_queue_t *q, int rw)
655 {
656 /*
657 * avoid losing an unplug if a second __get_request_wait did the
658 * generic_unplug_device while our __get_request_wait was running
659 * w/o the queue_lock held and w/ our request out of the queue.
660 */
661 if (waitqueue_active(&q->wait_for_requests))
662 wake_up(&q->wait_for_requests);
663 }
664
665 /* RO fail safe mechanism */
666
667 static long ro_bits[MAX_BLKDEV][8];
668
669 int is_read_only(kdev_t dev)
670 {
671 int minor,major;
672
673 major = MAJOR(dev);
674 minor = MINOR(dev);
675 if (major < 0 || major >= MAX_BLKDEV) return 0;
676 return ro_bits[major][minor >> 5] & (1 << (minor & 31));
677 }
678
679 void set_device_ro(kdev_t dev,int flag)
680 {
681 int minor,major;
682
683 major = MAJOR(dev);
684 minor = MINOR(dev);
685 if (major < 0 || major >= MAX_BLKDEV) return;
686 if (flag) ro_bits[major][minor >> 5] |= 1 << (minor & 31);
687 else ro_bits[major][minor >> 5] &= ~(1 << (minor & 31));
688 }
689
690 inline void drive_stat_acct (kdev_t dev, int rw,
691 unsigned long nr_sectors, int new_io)
692 {
693 unsigned int major = MAJOR(dev);
694 unsigned int index;
695
696 index = disk_index(dev);
697 if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
698 return;
699
700 kstat.dk_drive[major][index] += new_io;
701 if (rw == READ) {
702 kstat.dk_drive_rio[major][index] += new_io;
703 kstat.dk_drive_rblk[major][index] += nr_sectors;
704 } else if (rw == WRITE) {
705 kstat.dk_drive_wio[major][index] += new_io;
706 kstat.dk_drive_wblk[major][index] += nr_sectors;
707 } else
708 printk(KERN_ERR "drive_stat_acct: cmd not R/W?\n");
709 }
710
711 #ifdef CONFIG_BLK_STATS
712 /*
713 * Return up to two hd_structs on which to do IO accounting for a given
714 * request.
715 *
716 * On a partitioned device, we want to account both against the partition
717 * and against the whole disk.
718 */
719 static void locate_hd_struct(struct request *req,
720 struct hd_struct **hd1,
721 struct hd_struct **hd2)
722 {
723 struct gendisk *gd;
724
725 *hd1 = NULL;
726 *hd2 = NULL;
727
728 gd = get_gendisk(req->rq_dev);
729 if (gd && gd->part) {
730 /* Mask out the partition bits: account for the entire disk */
731 int devnr = MINOR(req->rq_dev) >> gd->minor_shift;
732 int whole_minor = devnr << gd->minor_shift;
733
734 *hd1 = &gd->part[whole_minor];
735 if (whole_minor != MINOR(req->rq_dev))
736 *hd2= &gd->part[MINOR(req->rq_dev)];
737 }
738 }
739
740 /*
741 * Round off the performance stats on an hd_struct.
742 *
743 * The average IO queue length and utilisation statistics are maintained
744 * by observing the current state of the queue length and the amount of
745 * time it has been in this state for.
746 * Normally, that accounting is done on IO completion, but that can result
747 * in more than a second's worth of IO being accounted for within any one
748 * second, leading to >100% utilisation. To deal with that, we do a
749 * round-off before returning the results when reading /proc/partitions,
750 * accounting immediately for all queue usage up to the current jiffies and
751 * restarting the counters again.
752 */
753 void disk_round_stats(struct hd_struct *hd)
754 {
755 unsigned long now = jiffies;
756
757 hd->aveq += (hd->ios_in_flight * (jiffies - hd->last_queue_change));
758 hd->last_queue_change = now;
759
760 if (hd->ios_in_flight)
761 hd->io_ticks += (now - hd->last_idle_time);
762 hd->last_idle_time = now;
763 }
764
765 static inline void down_ios(struct hd_struct *hd)
766 {
767 disk_round_stats(hd);
768 --hd->ios_in_flight;
769 }
770
771 static inline void up_ios(struct hd_struct *hd)
772 {
773 disk_round_stats(hd);
774 ++hd->ios_in_flight;
775 }
776
777 static void account_io_start(struct hd_struct *hd, struct request *req,
778 int merge, int sectors)
779 {
780 switch (req->cmd) {
781 case READ:
782 if (merge)
783 hd->rd_merges++;
784 hd->rd_sectors += sectors;
785 break;
786 case WRITE:
787 if (merge)
788 hd->wr_merges++;
789 hd->wr_sectors += sectors;
790 break;
791 }
792 if (!merge)
793 up_ios(hd);
794 }
795
796 static void account_io_end(struct hd_struct *hd, struct request *req)
797 {
798 unsigned long duration = jiffies - req->start_time;
799 switch (req->cmd) {
800 case READ:
801 hd->rd_ticks += duration;
802 hd->rd_ios++;
803 break;
804 case WRITE:
805 hd->wr_ticks += duration;
806 hd->wr_ios++;
807 break;
808 }
809 down_ios(hd);
810 }
811
812 void req_new_io(struct request *req, int merge, int sectors)
813 {
814 struct hd_struct *hd1, *hd2;
815
816 locate_hd_struct(req, &hd1, &hd2);
817 req->io_account = 1;
818 if (hd1)
819 account_io_start(hd1, req, merge, sectors);
820 if (hd2)
821 account_io_start(hd2, req, merge, sectors);
822 }
823
824 void req_merged_io(struct request *req)
825 {
826 struct hd_struct *hd1, *hd2;
827
828 if (unlikely(req->io_account == 0))
829 return;
830 locate_hd_struct(req, &hd1, &hd2);
831 if (hd1)
832 down_ios(hd1);
833 if (hd2)
834 down_ios(hd2);
835 }
836
837 void req_finished_io(struct request *req)
838 {
839 struct hd_struct *hd1, *hd2;
840
841 if (unlikely(req->io_account == 0))
842 return;
843 locate_hd_struct(req, &hd1, &hd2);
844 if (hd1)
845 account_io_end(hd1, req);
846 if (hd2)
847 account_io_end(hd2, req);
848 }
849 EXPORT_SYMBOL(req_finished_io);
850 #endif /* CONFIG_BLK_STATS */
851
852 /*
853 * add-request adds a request to the linked list.
854 * io_request_lock is held and interrupts disabled, as we muck with the
855 * request queue list.
856 *
857 * By this point, req->cmd is always either READ/WRITE, never READA,
858 * which is important for drive_stat_acct() above.
859 */
860 static inline void add_request(request_queue_t * q, struct request * req,
861 struct list_head *insert_here)
862 {
863 drive_stat_acct(req->rq_dev, req->cmd, req->nr_sectors, 1);
864
865 if (!q->plugged && q->head_active && insert_here == &q->queue_head) {
866 spin_unlock_irq(&io_request_lock);
867 BUG();
868 }
869
870 /*
871 * elevator indicated where it wants this request to be
872 * inserted at elevator_merge time
873 */
874 list_add(&req->queue, insert_here);
875 }
876
877 /*
878 * Must be called with io_request_lock held and interrupts disabled
879 */
880 void blkdev_release_request(struct request *req)
881 {
882 request_queue_t *q = req->q;
883
884 req->rq_status = RQ_INACTIVE;
885 req->q = NULL;
886
887 /*
888 * Request may not have originated from ll_rw_blk. If not,
889 * assume it has free buffers and check waiters.
890 */
891 if (q) {
892 struct request_list *rl = &q->rq;
893 int oversized_batch = 0;
894
895 if (q->can_throttle)
896 oversized_batch = blk_oversized_queue_batch(q);
897 rl->count++;
898 /*
899 * paranoia check
900 */
901 if (req->cmd == READ || req->cmd == WRITE)
902 rl->pending[req->cmd]--;
903 if (rl->pending[READ] > q->nr_requests)
904 printk("blk: reads: %u\n", rl->pending[READ]);
905 if (rl->pending[WRITE] > q->nr_requests)
906 printk("blk: writes: %u\n", rl->pending[WRITE]);
907 if (rl->pending[READ] + rl->pending[WRITE] > q->nr_requests)
908 printk("blk: r/w: %u + %u > %u\n", rl->pending[READ], rl->pending[WRITE], q->nr_requests);
909 list_add(&req->queue, &rl->free);
910 if (rl->count >= q->batch_requests && !oversized_batch) {
911 smp_mb();
912 if (waitqueue_active(&q->wait_for_requests))
913 wake_up(&q->wait_for_requests);
914 }
915 }
916 }
917
918 /*
919 * Has to be called with the request spinlock acquired
920 */
921 static void attempt_merge(request_queue_t * q,
922 struct request *req,
923 int max_sectors,
924 int max_segments)
925 {
926 struct request *next;
927
928 next = blkdev_next_request(req);
929 if (req->sector + req->nr_sectors != next->sector)
930 return;
931 if (req->cmd != next->cmd
932 || req->rq_dev != next->rq_dev
933 || req->nr_sectors + next->nr_sectors > max_sectors
934 || next->waiting)
935 return;
936 /*
937 * If we are not allowed to merge these requests, then
938 * return. If we are allowed to merge, then the count
939 * will have been updated to the appropriate number,
940 * and we shouldn't do it here too.
941 */
942 if (!q->merge_requests_fn(q, req, next, max_segments))
943 return;
944
945 q->elevator.elevator_merge_req_fn(req, next);
946
947 /* At this point we have either done a back merge
948 * or front merge. We need the smaller start_time of
949 * the merged requests to be the current request
950 * for accounting purposes.
951 */
952 if (time_after(req->start_time, next->start_time))
953 req->start_time = next->start_time;
954
955 req->bhtail->b_reqnext = next->bh;
956 req->bhtail = next->bhtail;
957 req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
958 list_del(&next->queue);
959
960 /* One last thing: we have removed a request, so we now have one
961 less expected IO to complete for accounting purposes. */
962 req_merged_io(req);
963
964 blkdev_release_request(next);
965 }
966
967 static inline void attempt_back_merge(request_queue_t * q,
968 struct request *req,
969 int max_sectors,
970 int max_segments)
971 {
972 if (&req->queue == q->queue_head.prev)
973 return;
974 attempt_merge(q, req, max_sectors, max_segments);
975 }
976
977 static inline void attempt_front_merge(request_queue_t * q,
978 struct list_head * head,
979 struct request *req,
980 int max_sectors,
981 int max_segments)
982 {
983 struct list_head * prev;
984
985 prev = req->queue.prev;
986 if (head == prev)
987 return;
988 attempt_merge(q, blkdev_entry_to_request(prev), max_sectors, max_segments);
989 }
990
991 static int __make_request(request_queue_t * q, int rw,
992 struct buffer_head * bh)
993 {
994 unsigned int sector, count, sync;
995 int max_segments = MAX_SEGMENTS;
996 struct request * req, *freereq = NULL;
997 int rw_ahead, max_sectors, el_ret;
998 struct list_head *head, *insert_here;
999 int latency;
1000 elevator_t *elevator = &q->elevator;
1001 int should_wake = 0;
1002
1003 count = bh->b_size >> 9;
1004 sector = bh->b_rsector;
1005 sync = test_and_clear_bit(BH_Sync, &bh->b_state);
1006
1007 rw_ahead = 0; /* normal case; gets changed below for READA */
1008 switch (rw) {
1009 case READA:
1010 #if 0 /* bread() misinterprets failed READA attempts as IO errors on SMP */
1011 rw_ahead = 1;
1012 #endif
1013 rw = READ; /* drop into READ */
1014 case READ:
1015 case WRITE:
1016 latency = elevator_request_latency(elevator, rw);
1017 break;
1018 default:
1019 BUG();
1020 goto end_io;
1021 }
1022
1023 /* We'd better have a real physical mapping!
1024 Check this bit only if the buffer was dirty and just locked
1025 down by us, so at this point flushpage will block and
1026 won't clear the mapped bit under us. */
1027 if (!buffer_mapped(bh))
1028 BUG();
1029
1030 /*
1031 * Temporary solution - in 2.5 this will be done by the lowlevel
1032 * driver. Create a bounce buffer if the buffer data points into
1033 * high memory - keep the original buffer otherwise.
1034 */
1035 bh = blk_queue_bounce(q, rw, bh);
1036
1037 /* look for a free request. */
1038 /*
1039 * Try to coalesce the new request with old requests
1040 */
1041 max_sectors = get_max_sectors(bh->b_rdev);
1042
1043 req = NULL;
1044 head = &q->queue_head;
1045 /*
1046 * Now we acquire the request spinlock, we have to be mega careful
1047 * not to schedule or do something nonatomic
1048 */
1049 spin_lock_irq(&io_request_lock);
1050
1051 again:
1052 insert_here = head->prev;
1053
1054 if (list_empty(head)) {
1055 q->plug_device_fn(q, bh->b_rdev); /* is atomic */
1056 goto get_rq;
1057 } else if (q->head_active && !q->plugged)
1058 head = head->next;
1059
1060 el_ret = elevator->elevator_merge_fn(q, &req, head, bh, rw,max_sectors);
1061 switch (el_ret) {
1062
1063 case ELEVATOR_BACK_MERGE:
1064 if (!q->back_merge_fn(q, req, bh, max_segments)) {
1065 insert_here = &req->queue;
1066 break;
1067 }
1068 req->bhtail->b_reqnext = bh;
1069 req->bhtail = bh;
1070 req->nr_sectors = req->hard_nr_sectors += count;
1071 blk_started_io(count);
1072 blk_started_sectors(req, count);
1073 drive_stat_acct(req->rq_dev, req->cmd, count, 0);
1074 req_new_io(req, 1, count);
1075 attempt_back_merge(q, req, max_sectors, max_segments);
1076 goto out;
1077
1078 case ELEVATOR_FRONT_MERGE:
1079 if (!q->front_merge_fn(q, req, bh, max_segments)) {
1080 insert_here = req->queue.prev;
1081 break;
1082 }
1083 bh->b_reqnext = req->bh;
1084 req->bh = bh;
1085 /*
1086 * may not be valid, but queues not having bounce
1087 * enabled for highmem pages must not look at
1088 * ->buffer anyway
1089 */
1090 req->buffer = bh->b_data;
1091 req->current_nr_sectors = req->hard_cur_sectors = count;
1092 req->sector = req->hard_sector = sector;
1093 req->nr_sectors = req->hard_nr_sectors += count;
1094 blk_started_io(count);
1095 blk_started_sectors(req, count);
1096 drive_stat_acct(req->rq_dev, req->cmd, count, 0);
1097 req_new_io(req, 1, count);
1098 attempt_front_merge(q, head, req, max_sectors, max_segments);
1099 goto out;
1100
1101 /*
1102 * elevator says don't/can't merge. get new request
1103 */
1104 case ELEVATOR_NO_MERGE:
1105 /*
1106 * use elevator hints as to where to insert the
1107 * request. if no hints, just add it to the back
1108 * of the queue
1109 */
1110 if (req)
1111 insert_here = &req->queue;
1112 break;
1113
1114 default:
1115 printk("elevator returned crap (%d)\n", el_ret);
1116 BUG();
1117 }
1118
1119 get_rq:
1120 if (freereq) {
1121 req = freereq;
1122 freereq = NULL;
1123 } else {
1124 /*
1125 * See description above __get_request_wait()
1126 */
1127 if (rw_ahead) {
1128 if (q->rq.count < q->batch_requests || blk_oversized_queue_batch(q)) {
1129 spin_unlock_irq(&io_request_lock);
1130 goto end_io;
1131 }
1132 req = get_request(q, rw);
1133 if (req == NULL)
1134 BUG();
1135 } else {
1136 req = get_request(q, rw);
1137 if (req == NULL) {
1138 spin_unlock_irq(&io_request_lock);
1139 freereq = __get_request_wait(q, rw);
1140 head = &q->queue_head;
1141 spin_lock_irq(&io_request_lock);
1142 should_wake = 1;
1143 goto again;
1144 }
1145 }
1146 }
1147
1148 /* fill up the request-info, and add it to the queue */
1149 req->elevator_sequence = latency;
1150 req->cmd = rw;
1151 req->errors = 0;
1152 req->hard_sector = req->sector = sector;
1153 req->hard_nr_sectors = req->nr_sectors = count;
1154 req->current_nr_sectors = req->hard_cur_sectors = count;
1155 req->nr_segments = 1; /* Always 1 for a new request. */
1156 req->nr_hw_segments = 1; /* Always 1 for a new request. */
1157 req->buffer = bh->b_data;
1158 req->waiting = NULL;
1159 req->bh = bh;
1160 req->bhtail = bh;
1161 req->rq_dev = bh->b_rdev;
1162 req->start_time = jiffies;
1163 req_new_io(req, 0, count);
1164 blk_started_io(count);
1165 blk_started_sectors(req, count);
1166 add_request(q, req, insert_here);
1167 out:
1168 if (freereq)
1169 blkdev_release_request(freereq);
1170 if (should_wake)
1171 get_request_wait_wakeup(q, rw);
1172 if (sync)
1173 __generic_unplug_device(q);
1174 spin_unlock_irq(&io_request_lock);
1175 return 0;
1176 end_io:
1177 bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
1178 return 0;
1179 }
1180
1181 /**
1182 * generic_make_request: hand a buffer head to its device driver for I/O
1183 * @rw: READ, WRITE, or READA - what sort of I/O is desired.
1184 * @bh: The buffer head describing the location in memory and on the device.
1185 *
1186 * generic_make_request() is used to make I/O requests of block
1187 * devices. It is passed a &struct buffer_head and a &rw value. The
1188 * %READ and %WRITE options are (hopefully) obvious in meaning. The
1189 * %READA value means that a read is required, but that the driver is
1190 * free to fail the request if, for example, it cannot get needed
1191 * resources immediately.
1192 *
1193 * generic_make_request() does not return any status. The
1194 * success/failure status of the request, along with notification of
1195 * completion, is delivered asynchronously through the bh->b_end_io
1196 * function described (one day) elsewhere.
1197 *
1198 * The caller of generic_make_request must make sure that b_page,
1199 * b_addr, b_size are set to describe the memory buffer, that b_rdev
1200 * and b_rsector are set to describe the device address, and the
1201 * b_end_io and optionally b_private are set to describe how
1202 * completion notification should be signaled. BH_Mapped should also
1203 * be set (to confirm that b_dev and b_blocknr are valid).
1204 *
1205 * generic_make_request and the drivers it calls may use b_reqnext,
1206 * and may change b_rdev and b_rsector. So the values of these fields
1207 * should NOT be depended on after the call to generic_make_request.
1208 * Because of this, the caller should record the device address
1209 * information in b_dev and b_blocknr.
1210 *
1211 * Apart from those fields mentioned above, no other fields, and in
1212 * particular, no other flags, are changed by generic_make_request or
1213 * any lower level drivers.
1214 * */
1215 void generic_make_request (int rw, struct buffer_head * bh)
1216 {
1217 int major = MAJOR(bh->b_rdev);
1218 int minorsize = 0;
1219 request_queue_t *q;
1220
1221 if (!bh->b_end_io)
1222 BUG();
1223
1224 /* Test device size, when known. */
1225 if (blk_size[major])
1226 minorsize = blk_size[major][MINOR(bh->b_rdev)];
1227 if (minorsize) {
1228 unsigned long maxsector = (minorsize << 1) + 1;
1229 unsigned long sector = bh->b_rsector;
1230 unsigned int count = bh->b_size >> 9;
1231
1232 if (maxsector < count || maxsector - count < sector) {
1233 /* Yecch */
1234 bh->b_state &= ~(1 << BH_Dirty);
1235
1236 /* This may well happen - the kernel calls bread()
1237 without checking the size of the device, e.g.,
1238 when mounting a device. */
1239 printk(KERN_INFO
1240 "attempt to access beyond end of device\n");
1241 printk(KERN_INFO "%s: rw=%d, want=%ld, limit=%d\n",
1242 kdevname(bh->b_rdev), rw,
1243 (sector + count)>>1, minorsize);
1244
1245 bh->b_end_io(bh, 0);
1246 return;
1247 }
1248 }
1249
1250 /*
1251 * Resolve the mapping until finished. (drivers are
1252 * still free to implement/resolve their own stacking
1253 * by explicitly returning 0)
1254 */
1255 /* NOTE: we don't repeat the blk_size check for each new device.
1256 * Stacking drivers are expected to know what they are doing.
1257 */
1258 do {
1259 q = __blk_get_queue(bh->b_rdev);
1260 if (!q) {
1261 printk(KERN_ERR
1262 "generic_make_request: Trying to access "
1263 "nonexistent block-device %s (%ld)\n",
1264 kdevname(bh->b_rdev), bh->b_rsector);
1265 buffer_IO_error(bh);
1266 break;
1267 }
1268 } while (q->make_request_fn(q, rw, bh));
1269 }
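/*
 * Caller sketch (hypothetical helper, not part of this file): submitting a
 * single buffer of I/O directly, following the rules in the kernel-doc
 * above.  The caller owns the buffer_head and learns about completion only
 * through b_end_io.
 *
 *	static void my_end_io(struct buffer_head *bh, int uptodate)
 *	{
 *		mark_buffer_uptodate(bh, uptodate);
 *		unlock_buffer(bh);
 *	}
 *
 *	bh->b_rdev    = dev;			// device address ...
 *	bh->b_rsector = sector;			// ... in 512-byte sectors
 *	bh->b_size    = size;			// bytes, multiple of 512
 *	bh->b_data    = page_address(page);	// with b_page set accordingly
 *	bh->b_end_io  = my_end_io;
 *	set_bit(BH_Mapped, &bh->b_state);
 *	generic_make_request(rw, bh);
 */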
1270
1271
1272 /**
1273 * submit_bh: submit a buffer_head to the block device later for I/O
1274 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
1275 * @bh: The &struct buffer_head which describes the I/O
1276 *
1277 * submit_bh() is very similar in purpose to generic_make_request(), and
1278 * uses that function to do most of the work.
1279 *
1280 * The extra functionality provided by submit_bh is to determine
1281 * b_rsector from b_blocknr and b_size, and to set b_rdev from b_dev.
1282 * This is appropriate for IO requests that come from the buffer
1283 * cache and page cache which (currently) always use aligned blocks.
1284 */
1285 void submit_bh(int rw, struct buffer_head * bh)
1286 {
1287 int count = bh->b_size >> 9;
1288
1289 if (!test_bit(BH_Lock, &bh->b_state))
1290 BUG();
1291
1292 set_bit(BH_Req, &bh->b_state);
1293 set_bit(BH_Launder, &bh->b_state);
1294
1295 /*
1296 * First step, 'identity mapping' - RAID or LVM might
1297 * further remap this.
1298 */
1299 bh->b_rdev = bh->b_dev;
1300 bh->b_rsector = bh->b_blocknr * count;
1301
1302 get_bh(bh);
1303 generic_make_request(rw, bh);
1304
1305 /* fix race condition with wait_on_buffer() */
1306 smp_mb(); /* spin_unlock may have inclusive semantics */
1307 if (waitqueue_active(&bh->b_wait))
1308 wake_up(&bh->b_wait);
1309
1310 if (block_dump)
1311 printk(KERN_DEBUG "%s: %s block %lu/%u on %s\n", current->comm, rw == WRITE ? "WRITE" : "READ", bh->b_rsector, count, kdevname(bh->b_rdev));
1312
1313 put_bh(bh);
1314 switch (rw) {
1315 case WRITE:
1316 kstat.pgpgout += count;
1317 break;
1318 default:
1319 kstat.pgpgin += count;
1320 break;
1321 }
1322 }
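/*
 * Arithmetic note: because the buffer cache only uses aligned blocks, the
 * identity mapping above is simply b_rsector = b_blocknr * (b_size >> 9).
 * For instance a 4096-byte buffer with b_blocknr 100 starts at 512-byte
 * sector 800 on b_rdev.
 */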
1323
1324 /**
1325 * ll_rw_block: low-level access to block devices
1326 * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
1327 * @nr: number of &struct buffer_heads in the array
1328 * @bhs: array of pointers to &struct buffer_head
1329 *
1330 * ll_rw_block() takes an array of pointers to &struct buffer_heads,
1331 * and requests an I/O operation on them, either a %READ or a %WRITE.
1332 * The third %READA option is described in the documentation for
1333 * generic_make_request() which ll_rw_block() calls.
1334 *
1335 * This function provides extra functionality that is not in
1336 * generic_make_request() that is relevant to buffers in the buffer
1337 * cache or page cache. In particular it drops any buffer that it
1338 * cannot get a lock on (with the BH_Lock state bit), any buffer that
1339 * appears to be clean when doing a write request, and any buffer that
1340 * appears to be up-to-date when doing read request. Further it marks
1341 * as clean buffers that are processed for writing (the buffer cache
1342 * won't assume that they are actually clean until the buffer gets
1343 * unlocked).
1344 *
1345 * ll_rw_block sets b_end_io to a simple completion handler that marks
1346 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
1347 * any waiters. A client that needs a more interesting completion
1348 * routine should call submit_bh() (or generic_make_request())
1349 * directly.
1350 *
1351 * Caveat:
1352 * All of the buffers must be for the same device, and must also be
1353 * of the current approved size for the device. */
1354
1355 void ll_rw_block(int rw, int nr, struct buffer_head * bhs[])
1356 {
1357 unsigned int major;
1358 int correct_size;
1359 int i;
1360
1361 if (!nr)
1362 return;
1363
1364 major = MAJOR(bhs[0]->b_dev);
1365
1366 /* Determine correct block size for this device. */
1367 correct_size = get_hardsect_size(bhs[0]->b_dev);
1368
1369 /* Verify requested block sizes. */
1370 for (i = 0; i < nr; i++) {
1371 struct buffer_head *bh = bhs[i];
1372 if (bh->b_size % correct_size) {
1373 printk(KERN_NOTICE "ll_rw_block: device %s: "
1374 "only %d-char blocks implemented (%u)\n",
1375 kdevname(bhs[0]->b_dev),
1376 correct_size, bh->b_size);
1377 goto sorry;
1378 }
1379 }
1380
1381 if ((rw & WRITE) && is_read_only(bhs[0]->b_dev)) {
1382 printk(KERN_NOTICE "Can't write to read-only device %s\n",
1383 kdevname(bhs[0]->b_dev));
1384 goto sorry;
1385 }
1386
1387 for (i = 0; i < nr; i++) {
1388 struct buffer_head *bh = bhs[i];
1389
1390 lock_buffer(bh);
1391
1392 /* We have the buffer lock */
1393 atomic_inc(&bh->b_count);
1394 bh->b_end_io = end_buffer_io_sync;
1395
1396 switch(rw) {
1397 case WRITE:
1398 if (!atomic_set_buffer_clean(bh))
1399 /* Hmmph! Nothing to write */
1400 goto end_io;
1401 __mark_buffer_clean(bh);
1402 break;
1403
1404 case READA:
1405 case READ:
1406 if (buffer_uptodate(bh))
1407 /* Hmmph! Already have it */
1408 goto end_io;
1409 break;
1410 default:
1411 BUG();
1412 end_io:
1413 bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
1414 continue;
1415 }
1416
1417 submit_bh(rw, bh);
1418 }
1419 return;
1420
1421 sorry:
1422 /* Make sure we don't get infinite dirty retries.. */
1423 for (i = 0; i < nr; i++)
1424 mark_buffer_clean(bhs[i]);
1425 }
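/*
 * Usage sketch (the classic buffer-cache pattern, shown only as an
 * illustration): read one block synchronously by submitting it and then
 * sleeping until the completion handler unlocks it.
 *
 *	struct buffer_head *bh = getblk(dev, block, blocksize);
 *
 *	if (!buffer_uptodate(bh)) {
 *		ll_rw_block(READ, 1, &bh);
 *		wait_on_buffer(bh);
 *		if (!buffer_uptodate(bh))
 *			// the I/O error was reported through b_end_io
 *			handle_the_error();
 *	}
 */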
1426
1427 #ifdef CONFIG_STRAM_SWAP
1428 extern int stram_device_init (void);
1429 #endif
1430
1431 static void blk_writeback_timer(unsigned long data)
1432 {
1433 wakeup_bdflush();
1434 wakeup_kupdate();
1435 }
1436
1437 /**
1438 * end_that_request_first - end I/O on one buffer.
1439 * @req: the request being processed
1440 * @uptodate: 0 for I/O error
1441 * @name: the name printed for an I/O error
1442 *
1443 * Description:
1444 * Ends I/O on the first buffer attached to @req, and sets it up
1445 * for the next buffer_head (if any) in the cluster.
1446 *
1447 * Return:
1448 * 0 - we are done with this request, call end_that_request_last()
1449 * 1 - still buffers pending for this request
1450 *
1451 * Caveat:
1452 * Drivers implementing their own end_request handling must call
1453 * blk_finished_io() appropriately.
1454 **/
1455
1456 int end_that_request_first (struct request *req, int uptodate, char *name)
1457 {
1458 struct buffer_head * bh;
1459 int nsect;
1460
1461 req->errors = 0;
1462 if (!uptodate)
1463 printk("end_request: I/O error, dev %s (%s), sector %lu\n",
1464 kdevname(req->rq_dev), name, req->sector);
1465
1466 if ((bh = req->bh) != NULL) {
1467 nsect = bh->b_size >> 9;
1468 blk_finished_io(nsect);
1469 blk_finished_sectors(req, nsect);
1470 req->bh = bh->b_reqnext;
1471 bh->b_reqnext = NULL;
1472 bh->b_end_io(bh, uptodate);
1473 if ((bh = req->bh) != NULL) {
1474 req->hard_sector += nsect;
1475 req->hard_nr_sectors -= nsect;
1476 req->sector = req->hard_sector;
1477 req->nr_sectors = req->hard_nr_sectors;
1478
1479 req->current_nr_sectors = bh->b_size >> 9;
1480 req->hard_cur_sectors = req->current_nr_sectors;
1481 if (req->nr_sectors < req->current_nr_sectors) {
1482 req->nr_sectors = req->current_nr_sectors;
1483 printk("end_request: buffer-list destroyed\n");
1484 }
1485 req->buffer = bh->b_data;
1486 return 1;
1487 }
1488 }
1489 return 0;
1490 }
1491
1492 extern int laptop_mode;
1493
1494 void end_that_request_last(struct request *req)
1495 {
1496 struct completion *waiting = req->waiting;
1497
1498 /*
1499 * schedule the writeout of pending dirty data when the disk is idle
1500 */
1501 if (laptop_mode && req->cmd == READ)
1502 mod_timer(&writeback_timer, jiffies + 5 * HZ);
1503
1504 req_finished_io(req);
1505 blkdev_release_request(req);
1506 if (waiting)
1507 complete(waiting);
1508 }
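/*
 * Completion sketch for a driver that implements its own end_request logic
 * (names are hypothetical).  It mirrors the contract documented above
 * end_that_request_first(): keep feeding it buffers until it returns 0,
 * then dequeue and finish the request.  Like blkdev_release_request(),
 * this must run with io_request_lock held and interrupts disabled.
 *
 *	static void my_finish_request(struct request *req, int uptodate)
 *	{
 *		if (!end_that_request_first(req, uptodate, "mydev")) {
 *			blkdev_dequeue_request(req);
 *			end_that_request_last(req);
 *		}
 *	}
 */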
1509
1510 int __init blk_dev_init(void)
1511 {
1512 struct blk_dev_struct *dev;
1513
1514 request_cachep = kmem_cache_create("blkdev_requests",
1515 sizeof(struct request),
1516 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
1517
1518 if (!request_cachep)
1519 panic("Can't create request pool slab cache\n");
1520
1521 for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;)
1522 dev->queue = NULL;
1523
1524 memset(ro_bits,0,sizeof(ro_bits));
1525 memset(max_readahead, 0, sizeof(max_readahead));
1526 memset(max_sectors, 0, sizeof(max_sectors));
1527
1528 blk_max_low_pfn = max_low_pfn - 1;
1529 blk_max_pfn = max_pfn - 1;
1530
1531 init_timer(&writeback_timer);
1532 writeback_timer.function = blk_writeback_timer;
1533
1534 #ifdef CONFIG_AMIGA_Z2RAM
1535 z2_init();
1536 #endif
1537 #ifdef CONFIG_STRAM_SWAP
1538 stram_device_init();
1539 #endif
1540 #ifdef CONFIG_ISP16_CDI
1541 isp16_init();
1542 #endif
1543 #ifdef CONFIG_BLK_DEV_PS2
1544 ps2esdi_init();
1545 #endif
1546 #ifdef CONFIG_BLK_DEV_XD
1547 xd_init();
1548 #endif
1549 #ifdef CONFIG_BLK_DEV_MFM
1550 mfm_init();
1551 #endif
1552 #ifdef CONFIG_PARIDE
1553 { extern void paride_init(void); paride_init(); };
1554 #endif
1555 #ifdef CONFIG_MAC_FLOPPY
1556 swim3_init();
1557 #endif
1558 #ifdef CONFIG_BLK_DEV_SWIM_IOP
1559 swimiop_init();
1560 #endif
1561 #ifdef CONFIG_AMIGA_FLOPPY
1562 amiga_floppy_init();
1563 #endif
1564 #ifdef CONFIG_ATARI_FLOPPY
1565 atari_floppy_init();
1566 #endif
1567 #ifdef CONFIG_BLK_DEV_FD
1568 floppy_init();
1569 #else
1570 #if defined(__i386__) /* Do we even need this? */
1571 outb_p(0xc, 0x3f2);
1572 #endif
1573 #endif
1574 #ifdef CONFIG_CDU31A
1575 cdu31a_init();
1576 #endif
1577 #ifdef CONFIG_ATARI_ACSI
1578 acsi_init();
1579 #endif
1580 #ifdef CONFIG_MCD
1581 mcd_init();
1582 #endif
1583 #ifdef CONFIG_MCDX
1584 mcdx_init();
1585 #endif
1586 #ifdef CONFIG_SBPCD
1587 sbpcd_init();
1588 #endif
1589 #ifdef CONFIG_AZTCD
1590 aztcd_init();
1591 #endif
1592 #ifdef CONFIG_CDU535
1593 sony535_init();
1594 #endif
1595 #ifdef CONFIG_GSCD
1596 gscd_init();
1597 #endif
1598 #ifdef CONFIG_CM206
1599 cm206_init();
1600 #endif
1601 #ifdef CONFIG_OPTCD
1602 optcd_init();
1603 #endif
1604 #ifdef CONFIG_SJCD
1605 sjcd_init();
1606 #endif
1607 #ifdef CONFIG_APBLOCK
1608 ap_init();
1609 #endif
1610 #ifdef CONFIG_DDV
1611 ddv_init();
1612 #endif
1613 #ifdef CONFIG_MDISK
1614 mdisk_init();
1615 #endif
1616 #ifdef CONFIG_DASD
1617 dasd_init();
1618 #endif
1619 #if defined(CONFIG_S390_TAPE) && defined(CONFIG_S390_TAPE_BLOCK)
1620 tapeblock_init();
1621 #endif
1622 #ifdef CONFIG_BLK_DEV_XPRAM
1623 xpram_init();
1624 #endif
1625
1626 #ifdef CONFIG_SUN_JSFLASH
1627 jsfd_init();
1628 #endif
1629 return 0;
1630 };
1631
1632 EXPORT_SYMBOL(io_request_lock);
1633 EXPORT_SYMBOL(end_that_request_first);
1634 EXPORT_SYMBOL(end_that_request_last);
1635 EXPORT_SYMBOL(blk_grow_request_list);
1636 EXPORT_SYMBOL(blk_init_queue);
1637 EXPORT_SYMBOL(blk_get_queue);
1638 EXPORT_SYMBOL(blk_cleanup_queue);
1639 EXPORT_SYMBOL(blk_queue_headactive);
1640 EXPORT_SYMBOL(blk_queue_throttle_sectors);
1641 EXPORT_SYMBOL(blk_queue_make_request);
1642 EXPORT_SYMBOL(generic_make_request);
1643 EXPORT_SYMBOL(blkdev_release_request);
1644 EXPORT_SYMBOL(generic_unplug_device);
1645 EXPORT_SYMBOL(blk_queue_bounce_limit);
1646 EXPORT_SYMBOL(blk_max_low_pfn);
1647 EXPORT_SYMBOL(blk_max_pfn);
1648 EXPORT_SYMBOL(blk_seg_merge_ok);
1649 EXPORT_SYMBOL(blk_nohighio);
1650