1 /*
2  *  linux/drivers/block/ll_rw_blk.c
3  *
4  * Copyright (C) 1991, 1992 Linus Torvalds
5  * Copyright (C) 1994,      Karl Keyte: Added support for disk statistics
6  * Elevator latency, (C) 2000  Andrea Arcangeli <andrea@suse.de> SuSE
7  * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
8  * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July 2000
9  */
10 
11 /*
12  * This handles all read/write requests to block devices
13  */
14 #include <linux/sched.h>
15 #include <linux/kernel.h>
16 #include <linux/kernel_stat.h>
17 #include <linux/errno.h>
18 #include <linux/string.h>
19 #include <linux/config.h>
20 #include <linux/locks.h>
21 #include <linux/mm.h>
22 #include <linux/swap.h>
23 #include <linux/init.h>
24 #include <linux/smp_lock.h>
25 #include <linux/completion.h>
26 #include <linux/bootmem.h>
27 
28 #include <asm/system.h>
29 #include <asm/io.h>
30 #include <linux/blk.h>
31 #include <linux/highmem.h>
32 #include <linux/slab.h>
33 #include <linux/module.h>
34 
35 /*
36  * MAC Floppy IWM hooks
37  */
38 
39 #ifdef CONFIG_MAC_FLOPPY_IWM
40 extern int mac_floppy_init(void);
41 #endif
42 
43 /*
44  * For the allocated request tables
45  */
46 static kmem_cache_t *request_cachep;
47 
48 /*
49  * The "disk" task queue is used to start the actual requests
50  * after a plug
51  */
52 DECLARE_TASK_QUEUE(tq_disk);
53 
54 /*
55  * Protect the request list against multiple users..
56  *
57  * With this spinlock the Linux block IO subsystem is 100% SMP threaded
58  * from the IRQ event side, and almost 100% SMP threaded from the syscall
59  * side (we still have to protect against block device array operations, and
60  * the do_request() side is of course still unsafe. The kernel lock protects
61  * this part currently).
62  *
63  * there is a fair chance that things will work just OK if these functions
64  * are called with no global kernel lock held ...
65  */
66 spinlock_t io_request_lock = SPIN_LOCK_UNLOCKED;
67 
68 /* This specifies how many sectors to read ahead on the disk. */
69 
70 int read_ahead[MAX_BLKDEV];
71 
72 /* blk_dev_struct is:
73  *	*queue (optional hook for selecting a per-device queue)
74  *	*request_queue (the default request queue for the major)
75  */
76 struct blk_dev_struct blk_dev[MAX_BLKDEV]; /* initialized by blk_dev_init() */
77 
78 /*
79  * blk_size contains the size of all block-devices in units of
80  * 1024 bytes (i.e. kilobytes):
81  *
82  * blk_size[MAJOR][MINOR]
83  *
84  * if (!blk_size[MAJOR]) then no minor size checking is done.
85  */
86 int * blk_size[MAX_BLKDEV];
87 
88 /*
89  * blksize_size contains the size of all block-devices:
90  *
91  * blksize_size[MAJOR][MINOR]
92  *
93  * if (!blksize_size[MAJOR]) then 1024 bytes is assumed.
94  */
95 int * blksize_size[MAX_BLKDEV];
96 
97 /*
98  * hardsect_size contains the size of the hardware sector of a device.
99  *
100  * hardsect_size[MAJOR][MINOR]
101  *
102  * if (!hardsect_size[MAJOR])
103  *		then 512 bytes is assumed.
104  * else
105  *		sector_size is hardsect_size[MAJOR][MINOR]
106  * This is currently set by some scsi devices and read by the msdos fs driver.
107  * Other uses may appear later.
108  */
109 int * hardsect_size[MAX_BLKDEV];
110 
111 /*
112  * The following tunes the read-ahead algorithm in mm/filemap.c
113  */
114 int * max_readahead[MAX_BLKDEV];
115 
116 /*
117  * Max number of sectors per request
118  */
119 int * max_sectors[MAX_BLKDEV];
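/*
 * Illustrative sketch (not part of this file): how a low-level driver
 * typically populates the per-major tuning arrays above at init time.
 * The MYDEV_* names and the geometry values are hypothetical.
 */
#if 0
#define MYDEV_MAJOR	240	/* hypothetical experimental major */
#define MYDEV_MINORS	16

static int mydev_sizes[MYDEV_MINORS];		/* device sizes, in kB      */
static int mydev_blksizes[MYDEV_MINORS];	/* soft block size, bytes   */
static int mydev_hardsects[MYDEV_MINORS];	/* hardware sector size     */
static int mydev_maxsect[MYDEV_MINORS];		/* max sectors per request  */

static void mydev_setup_arrays(void)
{
	int i;

	for (i = 0; i < MYDEV_MINORS; i++) {
		mydev_sizes[i]     = 0;		/* filled in when probed */
		mydev_blksizes[i]  = 1024;
		mydev_hardsects[i] = 512;
		mydev_maxsect[i]   = 128;
	}
	blk_size[MYDEV_MAJOR]      = mydev_sizes;
	blksize_size[MYDEV_MAJOR]  = mydev_blksizes;
	hardsect_size[MYDEV_MAJOR] = mydev_hardsects;
	max_sectors[MYDEV_MAJOR]   = mydev_maxsect;
	read_ahead[MYDEV_MAJOR]    = 8;		/* sectors of read-ahead */
}
#endif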
120 
121 unsigned long blk_max_low_pfn, blk_max_pfn;
122 int blk_nohighio = 0;
123 
124 int block_dump = 0;
125 
126 static struct timer_list writeback_timer;
127 
128 static inline int get_max_sectors(kdev_t dev)
129 {
130 	if (!max_sectors[MAJOR(dev)])
131 		return MAX_SECTORS;
132 	return max_sectors[MAJOR(dev)][MINOR(dev)];
133 }
134 
135 static inline request_queue_t *__blk_get_queue(kdev_t dev)
136 {
137 	struct blk_dev_struct *bdev = blk_dev + MAJOR(dev);
138 
139 	if (bdev->queue)
140 		return bdev->queue(dev);
141 	else
142 		return &blk_dev[MAJOR(dev)].request_queue;
143 }
144 
145 request_queue_t *blk_get_queue(kdev_t dev)
146 {
147 	return __blk_get_queue(dev);
148 }
149 
150 static int __blk_cleanup_queue(struct request_list *list)
151 {
152 	struct list_head *head = &list->free;
153 	struct request *rq;
154 	int i = 0;
155 
156 	while (!list_empty(head)) {
157 		rq = list_entry(head->next, struct request, queue);
158 		list_del(&rq->queue);
159 		kmem_cache_free(request_cachep, rq);
160 		i++;
161 	};
162 
163 	if (i != list->count)
164 		printk("request list leak!\n");
165 
166 	list->count = 0;
167 	return i;
168 }
169 
170 /**
171  * blk_cleanup_queue: - release a &request_queue_t when it is no longer needed
172  * @q:    the request queue to be released
173  *
174  * Description:
175  *     blk_cleanup_queue is the pair to blk_init_queue().  It should
176  *     be called when a request queue is being released; typically
177  *     when a block device is being de-registered.  Currently, its
178  *     primary task is to free all the &struct request structures that
179  *     were allocated to the queue.
180  * Caveat:
181  *     Hopefully the low level driver will have finished any
182  *     outstanding requests first...
183  **/
184 void blk_cleanup_queue(request_queue_t * q)
185 {
186 	int count = q->nr_requests;
187 
188 	count -= __blk_cleanup_queue(&q->rq);
189 
190 	if (count)
191 		printk("blk_cleanup_queue: leaked requests (%d)\n", count);
192 	if (atomic_read(&q->nr_sectors))
193 		printk("blk_cleanup_queue: leaked sectors (%d)\n", atomic_read(&q->nr_sectors));
194 
195 	memset(q, 0, sizeof(*q));
196 }
197 
198 /**
199  * blk_queue_headactive - indicate whether head of request queue may be active
200  * @q:       The queue which this applies to.
201  * @active:  A flag indicating whether the head of the queue may be active.
202  *
203  * Description:
204  *    The driver for a block device may choose to leave the currently active
205  *    request on the request queue, removing it only when it has completed.
206  *    The queue handling routines assume this by default for safety reasons
207  *    and will not involve the head of the request queue in any merging or
208  *    reordering of requests when the queue is unplugged (and thus may be
209  *    working on this particular request).
210  *
211  *    If a driver removes requests from the queue before processing them, then
212  *    it may indicate that it does so, thereby allowing the head of the queue
213  *    to be involved in merging and reordering.  This is done by calling
214  *    blk_queue_headactive() with an @active flag of %0.
215  *
216  *    If a driver processes several requests at once, it must remove them (or
217  *    at least all but one of them) from the request queue.
218  *
219  *    When a queue is plugged the head will be assumed to be inactive.
220  **/
221 
222 void blk_queue_headactive(request_queue_t * q, int active)
223 {
224 	q->head_active = active;
225 }
226 
227 /**
228  * blk_queue_throttle_sectors - indicates you will call sector throttling funcs
229  * @q:       The queue which this applies to.
230  * @active:  A flag indicating whether you want sector throttling on
231  *
232  * Description:
233  * The sector throttling code allows us to put a limit on the number of
234  * sectors pending io to the disk at a given time.  Passing @active nonzero
235  * indicates that you will call blk_started_sectors and blk_finished_sectors, in
236  * addition to calling blk_started_io and blk_finished_io, in order to
237  * keep track of the number of sectors in flight.
238  **/
239 
240 void blk_queue_throttle_sectors(request_queue_t * q, int active)
241 {
242 	q->can_throttle = active;
243 }
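/*
 * Illustrative sketch (not part of this file): a driver that dequeues
 * requests before working on them, and that wants sector throttling,
 * would flag both properties right after setting up its queue.  The
 * mydev_* names are hypothetical.
 */
#if 0
static void mydev_request_fn(request_queue_t *q);	/* hypothetical */

static void mydev_setup_queue(request_queue_t *q)
{
	blk_init_queue(q, mydev_request_fn);
	blk_queue_headactive(q, 0);		/* head may be merged/reordered */
	blk_queue_throttle_sectors(q, 1);	/* we track sectors in flight   */
}
#endif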
244 
245 /**
246  * blk_queue_make_request - define an alternate make_request function for a device
247  * @q:  the request queue for the device to be affected
248  * @mfn: the alternate make_request function
249  *
250  * Description:
251  *    The normal way for &struct buffer_heads to be passed to a device
252  *    driver is for them to be collected into requests on a request
253  *    queue, and then to allow the device driver to select requests
254  *    off that queue when it is ready.  This works well for many block
255  *    devices. However some block devices (typically virtual devices
256  *    such as md or lvm) do not benefit from the processing on the
257  *    request queue, and are served best by having the requests passed
258  *    directly to them.  This can be achieved by providing a function
259  *    to blk_queue_make_request().
260  *
261  * Caveat:
262  *    The driver that does this *must* be able to deal appropriately
263  *    with buffers in "highmemory", either by calling bh_kmap() to get
264  *    a kernel mapping, or by calling create_bounce() to create a
265  *    buffer in normal memory.
266  **/
267 
268 void blk_queue_make_request(request_queue_t * q, make_request_fn * mfn)
269 {
270 	q->make_request_fn = mfn;
271 }
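/*
 * Illustrative sketch (not part of this file): a stacking driver in the
 * md/lvm style that bypasses the request queue.  It remaps the buffer
 * onto an underlying device and returns non-zero so that
 * generic_make_request() resubmits it there.  The mydev_* names and the
 * mapping lookup are hypothetical.
 */
#if 0
struct mydev_target {			/* hypothetical mapping entry */
	kdev_t		real_dev;
	unsigned long	sector_offset;
};

extern struct mydev_target *mydev_find_target(kdev_t dev, unsigned long sector);

static int mydev_make_request(request_queue_t *q, int rw, struct buffer_head *bh)
{
	struct mydev_target *t = mydev_find_target(bh->b_rdev, bh->b_rsector);

	/* a real driver must also cope with highmem buffers here,
	 * e.g. via bh_kmap() or create_bounce() */
	if (t == NULL) {
		buffer_IO_error(bh);		/* fail the buffer ourselves */
		return 0;			/* ...and stop the loop      */
	}
	bh->b_rdev     = t->real_dev;		/* redirect to the real device */
	bh->b_rsector += t->sector_offset;	/* and shift the sector        */
	return 1;				/* have the loop resubmit bh   */
}

static void mydev_register_queue(void)
{
	blk_queue_make_request(BLK_DEFAULT_QUEUE(MYDEV_MAJOR), mydev_make_request);
}
#endif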
272 
273 /**
274  * blk_queue_bounce_limit - set bounce buffer limit for queue
275  * @q:  the request queue for the device
276  * @dma_addr:   bus address limit
277  *
278  * Description:
279  *    Different hardware can have different requirements as to what pages
280  *    it can do I/O directly to. A low level driver can call
281  *    blk_queue_bounce_limit to have lower memory pages allocated as bounce
282  *    buffers for doing I/O to pages residing above @dma_addr. By default
283  *    the block layer sets this to the highest numbered "low" memory page.
284  **/
285 void blk_queue_bounce_limit(request_queue_t *q, u64 dma_addr)
286 {
287 	unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT;
288 
289 	q->bounce_pfn = bounce_pfn;
290 }
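/*
 * Illustrative sketch (not part of this file): a hypothetical ISA-style
 * controller that can only DMA below 16MB asks for bounce buffers for
 * anything above that boundary.
 */
#if 0
static void mydev_set_dma_limit(request_queue_t *q)
{
	blk_queue_bounce_limit(q, (u64) 0x00ffffff);	/* 24-bit DMA mask */
}
#endif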
291 
292 
293 /*
294  * can we merge the two segments, or do we need to start a new one?
295  */
296 static inline int __blk_seg_merge_ok(struct buffer_head *bh, struct buffer_head *nxt)
297 {
298 	/*
299 	 * if bh and nxt are contiguous and don't cross a 4g boundary, it's ok
300 	 */
301 	if (BH_CONTIG(bh, nxt) && BH_PHYS_4G(bh, nxt))
302 		return 1;
303 
304 	return 0;
305 }
306 
307 int blk_seg_merge_ok(struct buffer_head *bh, struct buffer_head *nxt)
308 {
309 	return __blk_seg_merge_ok(bh, nxt);
310 }
311 
312 static inline int ll_new_segment(request_queue_t *q, struct request *req, int max_segments)
313 {
314 	if (req->nr_segments < max_segments) {
315 		req->nr_segments++;
316 		return 1;
317 	}
318 	return 0;
319 }
320 
321 static int ll_back_merge_fn(request_queue_t *q, struct request *req,
322 			    struct buffer_head *bh, int max_segments)
323 {
324 	if (__blk_seg_merge_ok(req->bhtail, bh))
325 		return 1;
326 
327 	return ll_new_segment(q, req, max_segments);
328 }
329 
330 static int ll_front_merge_fn(request_queue_t *q, struct request *req,
331 			     struct buffer_head *bh, int max_segments)
332 {
333 	if (__blk_seg_merge_ok(bh, req->bh))
334 		return 1;
335 
336 	return ll_new_segment(q, req, max_segments);
337 }
338 
339 static int ll_merge_requests_fn(request_queue_t *q, struct request *req,
340 				struct request *next, int max_segments)
341 {
342 	int total_segments = req->nr_segments + next->nr_segments;
343 
344 	if (__blk_seg_merge_ok(req->bhtail, next->bh))
345 		total_segments--;
346 
347 	if (total_segments > max_segments)
348 		return 0;
349 
350 	req->nr_segments = total_segments;
351 	return 1;
352 }
353 
354 /*
355  * "plug" the device if there are no outstanding requests: this will
356  * force the transfer to start only after we have put all the requests
357  * on the list.
358  *
359  * This is called with interrupts off and no requests on the queue.
360  * (and with the request spinlock acquired)
361  */
362 static void generic_plug_device(request_queue_t *q, kdev_t dev)
363 {
364 	/*
365 	 * no need to replug device
366 	 */
367 	if (!list_empty(&q->queue_head) || q->plugged)
368 		return;
369 
370 	q->plugged = 1;
371 	queue_task(&q->plug_tq, &tq_disk);
372 }
373 
374 /*
375  * remove the plug and let it rip..
376  */
377 static inline void __generic_unplug_device(request_queue_t *q)
378 {
379 	if (q->plugged) {
380 		q->plugged = 0;
381 		if (!list_empty(&q->queue_head))
382 			q->request_fn(q);
383 	}
384 }
385 
386 void generic_unplug_device(void *data)
387 {
388 	request_queue_t *q = (request_queue_t *) data;
389 	unsigned long flags;
390 
391 	spin_lock_irqsave(&io_request_lock, flags);
392 	__generic_unplug_device(q);
393 	spin_unlock_irqrestore(&io_request_lock, flags);
394 }
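/*
 * Illustrative sketch (not part of this file): the reader's side of
 * plugging.  After submitting buffers, running tq_disk unplugs any
 * plugged queues so their request functions are actually called.
 * (wait_on_buffer() normally does this itself before sleeping.)
 */
#if 0
static void mydev_read_and_wait(struct buffer_head *bh)
{
	ll_rw_block(READ, 1, &bh);	/* may leave the queue plugged   */
	run_task_queue(&tq_disk);	/* kicks generic_unplug_device() */
	wait_on_buffer(bh);		/* sleep until b_end_io runs     */
}
#endif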
395 
396 /** blk_grow_request_list
397  *  @q: The &request_queue_t
398  *  @nr_requests: how many requests are desired
399  *  @max_queue_sectors: cap on the number of sectors allowed to be in flight
400  * More free requests are added to the queue's free lists, bringing
401  * the total number of requests to @nr_requests.
402  *
403  * The requests are added to the request queue's single free list,
404  * shared by reads and writes.
405  *
406  * This function can sleep.
407  *
408  * Returns the (new) number of requests which the queue has available.
409  */
410 int blk_grow_request_list(request_queue_t *q, int nr_requests, int max_queue_sectors)
411 {
412 	unsigned long flags;
413 	/* Several broken drivers assume that this function doesn't sleep;
414 	 * sleeping here causes system hangs during boot.
415 	 * As a temporary fix, make the function non-blocking.
416 	 */
417 	spin_lock_irqsave(&io_request_lock, flags);
418 	while (q->nr_requests < nr_requests) {
419 		struct request *rq;
420 
421 		rq = kmem_cache_alloc(request_cachep, SLAB_ATOMIC);
422 		if (rq == NULL)
423 			break;
424 		memset(rq, 0, sizeof(*rq));
425 		rq->rq_status = RQ_INACTIVE;
426  		list_add(&rq->queue, &q->rq.free);
427  		q->rq.count++;
428 
429 		q->nr_requests++;
430 	}
431 
432  	/*
433  	 * Wakeup waiters after both one quarter of the
434  	 * max-in-flight queue and one quarter of the requests
435  	 * are available again.
436  	 */
437 
438 	q->batch_requests = q->nr_requests / 4;
439 	if (q->batch_requests > 32)
440 		q->batch_requests = 32;
441  	q->batch_sectors = max_queue_sectors / 4;
442 
443  	q->max_queue_sectors = max_queue_sectors;
444 
445  	BUG_ON(!q->batch_sectors);
446  	atomic_set(&q->nr_sectors, 0);
447 
448 	spin_unlock_irqrestore(&io_request_lock, flags);
449 	return q->nr_requests;
450 }
451 
452 static void blk_init_free_list(request_queue_t *q)
453 {
454 	struct sysinfo si;
455 	int megs;		/* Total memory, in megabytes */
456  	int nr_requests, max_queue_sectors = MAX_QUEUE_SECTORS;
457 
458  	INIT_LIST_HEAD(&q->rq.free);
459 	q->rq.count = 0;
460 	q->rq.pending[READ] = q->rq.pending[WRITE] = 0;
461 	q->nr_requests = 0;
462 
463 	si_meminfo(&si);
464 	megs = si.totalram >> (20 - PAGE_SHIFT);
465  	nr_requests = MAX_NR_REQUESTS;
466  	if (megs < 30) {
467   		nr_requests /= 2;
468  		max_queue_sectors /= 2;
469  	}
470  	/* notice early if anybody screwed the defaults */
471  	BUG_ON(!nr_requests);
472  	BUG_ON(!max_queue_sectors);
473 
474  	blk_grow_request_list(q, nr_requests, max_queue_sectors);
475 
476  	init_waitqueue_head(&q->wait_for_requests);
477 
478 	spin_lock_init(&q->queue_lock);
479 }
480 
481 static int __make_request(request_queue_t * q, int rw, struct buffer_head * bh);
482 
483 /**
484  * blk_init_queue  - prepare a request queue for use with a block device
485  * @q:    The &request_queue_t to be initialised
486  * @rfn:  The function to be called to process requests that have been
487  *        placed on the queue.
488  *
489  * Description:
490  *    If a block device wishes to use the standard request handling procedures,
491  *    which sorts requests and coalesces adjacent requests, then it must
492  *    call blk_init_queue().  The function @rfn will be called when there
493  *    are requests on the queue that need to be processed.  If the device
494  *    supports plugging, then @rfn may not be called immediately when requests
495  *    are available on the queue, but may be called at some time later instead.
496  *    Plugged queues are generally unplugged when a buffer belonging to one
497  *    of the requests on the queue is needed, or due to memory pressure.
498  *
499  *    @rfn is not required, or even expected, to remove all requests off the
500  *    queue, but only as many as it can handle at a time.  If it does leave
501  *    requests on the queue, it is responsible for arranging that the requests
502  *    get dealt with eventually.
503  *
504  *    The global spinlock &io_request_lock must be held while manipulating the
505  *    requests on the request queue.
506  *
507  *    The request on the head of the queue is by default assumed to be
508  *    potentially active, and it is not considered for re-ordering or merging
509  *    whenever the given queue is unplugged. This behaviour can be changed with
510  *    blk_queue_headactive().
511  *
512  * Note:
513  *    blk_init_queue() must be paired with a blk_cleanup_queue() call
514  *    when the block device is deactivated (such as at module unload).
515  **/
516 void blk_init_queue(request_queue_t * q, request_fn_proc * rfn)
517 {
518 	INIT_LIST_HEAD(&q->queue_head);
519 	elevator_init(&q->elevator, ELEVATOR_LINUS);
520 	blk_init_free_list(q);
521 	q->request_fn     	= rfn;
522 	q->back_merge_fn       	= ll_back_merge_fn;
523 	q->front_merge_fn      	= ll_front_merge_fn;
524 	q->merge_requests_fn	= ll_merge_requests_fn;
525 	q->make_request_fn	= __make_request;
526 	q->plug_tq.sync		= 0;
527 	q->plug_tq.routine	= &generic_unplug_device;
528 	q->plug_tq.data		= q;
529 	q->plugged        	= 0;
530 	q->can_throttle		= 0;
531 
532 	/*
533 	 * These booleans describe the queue properties.  We set the
534 	 * default (and most common) values here.  Other drivers can
535 	 * use the appropriate functions to alter the queue properties
536 	 * as needed.
537 	 */
538 	q->plug_device_fn 	= generic_plug_device;
539 	q->head_active    	= 1;
540 
541 	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
542 }
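/*
 * Illustrative sketch (not part of this file): a conventional driver
 * using the standard queue.  The request function is called with
 * io_request_lock held; it transfers the current chunk of the request
 * at the head of the queue and retires the request once all of its
 * buffers are done.  mydev_transfer() and MYDEV_MAJOR are hypothetical.
 */
#if 0
extern int mydev_transfer(struct request *req);	/* moves req->current_nr_sectors
						 * sectors at req->buffer; returns
						 * 0 if the hardware is busy */

static void mydev_request_fn(request_queue_t *q)
{
	struct request *req;

	while (!list_empty(&q->queue_head)) {
		req = blkdev_entry_next_request(&q->queue_head);
		if (!mydev_transfer(req))
			return;			/* retry on next unplug/irq */
		if (!end_that_request_first(req, 1, "mydev")) {
			blkdev_dequeue_request(req);
			end_that_request_last(req);
		}
	}
}

static int __init mydev_init(void)
{
	blk_init_queue(BLK_DEFAULT_QUEUE(MYDEV_MAJOR), mydev_request_fn);
	return 0;
}
#endif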
543 
544 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queue);
545 /*
546  * Get a free request. io_request_lock must be held and interrupts
547  * disabled on the way in.  Returns NULL if there are no free requests.
548  */
549 static struct request *get_request(request_queue_t *q, int rw)
550 {
551 	struct request *rq = NULL;
552 	struct request_list *rl = &q->rq;
553 
554 	if (blk_oversized_queue(q)) {
555 		int rlim = q->nr_requests >> 5;
556 
557 		if (rlim < 4)
558 			rlim = 4;
559 
560 		/*
561 		 * if it's a write, or we have more than a handful of reads
562 		 * pending, bail out
563 		 */
564 		if ((rw == WRITE) || (rw == READ && rl->pending[READ] > rlim))
565 			return NULL;
566 		if (blk_oversized_queue_reads(q))
567 			return NULL;
568 	}
569 
570 	if (!list_empty(&rl->free)) {
571 		rq = blkdev_free_rq(&rl->free);
572 		list_del(&rq->queue);
573 		rl->count--;
574 		rl->pending[rw]++;
575 		rq->rq_status = RQ_ACTIVE;
576 		rq->cmd = rw;
577 		rq->special = NULL;
578 		rq->io_account = 0;
579 		rq->q = q;
580 	}
581 
582 	return rq;
583 }
584 
585 /*
586  * Here's the request allocation design, low latency version:
587  *
588  * 1: Blocking on request exhaustion is a key part of I/O throttling.
589  *
590  * 2: We want to be `fair' to all requesters.  We must avoid starvation, and
591  *    attempt to ensure that all requesters sleep for a similar duration.  Hence
592  *    no stealing requests when there are other processes waiting.
593  *
594  * There used to be more here, attempting to allow a process to send in a
595  * number of requests once it has woken up.  But, there's no way to
596  * tell if a process has just been woken up, or if it is a new process
597  * coming in to steal requests from the waiters.  So, we give up and force
598  * everyone to wait fairly.
599  *
600  * So here's what we do:
601  *
602  *    a) A READA requester fails if free_requests < batch_requests
603  *
604  *       We don't want READA requests to prevent sleepers from ever
605  *       waking.  Note that READA is used extremely rarely - a few
606  *       filesystems use it for directory readahead.
607  *
608  *  When a process wants a new request:
609  *
610  *    b) If free_requests == 0, the requester sleeps in FIFO manner, and
611  *       the queue full condition is set.  The full condition is not
612  *       cleared until there are no longer any waiters.  Once the full
613  *       condition is set, all new io must wait, hopefully for a very
614  *       short period of time.
615  *
616  *  When a request is released:
617  *
618  *    c) If free_requests < batch_requests, do nothing.
619  *
620  *    d) If free_requests >= batch_requests, wake up a single waiter.
621  *
622  *   As each waiter gets a request, he wakes another waiter.  We do this
623  *   to prevent a race where an unplug might get run before a request makes
624  *   its way onto the queue.  The result is a cascade of wakeups, so delaying
625  *   the initial wakeup until we've got batch_requests available helps avoid
626  *   wakeups where there aren't any requests available yet.
627  */
628 
629 static struct request *__get_request_wait(request_queue_t *q, int rw)
630 {
631 	register struct request *rq;
632 	DECLARE_WAITQUEUE(wait, current);
633 
634 	add_wait_queue_exclusive(&q->wait_for_requests, &wait);
635 
636 	do {
637 		set_current_state(TASK_UNINTERRUPTIBLE);
638 		spin_lock_irq(&io_request_lock);
639 		if (blk_oversized_queue(q) || q->rq.count == 0) {
640 			__generic_unplug_device(q);
641 			spin_unlock_irq(&io_request_lock);
642 			schedule();
643 			spin_lock_irq(&io_request_lock);
644 		}
645 		rq = get_request(q, rw);
646 		spin_unlock_irq(&io_request_lock);
647 	} while (rq == NULL);
648 	remove_wait_queue(&q->wait_for_requests, &wait);
649 	current->state = TASK_RUNNING;
650 
651 	return rq;
652 }
653 
654 static void get_request_wait_wakeup(request_queue_t *q, int rw)
655 {
656 	/*
657 	 * avoid losing an unplug if a second __get_request_wait did the
658 	 * generic_unplug_device while our __get_request_wait was running
659 	 * w/o the queue_lock held and w/ our request out of the queue.
660 	 */
661 	if (waitqueue_active(&q->wait_for_requests))
662 		wake_up(&q->wait_for_requests);
663 }
664 
665 /* RO fail safe mechanism */
666 
667 static long ro_bits[MAX_BLKDEV][8];
668 
669 int is_read_only(kdev_t dev)
670 {
671 	int minor,major;
672 
673 	major = MAJOR(dev);
674 	minor = MINOR(dev);
675 	if (major < 0 || major >= MAX_BLKDEV) return 0;
676 	return ro_bits[major][minor >> 5] & (1 << (minor & 31));
677 }
678 
679 void set_device_ro(kdev_t dev,int flag)
680 {
681 	int minor,major;
682 
683 	major = MAJOR(dev);
684 	minor = MINOR(dev);
685 	if (major < 0 || major >= MAX_BLKDEV) return;
686 	if (flag) ro_bits[major][minor >> 5] |= 1 << (minor & 31);
687 	else ro_bits[major][minor >> 5] &= ~(1 << (minor & 31));
688 }
689 
690 inline void drive_stat_acct (kdev_t dev, int rw,
691 				unsigned long nr_sectors, int new_io)
692 {
693 	unsigned int major = MAJOR(dev);
694 	unsigned int index;
695 
696 	index = disk_index(dev);
697 	if ((index >= DK_MAX_DISK) || (major >= DK_MAX_MAJOR))
698 		return;
699 
700 	kstat.dk_drive[major][index] += new_io;
701 	if (rw == READ) {
702 		kstat.dk_drive_rio[major][index] += new_io;
703 		kstat.dk_drive_rblk[major][index] += nr_sectors;
704 	} else if (rw == WRITE) {
705 		kstat.dk_drive_wio[major][index] += new_io;
706 		kstat.dk_drive_wblk[major][index] += nr_sectors;
707 	} else
708 		printk(KERN_ERR "drive_stat_acct: cmd not R/W?\n");
709 }
710 
711 #ifdef CONFIG_BLK_STATS
712 /*
713  * Return up to two hd_structs on which to do IO accounting for a given
714  * request.
715  *
716  * On a partitioned device, we want to account both against the partition
717  * and against the whole disk.
718  */
719 static void locate_hd_struct(struct request *req,
720 			     struct hd_struct **hd1,
721 			     struct hd_struct **hd2)
722 {
723 	struct gendisk *gd;
724 
725 	*hd1 = NULL;
726 	*hd2 = NULL;
727 
728 	gd = get_gendisk(req->rq_dev);
729 	if (gd && gd->part) {
730 		/* Mask out the partition bits: account for the entire disk */
731 		int devnr = MINOR(req->rq_dev) >> gd->minor_shift;
732 		int whole_minor = devnr << gd->minor_shift;
733 
734 		*hd1 = &gd->part[whole_minor];
735 		if (whole_minor != MINOR(req->rq_dev))
736 			*hd2= &gd->part[MINOR(req->rq_dev)];
737 	}
738 }
739 
740 /*
741  * Round off the performance stats on an hd_struct.
742  *
743  * The average IO queue length and utilisation statistics are maintained
744  * by observing the current state of the queue length and the amount of
745  * time it has been in this state for.
746  * Normally, that accounting is done on IO completion, but that can result
747  * in more than a second's worth of IO being accounted for within any one
748  * second, leading to >100% utilisation.  To deal with that, we do a
749  * round-off before returning the results when reading /proc/partitions,
750  * accounting immediately for all queue usage up to the current jiffies and
751  * restarting the counters again.
752  */
753 void disk_round_stats(struct hd_struct *hd)
754 {
755 	unsigned long now = jiffies;
756 
757 	hd->aveq += (hd->ios_in_flight * (jiffies - hd->last_queue_change));
758 	hd->last_queue_change = now;
759 
760 	if (hd->ios_in_flight)
761 		hd->io_ticks += (now - hd->last_idle_time);
762 	hd->last_idle_time = now;
763 }
764 
765 static inline void down_ios(struct hd_struct *hd)
766 {
767 	disk_round_stats(hd);
768 	--hd->ios_in_flight;
769 }
770 
771 static inline void up_ios(struct hd_struct *hd)
772 {
773 	disk_round_stats(hd);
774 	++hd->ios_in_flight;
775 }
776 
777 static void account_io_start(struct hd_struct *hd, struct request *req,
778 			     int merge, int sectors)
779 {
780 	switch (req->cmd) {
781 	case READ:
782 		if (merge)
783 			hd->rd_merges++;
784 		hd->rd_sectors += sectors;
785 		break;
786 	case WRITE:
787 		if (merge)
788 			hd->wr_merges++;
789 		hd->wr_sectors += sectors;
790 		break;
791 	}
792 	if (!merge)
793 		up_ios(hd);
794 }
795 
796 static void account_io_end(struct hd_struct *hd, struct request *req)
797 {
798 	unsigned long duration = jiffies - req->start_time;
799 	switch (req->cmd) {
800 	case READ:
801 		hd->rd_ticks += duration;
802 		hd->rd_ios++;
803 		break;
804 	case WRITE:
805 		hd->wr_ticks += duration;
806 		hd->wr_ios++;
807 		break;
808 	}
809 	down_ios(hd);
810 }
811 
812 void req_new_io(struct request *req, int merge, int sectors)
813 {
814 	struct hd_struct *hd1, *hd2;
815 
816 	locate_hd_struct(req, &hd1, &hd2);
817 	req->io_account = 1;
818 	if (hd1)
819 		account_io_start(hd1, req, merge, sectors);
820 	if (hd2)
821 		account_io_start(hd2, req, merge, sectors);
822 }
823 
824 void req_merged_io(struct request *req)
825 {
826 	struct hd_struct *hd1, *hd2;
827 
828 	if (unlikely(req->io_account == 0))
829 		return;
830 	locate_hd_struct(req, &hd1, &hd2);
831 	if (hd1)
832 		down_ios(hd1);
833 	if (hd2)
834 		down_ios(hd2);
835 }
836 
837 void req_finished_io(struct request *req)
838 {
839 	struct hd_struct *hd1, *hd2;
840 
841 	if (unlikely(req->io_account == 0))
842 		return;
843 	locate_hd_struct(req, &hd1, &hd2);
844 	if (hd1)
845 		account_io_end(hd1, req);
846 	if (hd2)
847 		account_io_end(hd2, req);
848 }
849 EXPORT_SYMBOL(req_finished_io);
850 #endif /* CONFIG_BLK_STATS */
851 
852 /*
853  * add-request adds a request to the linked list.
854  * io_request_lock is held and interrupts disabled, as we muck with the
855  * request queue list.
856  *
857  * By this point, req->cmd is always either READ or WRITE, never READA,
858  * which is important for drive_stat_acct() above.
859  */
860 static inline void add_request(request_queue_t * q, struct request * req,
861 			       struct list_head *insert_here)
862 {
863 	drive_stat_acct(req->rq_dev, req->cmd, req->nr_sectors, 1);
864 
865 	if (!q->plugged && q->head_active && insert_here == &q->queue_head) {
866 		spin_unlock_irq(&io_request_lock);
867 		BUG();
868 	}
869 
870 	/*
871 	 * elevator indicated where it wants this request to be
872 	 * inserted at elevator_merge time
873 	 */
874 	list_add(&req->queue, insert_here);
875 }
876 
877 /*
878  * Must be called with io_request_lock held and interrupts disabled
879  */
880 void blkdev_release_request(struct request *req)
881 {
882 	request_queue_t *q = req->q;
883 
884 	req->rq_status = RQ_INACTIVE;
885 	req->q = NULL;
886 
887 	/*
888 	 * Request may not have originated from ll_rw_blk. If not,
889 	 * assume it has free buffers and check waiters
890 	 */
891 	if (q) {
892 		struct request_list *rl = &q->rq;
893 		int oversized_batch = 0;
894 
895 		if (q->can_throttle)
896 			oversized_batch = blk_oversized_queue_batch(q);
897 		rl->count++;
898 		/*
899 		 * paranoia check
900 		 */
901 		if (req->cmd == READ || req->cmd == WRITE)
902 			rl->pending[req->cmd]--;
903 		if (rl->pending[READ] > q->nr_requests)
904 			printk("blk: reads: %u\n", rl->pending[READ]);
905 		if (rl->pending[WRITE] > q->nr_requests)
906 			printk("blk: writes: %u\n", rl->pending[WRITE]);
907 		if (rl->pending[READ] + rl->pending[WRITE] > q->nr_requests)
908 			printk("blk: r/w: %u + %u > %u\n", rl->pending[READ], rl->pending[WRITE], q->nr_requests);
909 		list_add(&req->queue, &rl->free);
910 		if (rl->count >= q->batch_requests && !oversized_batch) {
911 			smp_mb();
912 			if (waitqueue_active(&q->wait_for_requests))
913 				wake_up(&q->wait_for_requests);
914 		}
915 	}
916 }
917 
918 /*
919  * Has to be called with the request spinlock acquired
920  */
921 static void attempt_merge(request_queue_t * q,
922 			  struct request *req,
923 			  int max_sectors,
924 			  int max_segments)
925 {
926 	struct request *next;
927 
928 	next = blkdev_next_request(req);
929 	if (req->sector + req->nr_sectors != next->sector)
930 		return;
931 	if (req->cmd != next->cmd
932 	    || req->rq_dev != next->rq_dev
933 	    || req->nr_sectors + next->nr_sectors > max_sectors
934 	    || next->waiting)
935 		return;
936 	/*
937 	 * If we are not allowed to merge these requests, then
938 	 * return.  If we are allowed to merge, then the count
939 	 * will have been updated to the appropriate number,
940 	 * and we shouldn't do it here too.
941 	 */
942 	if (!q->merge_requests_fn(q, req, next, max_segments))
943 		return;
944 
945 	q->elevator.elevator_merge_req_fn(req, next);
946 
947 	/* At this point we have either done a back merge
948 	 * or front merge. We need the smaller start_time of
949 	 * the merged requests to be the current request
950 	 * for accounting purposes.
951 	 */
952 	if (time_after(req->start_time, next->start_time))
953 		req->start_time = next->start_time;
954 
955 	req->bhtail->b_reqnext = next->bh;
956 	req->bhtail = next->bhtail;
957 	req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
958 	list_del(&next->queue);
959 
960 	/* One last thing: we have removed a request, so we now have one
961 	   less expected IO to complete for accounting purposes. */
962 	req_merged_io(req);
963 
964 	blkdev_release_request(next);
965 }
966 
967 static inline void attempt_back_merge(request_queue_t * q,
968 				      struct request *req,
969 				      int max_sectors,
970 				      int max_segments)
971 {
972 	if (&req->queue == q->queue_head.prev)
973 		return;
974 	attempt_merge(q, req, max_sectors, max_segments);
975 }
976 
977 static inline void attempt_front_merge(request_queue_t * q,
978 				       struct list_head * head,
979 				       struct request *req,
980 				       int max_sectors,
981 				       int max_segments)
982 {
983 	struct list_head * prev;
984 
985 	prev = req->queue.prev;
986 	if (head == prev)
987 		return;
988 	attempt_merge(q, blkdev_entry_to_request(prev), max_sectors, max_segments);
989 }
990 
991 static int __make_request(request_queue_t * q, int rw,
992 				  struct buffer_head * bh)
993 {
994 	unsigned int sector, count, sync;
995 	int max_segments = MAX_SEGMENTS;
996 	struct request * req, *freereq = NULL;
997 	int rw_ahead, max_sectors, el_ret;
998 	struct list_head *head, *insert_here;
999 	int latency;
1000 	elevator_t *elevator = &q->elevator;
1001 	int should_wake = 0;
1002 
1003 	count = bh->b_size >> 9;
1004 	sector = bh->b_rsector;
1005 	sync = test_and_clear_bit(BH_Sync, &bh->b_state);
1006 
1007 	rw_ahead = 0;	/* normal case; gets changed below for READA */
1008 	switch (rw) {
1009 		case READA:
1010 #if 0	/* bread() misinterprets failed READA attempts as IO errors on SMP */
1011 			rw_ahead = 1;
1012 #endif
1013 			rw = READ;	/* drop into READ */
1014 		case READ:
1015 		case WRITE:
1016 			latency = elevator_request_latency(elevator, rw);
1017 			break;
1018 		default:
1019 			BUG();
1020 			goto end_io;
1021 	}
1022 
1023 	/* We'd better have a real physical mapping!
1024 	   Check this bit only if the buffer was dirty and just locked
1025 	   down by us so at this point flushpage will block and
1026 	   won't clear the mapped bit under us. */
1027 	if (!buffer_mapped(bh))
1028 		BUG();
1029 
1030 	/*
1031 	 * Temporary solution - in 2.5 this will be done by the lowlevel
1032 	 * driver. Create a bounce buffer if the buffer data points into
1033 	 * high memory - keep the original buffer otherwise.
1034 	 */
1035 	bh = blk_queue_bounce(q, rw, bh);
1036 
1037 /* look for a free request. */
1038 	/*
1039 	 * Try to coalesce the new request with old requests
1040 	 */
1041 	max_sectors = get_max_sectors(bh->b_rdev);
1042 
1043 	req = NULL;
1044 	head = &q->queue_head;
1045 	/*
1046 	 * Now we acquire the request spinlock, we have to be mega careful
1047 	 * not to schedule or do something nonatomic
1048 	 */
1049 	spin_lock_irq(&io_request_lock);
1050 
1051 again:
1052 	insert_here = head->prev;
1053 
1054 	if (list_empty(head)) {
1055 		q->plug_device_fn(q, bh->b_rdev); /* is atomic */
1056 		goto get_rq;
1057 	} else if (q->head_active && !q->plugged)
1058 		head = head->next;
1059 
1060 	el_ret = elevator->elevator_merge_fn(q, &req, head, bh, rw,max_sectors);
1061 	switch (el_ret) {
1062 
1063 		case ELEVATOR_BACK_MERGE:
1064 			if (!q->back_merge_fn(q, req, bh, max_segments)) {
1065 				insert_here = &req->queue;
1066 				break;
1067 			}
1068 			req->bhtail->b_reqnext = bh;
1069 			req->bhtail = bh;
1070 			req->nr_sectors = req->hard_nr_sectors += count;
1071 			blk_started_io(count);
1072 			blk_started_sectors(req, count);
1073 			drive_stat_acct(req->rq_dev, req->cmd, count, 0);
1074 			req_new_io(req, 1, count);
1075 			attempt_back_merge(q, req, max_sectors, max_segments);
1076 			goto out;
1077 
1078 		case ELEVATOR_FRONT_MERGE:
1079 			if (!q->front_merge_fn(q, req, bh, max_segments)) {
1080 				insert_here = req->queue.prev;
1081 				break;
1082 			}
1083 			bh->b_reqnext = req->bh;
1084 			req->bh = bh;
1085 			/*
1086 			 * may not be valid, but queues not having bounce
1087 			 * enabled for highmem pages must not look at
1088 			 * ->buffer anyway
1089 			 */
1090 			req->buffer = bh->b_data;
1091 			req->current_nr_sectors = req->hard_cur_sectors = count;
1092 			req->sector = req->hard_sector = sector;
1093 			req->nr_sectors = req->hard_nr_sectors += count;
1094 			blk_started_io(count);
1095 			blk_started_sectors(req, count);
1096 			drive_stat_acct(req->rq_dev, req->cmd, count, 0);
1097 			req_new_io(req, 1, count);
1098 			attempt_front_merge(q, head, req, max_sectors, max_segments);
1099 			goto out;
1100 
1101 		/*
1102 		 * elevator says don't/can't merge. get new request
1103 		 */
1104 		case ELEVATOR_NO_MERGE:
1105 			/*
1106 			 * use elevator hints as to where to insert the
1107 			 * request. if no hints, just add it to the back
1108 			 * of the queue
1109 			 */
1110 			if (req)
1111 				insert_here = &req->queue;
1112 			break;
1113 
1114 		default:
1115 			printk("elevator returned crap (%d)\n", el_ret);
1116 			BUG();
1117 	}
1118 
1119 get_rq:
1120 	if (freereq) {
1121 		req = freereq;
1122 		freereq = NULL;
1123 	} else {
1124 		/*
1125 		 * See description above __get_request_wait()
1126 		 */
1127 		if (rw_ahead) {
1128 			if (q->rq.count < q->batch_requests || blk_oversized_queue_batch(q)) {
1129 				spin_unlock_irq(&io_request_lock);
1130 				goto end_io;
1131 			}
1132 			req = get_request(q, rw);
1133 			if (req == NULL)
1134 				BUG();
1135 		} else {
1136 			req = get_request(q, rw);
1137 			if (req == NULL) {
1138 				spin_unlock_irq(&io_request_lock);
1139 				freereq = __get_request_wait(q, rw);
1140 				head = &q->queue_head;
1141 				spin_lock_irq(&io_request_lock);
1142 				should_wake = 1;
1143 				goto again;
1144 			}
1145 		}
1146 	}
1147 
1148 /* fill up the request-info, and add it to the queue */
1149 	req->elevator_sequence = latency;
1150 	req->cmd = rw;
1151 	req->errors = 0;
1152 	req->hard_sector = req->sector = sector;
1153 	req->hard_nr_sectors = req->nr_sectors = count;
1154 	req->current_nr_sectors = req->hard_cur_sectors = count;
1155 	req->nr_segments = 1; /* Always 1 for a new request. */
1156 	req->nr_hw_segments = 1; /* Always 1 for a new request. */
1157 	req->buffer = bh->b_data;
1158 	req->waiting = NULL;
1159 	req->bh = bh;
1160 	req->bhtail = bh;
1161 	req->rq_dev = bh->b_rdev;
1162 	req->start_time = jiffies;
1163 	req_new_io(req, 0, count);
1164 	blk_started_io(count);
1165 	blk_started_sectors(req, count);
1166 	add_request(q, req, insert_here);
1167 out:
1168 	if (freereq)
1169 		blkdev_release_request(freereq);
1170 	if (should_wake)
1171 		get_request_wait_wakeup(q, rw);
1172 	if (sync)
1173 		__generic_unplug_device(q);
1174 	spin_unlock_irq(&io_request_lock);
1175 	return 0;
1176 end_io:
1177 	bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
1178 	return 0;
1179 }
1180 
1181 /**
1182  * generic_make_request: hand a buffer head to its device driver for I/O
1183  * @rw:  READ, WRITE, or READA - what sort of I/O is desired.
1184  * @bh:  The buffer head describing the location in memory and on the device.
1185  *
1186  * generic_make_request() is used to make I/O requests of block
1187  * devices. It is passed a &struct buffer_head and a &rw value.  The
1188  * %READ and %WRITE options are (hopefully) obvious in meaning.  The
1189  * %READA value means that a read is required, but that the driver is
1190  * free to fail the request if, for example, it cannot get needed
1191  * resources immediately.
1192  *
1193  * generic_make_request() does not return any status.  The
1194  * success/failure status of the request, along with notification of
1195  * completion, is delivered asynchronously through the bh->b_end_io
1196  * function described (one day) elsewhere.
1197  *
1198  * The caller of generic_make_request must make sure that b_page,
1199  * b_data, b_size are set to describe the memory buffer, that b_rdev
1200  * and b_rsector are set to describe the device address, and the
1201  * b_end_io and optionally b_private are set to describe how
1202  * completion notification should be signaled.  BH_Mapped should also
1203  * be set (to confirm that b_dev and b_blocknr are valid).
1204  *
1205  * generic_make_request and the drivers it calls may use b_reqnext,
1206  * and may change b_rdev and b_rsector.  So the values of these fields
1207  * should NOT be depended on after the call to generic_make_request.
1208  * Because of this, the caller should record the device address
1209  * information in b_dev and b_blocknr.
1210  *
1211  * Apart from those fields mentioned above, no other fields, and in
1212  * particular, no other flags, are changed by generic_make_request or
1213  * any lower level drivers.
1214  * */
1215 void generic_make_request (int rw, struct buffer_head * bh)
1216 {
1217 	int major = MAJOR(bh->b_rdev);
1218 	int minorsize = 0;
1219 	request_queue_t *q;
1220 
1221 	if (!bh->b_end_io)
1222 		BUG();
1223 
1224 	/* Test device size, when known. */
1225 	if (blk_size[major])
1226 		minorsize = blk_size[major][MINOR(bh->b_rdev)];
1227 	if (minorsize) {
1228 		unsigned long maxsector = (minorsize << 1) + 1;
1229 		unsigned long sector = bh->b_rsector;
1230 		unsigned int count = bh->b_size >> 9;
1231 
1232 		if (maxsector < count || maxsector - count < sector) {
1233 			/* Yecch */
1234 			bh->b_state &= ~(1 << BH_Dirty);
1235 
1236 			/* This may well happen - the kernel calls bread()
1237 			   without checking the size of the device, e.g.,
1238 			   when mounting a device. */
1239 			printk(KERN_INFO
1240 			       "attempt to access beyond end of device\n");
1241 			printk(KERN_INFO "%s: rw=%d, want=%ld, limit=%d\n",
1242 			       kdevname(bh->b_rdev), rw,
1243 			       (sector + count)>>1, minorsize);
1244 
1245 			bh->b_end_io(bh, 0);
1246 			return;
1247 		}
1248 	}
1249 
1250 	/*
1251 	 * Resolve the mapping until finished. (drivers are
1252 	 * still free to implement/resolve their own stacking
1253 	 * by explicitly returning 0)
1254 	 */
1255 	/* NOTE: we don't repeat the blk_size check for each new device.
1256 	 * Stacking drivers are expected to know what they are doing.
1257 	 */
1258 	do {
1259 		q = __blk_get_queue(bh->b_rdev);
1260 		if (!q) {
1261 			printk(KERN_ERR
1262 			       "generic_make_request: Trying to access "
1263 			       "nonexistent block-device %s (%ld)\n",
1264 			       kdevname(bh->b_rdev), bh->b_rsector);
1265 			buffer_IO_error(bh);
1266 			break;
1267 		}
1268 	} while (q->make_request_fn(q, rw, bh));
1269 }
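/*
 * Illustrative sketch (not part of this file): a raw caller of
 * generic_make_request(), filling in the fields listed in the comment
 * above.  The buffer_head is assumed to already have b_page/b_data and
 * b_size describing a mapped buffer; my_end_io() is hypothetical.
 */
#if 0
static void my_end_io(struct buffer_head *bh, int uptodate)
{
	mark_buffer_uptodate(bh, uptodate);
	unlock_buffer(bh);		/* also wakes anyone in wait_on_buffer() */
}

static void my_submit(struct buffer_head *bh, kdev_t dev, unsigned long sector)
{
	lock_buffer(bh);
	bh->b_rdev    = dev;		/* device address...       */
	bh->b_rsector = sector;		/* ...in 512-byte sectors  */
	bh->b_end_io  = my_end_io;	/* completion notification */
	set_bit(BH_Mapped, &bh->b_state);
	generic_make_request(READ, bh);
}
#endif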
1270 
1271 
1272 /**
1273  * submit_bh: submit a buffer_head to the block device later for I/O
1274  * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
1275  * @bh: The &struct buffer_head which describes the I/O
1276  *
1277  * submit_bh() is very similar in purpose to generic_make_request(), and
1278  * uses that function to do most of the work.
1279  *
1280  * The extra functionality provided by submit_bh is to determine
1281  * b_rsector from b_blocknr and b_size, and to set b_rdev from b_dev.
1282  * This is appropriate for IO requests that come from the buffer
1283  * cache and page cache which (currently) always use aligned blocks.
1284  */
1285 void submit_bh(int rw, struct buffer_head * bh)
1286 {
1287 	int count = bh->b_size >> 9;
1288 
1289 	if (!test_bit(BH_Lock, &bh->b_state))
1290 		BUG();
1291 
1292 	set_bit(BH_Req, &bh->b_state);
1293 	set_bit(BH_Launder, &bh->b_state);
1294 
1295 	/*
1296 	 * First step, 'identity mapping' - RAID or LVM might
1297 	 * further remap this.
1298 	 */
1299 	bh->b_rdev = bh->b_dev;
1300 	bh->b_rsector = bh->b_blocknr * count;
1301 
1302 	get_bh(bh);
1303 	generic_make_request(rw, bh);
1304 
1305 	/* fix race condition with wait_on_buffer() */
1306 	smp_mb(); /* spin_unlock may have inclusive semantics */
1307 	if (waitqueue_active(&bh->b_wait))
1308 		wake_up(&bh->b_wait);
1309 
1310 	if (block_dump)
1311 		printk(KERN_DEBUG "%s: %s block %lu/%u on %s\n", current->comm, rw == WRITE ? "WRITE" : "READ", bh->b_rsector, count, kdevname(bh->b_rdev));
1312 
1313 	put_bh(bh);
1314 	switch (rw) {
1315 		case WRITE:
1316 			kstat.pgpgout += count;
1317 			break;
1318 		default:
1319 			kstat.pgpgin += count;
1320 			break;
1321 	}
1322 }
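/*
 * Illustrative sketch (not part of this file): submitting a buffer-cache
 * block with submit_bh().  getblk() hands back a mapped buffer with
 * b_dev/b_blocknr/b_size already set, so only locking, the completion
 * handler and the submission are left to the caller.
 */
#if 0
static void my_read_block(kdev_t dev, int block, int size)
{
	struct buffer_head *bh = getblk(dev, block, size);

	lock_buffer(bh);
	if (buffer_uptodate(bh)) {		/* raced with another reader */
		unlock_buffer(bh);
		brelse(bh);
		return;
	}
	bh->b_end_io = end_buffer_io_sync;	/* mark uptodate, unlock, wake */
	submit_bh(READ, bh);
	wait_on_buffer(bh);
	brelse(bh);
}
#endif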
1323 
1324 /**
1325  * ll_rw_block: low-level access to block devices
1326  * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
1327  * @nr: number of &struct buffer_heads in the array
1328  * @bhs: array of pointers to &struct buffer_head
1329  *
1330  * ll_rw_block() takes an array of pointers to &struct buffer_heads,
1331  * and requests an I/O operation on them, either a %READ or a %WRITE.
1332  * The third %READA option is described in the documentation for
1333  * generic_make_request() which ll_rw_block() calls.
1334  *
1335  * This function provides extra functionality that is not in
1336  * generic_make_request() that is relevant to buffers in the buffer
1337  * cache or page cache.  In particular it drops any buffer that it
1338  * cannot get a lock on (with the BH_Lock state bit), any buffer that
1339  * appears to be clean when doing a write request, and any buffer that
1340  *  appears to be up-to-date when doing a read request.  Further it marks
1341  *  as clean buffers that are processed for writing (the buffer cache
1342  *  won't assume that they are actually clean until the buffer gets
1343  * unlocked).
1344  *
1345  * ll_rw_block sets b_end_io to a simple completion handler that marks
1346  * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
1347  * any waiters.  A client that needs a more interesting completion
1348  * routine should call submit_bh() (or generic_make_request())
1349  * directly.
1350  *
1351  * Caveat:
1352  *  All of the buffers must be for the same device, and must also be
1353  *  of the current approved size for the device.  */
1354 
1355 void ll_rw_block(int rw, int nr, struct buffer_head * bhs[])
1356 {
1357 	unsigned int major;
1358 	int correct_size;
1359 	int i;
1360 
1361 	if (!nr)
1362 		return;
1363 
1364 	major = MAJOR(bhs[0]->b_dev);
1365 
1366 	/* Determine correct block size for this device. */
1367 	correct_size = get_hardsect_size(bhs[0]->b_dev);
1368 
1369 	/* Verify requested block sizes. */
1370 	for (i = 0; i < nr; i++) {
1371 		struct buffer_head *bh = bhs[i];
1372 		if (bh->b_size % correct_size) {
1373 			printk(KERN_NOTICE "ll_rw_block: device %s: "
1374 			       "only %d-char blocks implemented (%u)\n",
1375 			       kdevname(bhs[0]->b_dev),
1376 			       correct_size, bh->b_size);
1377 			goto sorry;
1378 		}
1379 	}
1380 
1381 	if ((rw & WRITE) && is_read_only(bhs[0]->b_dev)) {
1382 		printk(KERN_NOTICE "Can't write to read-only device %s\n",
1383 		       kdevname(bhs[0]->b_dev));
1384 		goto sorry;
1385 	}
1386 
1387 	for (i = 0; i < nr; i++) {
1388 		struct buffer_head *bh = bhs[i];
1389 
1390 		lock_buffer(bh);
1391 
1392 		/* We have the buffer lock */
1393 		atomic_inc(&bh->b_count);
1394 		bh->b_end_io = end_buffer_io_sync;
1395 
1396 		switch(rw) {
1397 		case WRITE:
1398 			if (!atomic_set_buffer_clean(bh))
1399 				/* Hmmph! Nothing to write */
1400 				goto end_io;
1401 			__mark_buffer_clean(bh);
1402 			break;
1403 
1404 		case READA:
1405 		case READ:
1406 			if (buffer_uptodate(bh))
1407 				/* Hmmph! Already have it */
1408 				goto end_io;
1409 			break;
1410 		default:
1411 			BUG();
1412 	end_io:
1413 			bh->b_end_io(bh, test_bit(BH_Uptodate, &bh->b_state));
1414 			continue;
1415 		}
1416 
1417 		submit_bh(rw, bh);
1418 	}
1419 	return;
1420 
1421 sorry:
1422 	/* Make sure we don't get infinite dirty retries.. */
1423 	for (i = 0; i < nr; i++)
1424 		mark_buffer_clean(bhs[i]);
1425 }
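/*
 * Illustrative sketch (not part of this file): typical ll_rw_block()
 * usage, essentially what bread() in fs/buffer.c does -- submit, wait,
 * then check the result.
 */
#if 0
static struct buffer_head *my_bread(kdev_t dev, int block, int size)
{
	struct buffer_head *bh = getblk(dev, block, size);

	if (buffer_uptodate(bh))
		return bh;
	ll_rw_block(READ, 1, &bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	brelse(bh);			/* the read failed */
	return NULL;
}
#endif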
1426 
1427 #ifdef CONFIG_STRAM_SWAP
1428 extern int stram_device_init (void);
1429 #endif
1430 
1431 static void blk_writeback_timer(unsigned long data)
1432 {
1433 	wakeup_bdflush();
1434 	wakeup_kupdate();
1435 }
1436 
1437 /**
1438  * end_that_request_first - end I/O on one buffer.
1439  * @req:      the request being processed
1440  * @uptodate: 0 for I/O error
1441  * @name:     the name printed for an I/O error
1442  *
1443  * Description:
1444  *     Ends I/O on the first buffer attached to @req, and sets it up
1445  *     for the next buffer_head (if any) in the cluster.
1446  *
1447  * Return:
1448  *     0 - we are done with this request, call end_that_request_last()
1449  *     1 - still buffers pending for this request
1450  *
1451  * Caveat:
1452  *     Drivers implementing their own end_request handling must call
1453  *     blk_finished_io() appropriately.
1454  **/
1455 
1456 int end_that_request_first (struct request *req, int uptodate, char *name)
1457 {
1458 	struct buffer_head * bh;
1459 	int nsect;
1460 
1461 	req->errors = 0;
1462 	if (!uptodate)
1463 		printk("end_request: I/O error, dev %s (%s), sector %lu\n",
1464 			kdevname(req->rq_dev), name, req->sector);
1465 
1466 	if ((bh = req->bh) != NULL) {
1467 		nsect = bh->b_size >> 9;
1468 		blk_finished_io(nsect);
1469 		blk_finished_sectors(req, nsect);
1470 		req->bh = bh->b_reqnext;
1471 		bh->b_reqnext = NULL;
1472 		bh->b_end_io(bh, uptodate);
1473 		if ((bh = req->bh) != NULL) {
1474 			req->hard_sector += nsect;
1475 			req->hard_nr_sectors -= nsect;
1476 			req->sector = req->hard_sector;
1477 			req->nr_sectors = req->hard_nr_sectors;
1478 
1479 			req->current_nr_sectors = bh->b_size >> 9;
1480 			req->hard_cur_sectors = req->current_nr_sectors;
1481 			if (req->nr_sectors < req->current_nr_sectors) {
1482 				req->nr_sectors = req->current_nr_sectors;
1483 				printk("end_request: buffer-list destroyed\n");
1484 			}
1485 			req->buffer = bh->b_data;
1486 			return 1;
1487 		}
1488 	}
1489 	return 0;
1490 }
1491 
1492 extern int laptop_mode;
1493 
1494 void end_that_request_last(struct request *req)
1495 {
1496 	struct completion *waiting = req->waiting;
1497 
1498 	/*
1499 	 * schedule the writeout of pending dirty data when the disk is idle
1500 	 */
1501 	if (laptop_mode && req->cmd == READ)
1502 		mod_timer(&writeback_timer, jiffies + 5 * HZ);
1503 
1504 	req_finished_io(req);
1505 	blkdev_release_request(req);
1506 	if (waiting)
1507 		complete(waiting);
1508 }
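/*
 * Illustrative sketch (not part of this file): a driver's completion
 * path, e.g. from its interrupt handler.  Finish the buffer at the head
 * of the request; once no buffers remain, take the request off the
 * queue and retire it.  io_request_lock must be held with interrupts
 * disabled around end_that_request_last(), as noted above.
 */
#if 0
static void mydev_end_request(struct request *req, int uptodate)
{
	unsigned long flags;

	spin_lock_irqsave(&io_request_lock, flags);
	if (!end_that_request_first(req, uptodate, "mydev")) {
		blkdev_dequeue_request(req);
		end_that_request_last(req);
	}
	spin_unlock_irqrestore(&io_request_lock, flags);
}
#endif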
1509 
1510 int __init blk_dev_init(void)
1511 {
1512 	struct blk_dev_struct *dev;
1513 
1514 	request_cachep = kmem_cache_create("blkdev_requests",
1515 					   sizeof(struct request),
1516 					   0, SLAB_HWCACHE_ALIGN, NULL, NULL);
1517 
1518 	if (!request_cachep)
1519 		panic("Can't create request pool slab cache\n");
1520 
1521 	for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;)
1522 		dev->queue = NULL;
1523 
1524 	memset(ro_bits,0,sizeof(ro_bits));
1525 	memset(max_readahead, 0, sizeof(max_readahead));
1526 	memset(max_sectors, 0, sizeof(max_sectors));
1527 
1528 	blk_max_low_pfn = max_low_pfn - 1;
1529 	blk_max_pfn = max_pfn - 1;
1530 
1531 	init_timer(&writeback_timer);
1532 	writeback_timer.function = blk_writeback_timer;
1533 
1534 #ifdef CONFIG_AMIGA_Z2RAM
1535 	z2_init();
1536 #endif
1537 #ifdef CONFIG_STRAM_SWAP
1538 	stram_device_init();
1539 #endif
1540 #ifdef CONFIG_ISP16_CDI
1541 	isp16_init();
1542 #endif
1543 #ifdef CONFIG_BLK_DEV_PS2
1544 	ps2esdi_init();
1545 #endif
1546 #ifdef CONFIG_BLK_DEV_XD
1547 	xd_init();
1548 #endif
1549 #ifdef CONFIG_BLK_DEV_MFM
1550 	mfm_init();
1551 #endif
1552 #ifdef CONFIG_PARIDE
1553 	{ extern void paride_init(void); paride_init(); };
1554 #endif
1555 #ifdef CONFIG_MAC_FLOPPY
1556 	swim3_init();
1557 #endif
1558 #ifdef CONFIG_BLK_DEV_SWIM_IOP
1559 	swimiop_init();
1560 #endif
1561 #ifdef CONFIG_AMIGA_FLOPPY
1562 	amiga_floppy_init();
1563 #endif
1564 #ifdef CONFIG_ATARI_FLOPPY
1565 	atari_floppy_init();
1566 #endif
1567 #ifdef CONFIG_BLK_DEV_FD
1568 	floppy_init();
1569 #else
1570 #if defined(__i386__)	/* Do we even need this? */
1571 	outb_p(0xc, 0x3f2);
1572 #endif
1573 #endif
1574 #ifdef CONFIG_CDU31A
1575 	cdu31a_init();
1576 #endif
1577 #ifdef CONFIG_ATARI_ACSI
1578 	acsi_init();
1579 #endif
1580 #ifdef CONFIG_MCD
1581 	mcd_init();
1582 #endif
1583 #ifdef CONFIG_MCDX
1584 	mcdx_init();
1585 #endif
1586 #ifdef CONFIG_SBPCD
1587 	sbpcd_init();
1588 #endif
1589 #ifdef CONFIG_AZTCD
1590 	aztcd_init();
1591 #endif
1592 #ifdef CONFIG_CDU535
1593 	sony535_init();
1594 #endif
1595 #ifdef CONFIG_GSCD
1596 	gscd_init();
1597 #endif
1598 #ifdef CONFIG_CM206
1599 	cm206_init();
1600 #endif
1601 #ifdef CONFIG_OPTCD
1602 	optcd_init();
1603 #endif
1604 #ifdef CONFIG_SJCD
1605 	sjcd_init();
1606 #endif
1607 #ifdef CONFIG_APBLOCK
1608 	ap_init();
1609 #endif
1610 #ifdef CONFIG_DDV
1611 	ddv_init();
1612 #endif
1613 #ifdef CONFIG_MDISK
1614 	mdisk_init();
1615 #endif
1616 #ifdef CONFIG_DASD
1617 	dasd_init();
1618 #endif
1619 #if defined(CONFIG_S390_TAPE) && defined(CONFIG_S390_TAPE_BLOCK)
1620 	tapeblock_init();
1621 #endif
1622 #ifdef CONFIG_BLK_DEV_XPRAM
1623         xpram_init();
1624 #endif
1625 
1626 #ifdef CONFIG_SUN_JSFLASH
1627 	jsfd_init();
1628 #endif
1629 	return 0;
1630 };
1631 
1632 EXPORT_SYMBOL(io_request_lock);
1633 EXPORT_SYMBOL(end_that_request_first);
1634 EXPORT_SYMBOL(end_that_request_last);
1635 EXPORT_SYMBOL(blk_grow_request_list);
1636 EXPORT_SYMBOL(blk_init_queue);
1637 EXPORT_SYMBOL(blk_get_queue);
1638 EXPORT_SYMBOL(blk_cleanup_queue);
1639 EXPORT_SYMBOL(blk_queue_headactive);
1640 EXPORT_SYMBOL(blk_queue_throttle_sectors);
1641 EXPORT_SYMBOL(blk_queue_make_request);
1642 EXPORT_SYMBOL(generic_make_request);
1643 EXPORT_SYMBOL(blkdev_release_request);
1644 EXPORT_SYMBOL(generic_unplug_device);
1645 EXPORT_SYMBOL(blk_queue_bounce_limit);
1646 EXPORT_SYMBOL(blk_max_low_pfn);
1647 EXPORT_SYMBOL(blk_max_pfn);
1648 EXPORT_SYMBOL(blk_seg_merge_ok);
1649 EXPORT_SYMBOL(blk_nohighio);
1650