1 /*
2 * linux/fs/buffer.c
3 *
4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */
6
7 /*
8 * 'buffer.c' implements the buffer-cache functions. Race-conditions have
9 * been avoided by NEVER letting an interrupt change a buffer (except for the
10 * data, of course), but instead letting the caller do it.
11 */
12
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
14
15 /* Removed a lot of unnecessary code and simplified things now that
16 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
17 */
18
19 /* Speed up hash, lru, and free list operations. Use gfp() for allocating
20 * hash table, use SLAB cache for buffer heads. -DaveM
21 */
22
23 /* Added 32k buffer block sizes - these are required on older ARM systems.
24 * - RMK
25 */
26
27 /* Thread it... -DaveM */
28
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
30
31 #include <linux/config.h>
32 #include <linux/sched.h>
33 #include <linux/fs.h>
34 #include <linux/slab.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/swapctl.h>
39 #include <linux/smp_lock.h>
40 #include <linux/vmalloc.h>
41 #include <linux/blkdev.h>
42 #include <linux/sysrq.h>
43 #include <linux/file.h>
44 #include <linux/init.h>
45 #include <linux/quotaops.h>
46 #include <linux/iobuf.h>
47 #include <linux/highmem.h>
48 #include <linux/module.h>
49 #include <linux/completion.h>
50
51 #include <asm/uaccess.h>
52 #include <asm/io.h>
53 #include <asm/bitops.h>
54 #include <asm/mmu_context.h>
55
56 #define NR_RESERVED (10*MAX_BUF_PER_PAGE)
57 #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this
58 number of unused buffer heads */
59
60 /* Anti-deadlock ordering:
61 * lru_list_lock > hash_table_lock > unused_list_lock
62 */
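/*
 * Added illustration (not part of the original fs/buffer.c): a path that
 * needs more than one of these locks must take them in the order above
 * and release them in reverse, as remove_from_queues() below does:
 *
 *	spin_lock(&lru_list_lock);
 *	write_lock(&hash_table_lock);
 *	... unlink the buffer head ...
 *	write_unlock(&hash_table_lock);
 *	spin_unlock(&lru_list_lock);
 */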
63
64 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
65
66 /*
67 * Hash table gook..
68 */
69 static unsigned int bh_hash_mask;
70 static unsigned int bh_hash_shift;
71 static struct buffer_head **hash_table;
72 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
73
74 static struct buffer_head *lru_list[NR_LIST];
75
76 static spinlock_cacheline_t lru_list_lock_cacheline = {SPIN_LOCK_UNLOCKED};
77 #define lru_list_lock lru_list_lock_cacheline.lock
78
79 static int nr_buffers_type[NR_LIST];
80 static unsigned long size_buffers_type[NR_LIST];
81
82 static struct buffer_head * unused_list;
83 static int nr_unused_buffer_heads;
84 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
85 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
86
87 static int grow_buffers(kdev_t dev, unsigned long block, int size);
88 static int osync_buffers_list(struct list_head *);
89 static void __refile_buffer(struct buffer_head *);
90
91 /*
92 * A global sysctl-controlled flag which puts the machine into "laptop mode"
93 */
94 int laptop_mode;
95
96 static DECLARE_WAIT_QUEUE_HEAD(kupdate_wait);
97
98 /* This is used by some architectures to estimate available memory. */
99 atomic_t buffermem_pages = ATOMIC_INIT(0);
100
101 /* Here is the parameter block for the bdflush process. If you add or
102 * remove any of the parameters, make sure to update kernel/sysctl.c
103 * and the documentation at linux/Documentation/sysctl/vm.txt.
104 */
105
106 #define N_PARAM 9
107
108 /* The dummy values in this structure are left in there for compatibility
109 * with old programs that play with the /proc entries.
110 */
111 union bdflush_param {
112 struct {
113 int nfract; /* Percentage of buffer cache dirty to
114 activate bdflush */
115 int ndirty; /* Maximum number of dirty blocks to write out per
116 wake-cycle */
117 int dummy2; /* old "nrefill" */
118 int dummy3; /* unused */
119 int interval; /* jiffies delay between kupdate flushes */
120 int age_buffer; /* Time for normal buffer to age before we flush it */
121 int nfract_sync;/* Percentage of buffer cache dirty to
122 activate bdflush synchronously */
123 int nfract_stop_bdflush; /* Percentage of buffer cache dirty to stop bdflush */
124 int dummy5; /* unused */
125 } b_un;
126 unsigned int data[N_PARAM];
127 } bdf_prm = {{30, 500, 0, 0, 5*HZ, 30*HZ, 60, 20, 0}};
128
129 /* These are the min and max parameter values that we will allow to be assigned */
130 int bdflush_min[N_PARAM] = { 0, 1, 0, 0, 0, 1*HZ, 0, 0, 0};
131 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 10000*HZ, 100, 100, 0};
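/*
 * Worked example (added note, not in the original source): with the
 * defaults above (nfract=30, nfract_sync=60), if nr_free_buffer_pages()
 * reports, say, 1000 freeable pages, then balance_dirty_state() below
 * starts waking bdflush once more than ~300 pages worth of buffers are
 * dirty (dirty*100 > tot*nfract), and makes writers throttle themselves
 * once the total passes ~600 dirty pages (the nfract_sync limit).
 */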
132
133 static inline int write_buffer_delay(struct buffer_head *bh)
134 {
135 struct page *page = bh->b_page;
136
137 if (!TryLockPage(page)) {
138 spin_unlock(&lru_list_lock);
139 unlock_buffer(bh);
140 page->mapping->a_ops->writepage(page);
141 return 1;
142 }
143
144 return 0;
145 }
146
147 static inline void write_buffer(struct buffer_head *bh)
148 {
149 if (buffer_delay(bh)) {
150 struct page *page = bh->b_page;
151
152 lock_page(page);
153 if (buffer_delay(bh)) {
154 page->mapping->a_ops->writepage(page);
155 return;
156 }
157 unlock_page(page);
158 }
159
160 ll_rw_block(WRITE, 1, &bh);
161 }
162
163 void fastcall unlock_buffer(struct buffer_head *bh)
164 {
165 clear_bit(BH_Wait_IO, &bh->b_state);
166 clear_bit(BH_Launder, &bh->b_state);
167 /*
168 * When a locked buffer is visible to the I/O layer BH_Launder
169 * is set. This means that before unlocking we must clear BH_Launder,
170 * do an mb() (needed on alpha) and then clear BH_Lock, so no reader can see
171 * BH_Launder set on an unlocked buffer and then risk a deadlock.
172 */
173 smp_mb__after_clear_bit();
174 clear_bit(BH_Lock, &bh->b_state);
175 smp_mb__after_clear_bit();
176 if (waitqueue_active(&bh->b_wait))
177 wake_up(&bh->b_wait);
178 }
179
180 /*
181 * Note that the real wait_on_buffer() is an inline function that checks
182 * that the buffer is locked before calling this, so that unnecessary disk
183 * unplugging does not occur.
184 */
185 void __wait_on_buffer(struct buffer_head * bh)
186 {
187 struct task_struct *tsk = current;
188 DECLARE_WAITQUEUE(wait, tsk);
189
190 get_bh(bh);
191 add_wait_queue(&bh->b_wait, &wait);
192 do {
193 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
194 if (!buffer_locked(bh))
195 break;
196 /*
197 * We must read tq_disk's TQ_ACTIVE state only after the
198 * add_wait_queue effect is visible to other cpus.
199 * We could unplug a few lines above and it wouldn't matter,
200 * but we can't do that right after add_wait_queue
201 * without an smp_mb() in between because spin_unlock
202 * has inclusive semantics.
203 * Doing it here is the most efficient place: we
204 * don't do a spurious unplug if we get a racy
205 * wakeup that makes buffer_locked() return 0, and
206 * doing it here avoids an explicit smp_mb(): we
207 * rely on the implicit one in set_task_state.
208 */
209 run_task_queue(&tq_disk);
210 schedule();
211 } while (buffer_locked(bh));
212 tsk->state = TASK_RUNNING;
213 remove_wait_queue(&bh->b_wait, &wait);
214 put_bh(bh);
215 }
216
217 /*
218 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
219 * unlock the buffer. This is what ll_rw_block uses too.
220 */
221 void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
222 {
223 mark_buffer_uptodate(bh, uptodate);
224 unlock_buffer(bh);
225 put_bh(bh);
226 }
227
228 /*
229 * The buffers have been marked clean and locked. Just submit the dang
230 * things..
231 */
232 static void write_locked_buffers(struct buffer_head **array, unsigned int count)
233 {
234 do {
235 struct buffer_head * bh = *array++;
236 bh->b_end_io = end_buffer_io_sync;
237 submit_bh(WRITE, bh);
238 } while (--count);
239 }
240
241 /*
242 * Write some buffers from the head of the dirty queue.
243 *
244 * This must be called with the LRU lock held, and will
245 * return without it!
246 */
247 #define NRSYNC (32)
248 static int write_some_buffers(kdev_t dev)
249 {
250 struct buffer_head *next;
251 struct buffer_head *array[NRSYNC];
252 unsigned int count;
253 int nr;
254
255 next = lru_list[BUF_DIRTY];
256 nr = nr_buffers_type[BUF_DIRTY];
257 count = 0;
258 while (next && --nr >= 0) {
259 struct buffer_head * bh = next;
260 next = bh->b_next_free;
261
262 if (dev != NODEV && bh->b_dev != dev)
263 continue;
264 if (test_and_set_bit(BH_Lock, &bh->b_state))
265 continue;
266 if (buffer_delay(bh)) {
267 if (write_buffer_delay(bh)) {
268 if (count)
269 write_locked_buffers(array, count);
270 return -EAGAIN;
271 }
272 } else if (atomic_set_buffer_clean(bh)) {
273 __refile_buffer(bh);
274 get_bh(bh);
275 array[count++] = bh;
276 if (count < NRSYNC)
277 continue;
278
279 spin_unlock(&lru_list_lock);
280 write_locked_buffers(array, count);
281 return -EAGAIN;
282 }
283 unlock_buffer(bh);
284 __refile_buffer(bh);
285 }
286 spin_unlock(&lru_list_lock);
287
288 if (count)
289 write_locked_buffers(array, count);
290 return 0;
291 }
292
293 /*
294 * Write out all buffers on the dirty list.
295 */
296 static void write_unlocked_buffers(kdev_t dev)
297 {
298 do
299 spin_lock(&lru_list_lock);
300 while (write_some_buffers(dev));
301 }
302
303 /*
304 * Wait for a buffer on the proper list.
305 *
306 * This must be called with the LRU lock held, and
307 * will return with it released.
308 */
309 static int wait_for_buffers(kdev_t dev, int index, int refile)
310 {
311 struct buffer_head * next;
312 int nr;
313
314 next = lru_list[index];
315 nr = nr_buffers_type[index];
316 while (next && --nr >= 0) {
317 struct buffer_head *bh = next;
318 next = bh->b_next_free;
319
320 if (!buffer_locked(bh)) {
321 if (refile)
322 __refile_buffer(bh);
323 continue;
324 }
325 if (dev != NODEV && bh->b_dev != dev)
326 continue;
327
328 get_bh(bh);
329 spin_unlock(&lru_list_lock);
330 wait_on_buffer (bh);
331 put_bh(bh);
332 return -EAGAIN;
333 }
334 spin_unlock(&lru_list_lock);
335 return 0;
336 }
337
338 static int wait_for_locked_buffers(kdev_t dev, int index, int refile)
339 {
340 do {
341 spin_lock(&lru_list_lock);
342 } while (wait_for_buffers(dev, index, refile));
343 return 0;
344 }
345
346 /* Call sync_buffers with wait!=0 to ensure that the call does not
347 * return until all buffer writes have completed. Sync() may return
348 * before the writes have finished; fsync() may not.
349 */
350
351 /* Godamity-damn. Some buffers (bitmaps for filesystems)
352 * spontaneously dirty themselves without ever brelse being called.
353 * We will ultimately want to put these in a separate list, but for
354 * now we search all of the lists for dirty buffers.
355 */
356 int sync_buffers(kdev_t dev, int wait)
357 {
358 int err = 0;
359
360 /* One pass for no-wait, three for wait:
361 * 0) write out all dirty, unlocked buffers;
362 * 1) wait for all dirty locked buffers;
363 * 2) write out all dirty, unlocked buffers;
364 * 3) wait for completion by waiting for all buffers to unlock.
365 */
366 write_unlocked_buffers(dev);
367 if (wait) {
368 err = wait_for_locked_buffers(dev, BUF_DIRTY, 0);
369 write_unlocked_buffers(dev);
370 err |= wait_for_locked_buffers(dev, BUF_LOCKED, 1);
371 }
372 return err;
373 }
374 EXPORT_SYMBOL(sync_buffers);
375
376 int fsync_super(struct super_block *sb)
377 {
378 kdev_t dev = sb->s_dev;
379 sync_buffers(dev, 0);
380
381 lock_kernel();
382 sync_inodes_sb(sb);
383 DQUOT_SYNC_SB(sb);
384 lock_super(sb);
385 if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
386 sb->s_op->write_super(sb);
387 unlock_super(sb);
388 if (sb->s_op && sb->s_op->sync_fs)
389 sb->s_op->sync_fs(sb);
390 unlock_kernel();
391
392 return sync_buffers(dev, 1);
393 }
394
395 int fsync_no_super(kdev_t dev)
396 {
397 sync_buffers(dev, 0);
398 return sync_buffers(dev, 1);
399 }
400
401 int fsync_dev(kdev_t dev)
402 {
403 sync_buffers(dev, 0);
404
405 lock_kernel();
406 sync_inodes(dev);
407 DQUOT_SYNC_DEV(dev);
408 sync_supers(dev, 1);
409 unlock_kernel();
410
411 return sync_buffers(dev, 1);
412 }
413
414 /*
415 * There's no real reason to pretend we should
416 * ever do anything differently
417 */
418 void sync_dev(kdev_t dev)
419 {
420 fsync_dev(dev);
421 }
422
423 asmlinkage long sys_sync(void)
424 {
425 fsync_dev(0);
426 return 0;
427 }
428
429 /*
430 * filp may be NULL if called via the msync of a vma.
431 */
432
433 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
434 {
435 struct inode * inode = dentry->d_inode;
436 struct super_block * sb;
437 kdev_t dev;
438 int ret;
439
440 lock_kernel();
441 /* sync the inode to buffers */
442 write_inode_now(inode, 0);
443
444 /* sync the superblock to buffers */
445 sb = inode->i_sb;
446 lock_super(sb);
447 if (sb->s_op && sb->s_op->write_super)
448 sb->s_op->write_super(sb);
449 unlock_super(sb);
450
451 /* .. finally sync the buffers to disk */
452 dev = inode->i_dev;
453 ret = sync_buffers(dev, 1);
454 unlock_kernel();
455 return ret;
456 }
457
458 asmlinkage long sys_fsync(unsigned int fd)
459 {
460 struct file * file;
461 struct dentry * dentry;
462 struct inode * inode;
463 int ret, err;
464
465 ret = -EBADF;
466 file = fget(fd);
467 if (!file)
468 goto out;
469
470 dentry = file->f_dentry;
471 inode = dentry->d_inode;
472
473 ret = -EINVAL;
474 if (!file->f_op || !file->f_op->fsync) {
475 /* Why? We can still call filemap_fdatasync */
476 goto out_putf;
477 }
478
479 /* We need to protect against concurrent writers.. */
480 down(&inode->i_sem);
481 ret = filemap_fdatasync(inode->i_mapping);
482 err = file->f_op->fsync(file, dentry, 0);
483 if (err && !ret)
484 ret = err;
485 err = filemap_fdatawait(inode->i_mapping);
486 if (err && !ret)
487 ret = err;
488 up(&inode->i_sem);
489
490 out_putf:
491 fput(file);
492 out:
493 return ret;
494 }
495
496 int do_fdatasync(struct file *file)
497 {
498 int ret, err;
499 struct dentry *dentry;
500 struct inode *inode;
501
502 if (unlikely(!file->f_op || !file->f_op->fsync))
503 return -EINVAL;
504
505 dentry = file->f_dentry;
506 inode = dentry->d_inode;
507
508 ret = filemap_fdatasync(inode->i_mapping);
509 err = file->f_op->fsync(file, dentry, 1);
510 if (err && !ret)
511 ret = err;
512 err = filemap_fdatawait(inode->i_mapping);
513 if (err && !ret)
514 ret = err;
515 return ret;
516 }
517
518 asmlinkage long sys_fdatasync(unsigned int fd)
519 {
520 struct file * file;
521 struct inode *inode;
522 int ret;
523
524 ret = -EBADF;
525 file = fget(fd);
526 if (!file)
527 goto out;
528
529 inode = file->f_dentry->d_inode;
530 down(&inode->i_sem);
531 ret = do_fdatasync(file);
532 up(&inode->i_sem);
533
534 fput(file);
535 out:
536 return ret;
537 }
538
539 /* After several hours of tedious analysis, the following hash
540 * function won. Do not mess with it... -DaveM
541 */
542 #define _hashfn(dev,block) \
543 ((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
544 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \
545 ((block) << (bh_hash_shift - 12))))
546 #define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
547
548 static inline void __insert_into_hash_list(struct buffer_head *bh)
549 {
550 struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
551 struct buffer_head *next = *head;
552
553 *head = bh;
554 bh->b_pprev = head;
555 bh->b_next = next;
556 if (next != NULL)
557 next->b_pprev = &bh->b_next;
558 }
559
560 static __inline__ void __hash_unlink(struct buffer_head *bh)
561 {
562 struct buffer_head **pprev = bh->b_pprev;
563 if (pprev) {
564 struct buffer_head *next = bh->b_next;
565 if (next)
566 next->b_pprev = pprev;
567 *pprev = next;
568 bh->b_pprev = NULL;
569 }
570 }
571
572 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
573 {
574 struct buffer_head **bhp = &lru_list[blist];
575
576 if (bh->b_prev_free || bh->b_next_free) BUG();
577
578 if(!*bhp) {
579 *bhp = bh;
580 bh->b_prev_free = bh;
581 }
582 bh->b_next_free = *bhp;
583 bh->b_prev_free = (*bhp)->b_prev_free;
584 (*bhp)->b_prev_free->b_next_free = bh;
585 (*bhp)->b_prev_free = bh;
586 nr_buffers_type[blist]++;
587 size_buffers_type[blist] += bh->b_size >> 9;
588 }
589
590 static void __remove_from_lru_list(struct buffer_head * bh)
591 {
592 struct buffer_head *next = bh->b_next_free;
593 if (next) {
594 struct buffer_head *prev = bh->b_prev_free;
595 int blist = bh->b_list;
596
597 prev->b_next_free = next;
598 next->b_prev_free = prev;
599 if (lru_list[blist] == bh) {
600 if (next == bh)
601 next = NULL;
602 lru_list[blist] = next;
603 }
604 bh->b_next_free = NULL;
605 bh->b_prev_free = NULL;
606 nr_buffers_type[blist]--;
607 size_buffers_type[blist] -= bh->b_size >> 9;
608 }
609 }
610
611 /* must be called with both the hash_table_lock and the lru_list_lock
612 held */
613 static void __remove_from_queues(struct buffer_head *bh)
614 {
615 __hash_unlink(bh);
616 __remove_from_lru_list(bh);
617 }
618
619 static void remove_from_queues(struct buffer_head *bh)
620 {
621 spin_lock(&lru_list_lock);
622 write_lock(&hash_table_lock);
623 __remove_from_queues(bh);
624 write_unlock(&hash_table_lock);
625 spin_unlock(&lru_list_lock);
626 }
627
628 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
629 {
630 struct buffer_head *bh, **p = &hash(dev, block);
631
632 read_lock(&hash_table_lock);
633
634 for (;;) {
635 bh = *p;
636 if (!bh)
637 break;
638 p = &bh->b_next;
639 if (bh->b_blocknr != block)
640 continue;
641 if (bh->b_size != size)
642 continue;
643 if (bh->b_dev != dev)
644 continue;
645 get_bh(bh);
646 break;
647 }
648
649 read_unlock(&hash_table_lock);
650 return bh;
651 }
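/*
 * Usage sketch (added, not part of the original file): a successful
 * lookup returns the buffer with an extra reference taken via get_bh(),
 * so the caller must drop it with brelse() when done:
 *
 *	struct buffer_head *bh = get_hash_table(dev, block, size);
 *	if (bh) {
 *		... block is already in the buffer cache (not
 *		    necessarily uptodate), use bh->b_data ...
 *		brelse(bh);
 *	}
 */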
652
653 void fastcall buffer_insert_list(struct buffer_head *bh, struct list_head *list)
654 {
655 spin_lock(&lru_list_lock);
656 if (buffer_attached(bh))
657 list_del(&bh->b_inode_buffers);
658 set_buffer_attached(bh);
659 list_add_tail(&bh->b_inode_buffers, list);
660 spin_unlock(&lru_list_lock);
661 }
662
663 /*
664 * The caller must have the lru_list lock before calling the
665 * remove_inode_queue functions.
666 */
667 static void __remove_inode_queue(struct buffer_head *bh)
668 {
669 list_del(&bh->b_inode_buffers);
670 clear_buffer_attached(bh);
671 }
672
673 static inline void remove_inode_queue(struct buffer_head *bh)
674 {
675 if (buffer_attached(bh))
676 __remove_inode_queue(bh);
677 }
678
679 int inode_has_buffers(struct inode *inode)
680 {
681 int ret;
682
683 spin_lock(&lru_list_lock);
684 ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
685 spin_unlock(&lru_list_lock);
686
687 return ret;
688 }
689
690 /* If invalidate_buffers() will trash dirty buffers, it means some kind
691 of fs corruption is going on. Trashing dirty data always implies losing
692 information that was supposed to be just stored on the physical layer
693 by the user.
694
695 Thus invalidate_buffers in general usage is not allowed to trash
696 dirty buffers. For example ioctl(BLKFLSBUF) expects dirty data to
697 be preserved. These buffers are simply skipped.
698
699 We also skip buffers which are still in use. For example this can
700 happen if a userspace program is reading the block device.
701
702 NOTE: In the case where the user removed a removable-media disk even though
703 there's still dirty data not synced to disk (due to a bug in the device driver
704 or to an error by the user), by not destroying the dirty buffers we could
705 generate corruption also on the next media inserted; thus a parameter is
706 necessary to handle this case in the safest way possible (trying
707 not to corrupt the newly inserted disk with data belonging to
708 the old, now corrupted disk). Also, for the ramdisk the natural thing
709 to do in order to release the ramdisk memory is to destroy dirty buffers.
710 
711 These are two special cases. Normal usage implies that the device driver
712 issues a sync on the device (without waiting for I/O completion) and
713 then an invalidate_buffers call that doesn't trash dirty buffers.
714
715 For handling cache coherency with the blkdev pagecache the 'update' case
716 has been introduced. It is needed to re-read from disk any pinned
717 buffer. NOTE: re-reading from disk is destructive so we can do it only
718 when we assume nobody is changing the buffercache under our I/O and when
719 we think the disk contains more recent information than the buffercache.
720 The update == 1 pass marks the buffers we need to update, the update == 2
721 pass does the actual I/O. */
722 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
723 {
724 int i, nlist, slept;
725 struct buffer_head * bh, * bh_next;
726 kdev_t dev = to_kdev_t(bdev->bd_dev); /* will become bdev */
727
728 retry:
729 slept = 0;
730 spin_lock(&lru_list_lock);
731 for(nlist = 0; nlist < NR_LIST; nlist++) {
732 bh = lru_list[nlist];
733 if (!bh)
734 continue;
735 for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
736 bh_next = bh->b_next_free;
737
738 /* Another device? */
739 if (bh->b_dev != dev)
740 continue;
741 /* Not hashed? */
742 if (!bh->b_pprev)
743 continue;
744 if (buffer_locked(bh)) {
745 get_bh(bh);
746 spin_unlock(&lru_list_lock);
747 wait_on_buffer(bh);
748 slept = 1;
749 spin_lock(&lru_list_lock);
750 put_bh(bh);
751 }
752
753 write_lock(&hash_table_lock);
754 /* All buffers in the lru lists are mapped */
755 if (!buffer_mapped(bh))
756 BUG();
757 if (buffer_dirty(bh) && destroy_dirty_buffers)
758 printk("invalidate: dirty buffer\n");
759 if (!atomic_read(&bh->b_count)) {
760 if (destroy_dirty_buffers || !buffer_dirty(bh)) {
761 remove_inode_queue(bh);
762 }
763 } else if (!bdev->bd_openers)
764 printk("invalidate: busy buffer\n");
765
766 write_unlock(&hash_table_lock);
767 if (slept)
768 goto out;
769 }
770 }
771 out:
772 spin_unlock(&lru_list_lock);
773 if (slept)
774 goto retry;
775
776 /* Get rid of the page cache */
777 invalidate_inode_pages(bdev->bd_inode);
778 }
779
780 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
781 {
782 struct block_device *bdev = bdget(dev);
783 if (bdev) {
784 invalidate_bdev(bdev, destroy_dirty_buffers);
785 bdput(bdev);
786 }
787 }
788
789 static void free_more_memory(void)
790 {
791 balance_dirty();
792 wakeup_bdflush();
793 try_to_free_pages(GFP_NOIO);
794 run_task_queue(&tq_disk);
795 yield();
796 }
797
798 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
799 {
800 bh->b_list = BUF_CLEAN;
801 bh->b_end_io = handler;
802 bh->b_private = private;
803 }
804
805 void end_buffer_io_async(struct buffer_head * bh, int uptodate)
806 {
807 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
808 unsigned long flags;
809 struct buffer_head *tmp;
810 struct page *page;
811 int fullup = 1;
812
813 mark_buffer_uptodate(bh, uptodate);
814
815 /* This is a temporary buffer used for page I/O. */
816 page = bh->b_page;
817
818 if (!uptodate)
819 SetPageError(page);
820
821 /*
822 * Be _very_ careful from here on. Bad things can happen if
823 * two buffer heads end IO at almost the same time and both
824 * decide that the page is now completely done.
825 *
826 * Async buffer_heads are here only as labels for IO, and get
827 * thrown away once the IO for this page is complete. IO is
828 * deemed complete once all buffers have been visited
829 * (b_count==0) and are now unlocked. We must make sure that
830 * only the _last_ buffer that decrements its count is the one
831 * that unlocks the page..
832 */
833 spin_lock_irqsave(&page_uptodate_lock, flags);
834 mark_buffer_async(bh, 0);
835 unlock_buffer(bh);
836 tmp = bh->b_this_page;
837 while (tmp != bh) {
838 if (buffer_locked(tmp)) {
839 if (buffer_async(tmp))
840 goto still_busy;
841 } else if (!buffer_uptodate(tmp))
842 fullup = 0;
843 tmp = tmp->b_this_page;
844 }
845
846 /* OK, the async IO on this page is complete. */
847 spin_unlock_irqrestore(&page_uptodate_lock, flags);
848
849 /*
850 * If none of the buffers had errors and all were uptodate
851 * then we can set the page uptodate:
852 */
853 if (fullup && !PageError(page))
854 SetPageUptodate(page);
855
856 UnlockPage(page);
857
858 return;
859
860 still_busy:
861 spin_unlock_irqrestore(&page_uptodate_lock, flags);
862 return;
863 }
864
865 inline void set_buffer_async_io(struct buffer_head *bh)
866 {
867 bh->b_end_io = end_buffer_io_async;
868 mark_buffer_async(bh, 1);
869 }
870
871 /*
872 * Synchronise all the inode's dirty buffers to the disk.
873 *
874 * We have conflicting pressures: we want to make sure that all
875 * initially dirty buffers get waited on, but that any subsequently
876 * dirtied buffers don't. After all, we don't want fsync to last
877 * forever if somebody is actively writing to the file.
878 *
879 * Do this in two main stages: first we copy dirty buffers to a
880 * temporary inode list, queueing the writes as we go. Then we clean
881 * up, waiting for those writes to complete.
882 *
883 * During this second stage, any subsequent updates to the file may end
884 * up refiling the buffer on the original inode's dirty list again, so
885 * there is a chance we will end up with a buffer queued for write but
886 * not yet completed on that list. So, as a final cleanup we go through
887 * the osync code to catch these locked, dirty buffers without requeuing
888 * any newly dirty buffers for write.
889 */
890 int fsync_buffers_list(struct list_head *list)
891 {
892 struct buffer_head *bh;
893 struct list_head tmp;
894 int err = 0, err2;
895
896 INIT_LIST_HEAD(&tmp);
897
898 spin_lock(&lru_list_lock);
899
900 while (!list_empty(list)) {
901 bh = BH_ENTRY(list->next);
902 list_del(&bh->b_inode_buffers);
903 if (!buffer_dirty(bh) && !buffer_locked(bh))
904 clear_buffer_attached(bh);
905 else {
906 set_buffer_attached(bh);
907 list_add(&bh->b_inode_buffers, &tmp);
908 if (buffer_dirty(bh)) {
909 get_bh(bh);
910 spin_unlock(&lru_list_lock);
911 /*
912 * Wait for I/O completion before submitting
913 * the buffer, to be sure the write will
914 * be effective on the latest data in
915 * the buffer. (otherwise - if there's old
916 * I/O in flight - write_buffer would become
917 * a noop)
918 */
919 wait_on_buffer(bh);
920 write_buffer(bh);
921 brelse(bh);
922 spin_lock(&lru_list_lock);
923 }
924 }
925 }
926
927 while (!list_empty(&tmp)) {
928 bh = BH_ENTRY(tmp.prev);
929 remove_inode_queue(bh);
930 get_bh(bh);
931 spin_unlock(&lru_list_lock);
932 wait_on_buffer(bh);
933 if (!buffer_uptodate(bh))
934 err = -EIO;
935 brelse(bh);
936 spin_lock(&lru_list_lock);
937 }
938
939 spin_unlock(&lru_list_lock);
940 err2 = osync_buffers_list(list);
941
942 if (err)
943 return err;
944 else
945 return err2;
946 }
947
948 /*
949 * osync is designed to support O_SYNC io. It waits synchronously for
950 * all already-submitted IO to complete, but does not queue any new
951 * writes to the disk.
952 *
953 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
954 * you dirty the buffers, and then use osync_buffers_list to wait for
955 * completion. Any other dirty buffers which are not yet queued for
956 * write will not be flushed to disk by the osync.
957 */
958 static int osync_buffers_list(struct list_head *list)
959 {
960 struct buffer_head *bh;
961 struct list_head *p;
962 int err = 0;
963
964 spin_lock(&lru_list_lock);
965
966 repeat:
967 list_for_each_prev(p, list) {
968 bh = BH_ENTRY(p);
969 if (buffer_locked(bh)) {
970 get_bh(bh);
971 spin_unlock(&lru_list_lock);
972 wait_on_buffer(bh);
973 if (!buffer_uptodate(bh))
974 err = -EIO;
975 brelse(bh);
976 spin_lock(&lru_list_lock);
977 goto repeat;
978 }
979 }
980
981 spin_unlock(&lru_list_lock);
982 return err;
983 }
984
985 /*
986 * Invalidate any and all dirty buffers on a given inode. We are
987 * probably unmounting the fs, but that doesn't mean we have already
988 * done a sync(). Just drop the buffers from the inode list.
989 */
990 void invalidate_inode_buffers(struct inode *inode)
991 {
992 struct list_head * entry;
993
994 spin_lock(&lru_list_lock);
995 while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
996 remove_inode_queue(BH_ENTRY(entry));
997 while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
998 remove_inode_queue(BH_ENTRY(entry));
999 spin_unlock(&lru_list_lock);
1000 }
1001
1002
1003 /*
1004 * Ok, this is getblk, and it isn't very clear, again to hinder
1005 * race-conditions. Most of the code is seldom used, (ie repeating),
1006 * so it should be much more efficient than it looks.
1007 *
1008 * The algorithm is changed: hopefully better, and an elusive bug removed.
1009 *
1010 * 14.02.92: changed it to sync dirty buffers a bit: better performance
1011 * when the filesystem starts to get full of dirty blocks (I hope).
1012 */
1013 struct buffer_head * getblk(kdev_t dev, int block, int size)
1014 {
1015 for (;;) {
1016 struct buffer_head * bh;
1017
1018 bh = get_hash_table(dev, block, size);
1019 if (bh) {
1020 touch_buffer(bh);
1021 return bh;
1022 }
1023
1024 if (!grow_buffers(dev, block, size))
1025 free_more_memory();
1026 }
1027 }
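/*
 * Usage sketch (added, not part of the original file): a typical 2.4
 * metadata update pairs getblk() with mark_buffer_dirty() and brelse();
 * "blocknr" and "blocksize" are placeholders here, and the block is
 * fully overwritten so it can be marked uptodate without reading it:
 *
 *	struct buffer_head *bh = getblk(dev, blocknr, blocksize);
 *	memset(bh->b_data, 0, blocksize);
 *	... fill in the new contents ...
 *	mark_buffer_uptodate(bh, 1);
 *	mark_buffer_dirty(bh);		(bdflush/kupdate writes it out later)
 *	brelse(bh);
 */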
1028
1029 /* -1 -> no need to flush
1030 0 -> async flush
1031 1 -> sync flush (wait for I/O completion) */
1032 static int balance_dirty_state(void)
1033 {
1034 unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
1035
1036 dirty = size_buffers_type[BUF_DIRTY] >> (PAGE_SHIFT - 9);
1037 tot = nr_free_buffer_pages();
1038
1039 dirty *= 100;
1040 soft_dirty_limit = tot * bdf_prm.b_un.nfract;
1041 hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;
1042
1043 /* First, check for the "real" dirty limit. */
1044 if (dirty > soft_dirty_limit) {
1045 if (dirty > hard_dirty_limit && !(current->flags & PF_NOIO))
1046 return 1;
1047 return 0;
1048 }
1049
1050 return -1;
1051 }
1052
1053 static int bdflush_stop(void)
1054 {
1055 unsigned long dirty, tot, dirty_limit;
1056
1057 dirty = size_buffers_type[BUF_DIRTY] >> (PAGE_SHIFT - 9);
1058 tot = nr_free_buffer_pages();
1059
1060 dirty *= 100;
1061 dirty_limit = tot * bdf_prm.b_un.nfract_stop_bdflush;
1062
1063 if (!laptop_mode && dirty > dirty_limit)
1064 return 0;
1065 return 1;
1066 }
1067
1068 /*
1069 * if a new dirty buffer is created we need to balance bdflush.
1070 *
1071 * in the future we might want to make bdflush aware of different
1072 * pressures on different devices - thus the (currently unused)
1073 * 'dev' parameter.
1074 */
1075 void balance_dirty(void)
1076 {
1077 int state = balance_dirty_state();
1078
1079 if (state < 0)
1080 return;
1081
1082 wakeup_bdflush();
1083
1084 /*
1085 * And if we're _really_ out of balance, wait for
1086 * some of the dirty/locked buffers ourselves.
1087 * This will throttle heavy writers.
1088 */
1089 if (state > 0) {
1090 spin_lock(&lru_list_lock);
1091 write_some_buffers(NODEV);
1092 }
1093 }
1094 EXPORT_SYMBOL(balance_dirty);
1095
1096 inline void fastcall __mark_dirty(struct buffer_head *bh)
1097 {
1098 bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1099 refile_buffer(bh);
1100 }
1101
1102 /* atomic version, the user must call balance_dirty() by hand
1103 as soon as it becomes possible to block */
1104 void fastcall __mark_buffer_dirty(struct buffer_head *bh)
1105 {
1106 if (!atomic_set_buffer_dirty(bh))
1107 __mark_dirty(bh);
1108 }
1109
1110 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1111 {
1112 if (!atomic_set_buffer_dirty(bh)) {
1113 if (block_dump)
1114 printk("%s: dirtied buffer\n", current->comm);
1115 __mark_dirty(bh);
1116 balance_dirty();
1117 }
1118 }
1119
1120 void set_buffer_flushtime(struct buffer_head *bh)
1121 {
1122 bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1123 }
1124 EXPORT_SYMBOL(set_buffer_flushtime);
1125
1126 int get_buffer_flushtime(void)
1127 {
1128 return bdf_prm.b_un.interval;
1129 }
1130 EXPORT_SYMBOL(get_buffer_flushtime);
1131
1132 /*
1133 * A buffer may need to be moved from one buffer list to another
1134 * (e.g. in case it is not shared any more). Handle this.
1135 */
1136 static void __refile_buffer(struct buffer_head *bh)
1137 {
1138 int dispose = BUF_CLEAN;
1139 if (buffer_locked(bh))
1140 dispose = BUF_LOCKED;
1141 if (buffer_dirty(bh))
1142 dispose = BUF_DIRTY;
1143 if (dispose != bh->b_list) {
1144 __remove_from_lru_list(bh);
1145 bh->b_list = dispose;
1146 if (dispose == BUF_CLEAN)
1147 remove_inode_queue(bh);
1148 __insert_into_lru_list(bh, dispose);
1149 }
1150 }
1151
1152 void refile_buffer(struct buffer_head *bh)
1153 {
1154 spin_lock(&lru_list_lock);
1155 __refile_buffer(bh);
1156 spin_unlock(&lru_list_lock);
1157 }
1158
1159 /*
1160 * Release a buffer head
1161 */
1162 void __brelse(struct buffer_head * buf)
1163 {
1164 if (atomic_read(&buf->b_count)) {
1165 put_bh(buf);
1166 return;
1167 }
1168 printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1169 }
1170
1171 /*
1172 * bforget() is like brelse(), except it discards any
1173 * potentially dirty data.
1174 */
1175 void __bforget(struct buffer_head * buf)
1176 {
1177 mark_buffer_clean(buf);
1178 __brelse(buf);
1179 }
1180
1181 /**
1182 * bread() - reads a specified block and returns the bh
1183 * @block: number of block
1184 * @size: size (in bytes) to read
1185 *
1186 * Reads a specified block, and returns buffer head that
1187 * contains it. It returns NULL if the block was unreadable.
1188 */
1189 struct buffer_head * bread(kdev_t dev, int block, int size)
1190 {
1191 struct buffer_head * bh;
1192
1193 bh = getblk(dev, block, size);
1194 if (buffer_uptodate(bh))
1195 return bh;
1196 set_bit(BH_Sync, &bh->b_state);
1197 ll_rw_block(READ, 1, &bh);
1198 wait_on_buffer(bh);
1199 if (buffer_uptodate(bh))
1200 return bh;
1201 brelse(bh);
1202 return NULL;
1203 }
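/*
 * Usage sketch (added, not part of the original file): the common
 * synchronous read path in a filesystem, with "sb_block" a placeholder
 * block number:
 *
 *	struct buffer_head *bh = bread(dev, sb_block, blocksize);
 *	if (!bh)
 *		return -EIO;		(the block was unreadable)
 *	... parse bh->b_data ...
 *	brelse(bh);
 */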
1204
1205 /*
1206 * Note: the caller should wake up the buffer_wait list if needed.
1207 */
1208 static void __put_unused_buffer_head(struct buffer_head * bh)
1209 {
1210 if (unlikely(buffer_attached(bh)))
1211 BUG();
1212 if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1213 kmem_cache_free(bh_cachep, bh);
1214 } else {
1215 bh->b_dev = B_FREE;
1216 bh->b_blocknr = -1;
1217 bh->b_this_page = NULL;
1218
1219 nr_unused_buffer_heads++;
1220 bh->b_next_free = unused_list;
1221 unused_list = bh;
1222 }
1223 }
1224
1225 void put_unused_buffer_head(struct buffer_head *bh)
1226 {
1227 spin_lock(&unused_list_lock);
1228 __put_unused_buffer_head(bh);
1229 spin_unlock(&unused_list_lock);
1230 }
1231 EXPORT_SYMBOL(put_unused_buffer_head);
1232
1233 /*
1234 * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1235 * no-buffer-head deadlock. Return NULL on failure; waiting for
1236 * buffer heads is now handled in create_buffers().
1237 */
1238 struct buffer_head * get_unused_buffer_head(int async)
1239 {
1240 struct buffer_head * bh;
1241
1242 spin_lock(&unused_list_lock);
1243 if (nr_unused_buffer_heads > NR_RESERVED) {
1244 bh = unused_list;
1245 unused_list = bh->b_next_free;
1246 nr_unused_buffer_heads--;
1247 spin_unlock(&unused_list_lock);
1248 return bh;
1249 }
1250 spin_unlock(&unused_list_lock);
1251
1252 /* This is critical. We can't call out to the FS
1253 * to get more buffer heads, because the FS may need
1254 * more buffer-heads itself. Thus SLAB_NOFS.
1255 */
1256 if((bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS)) != NULL) {
1257 bh->b_blocknr = -1;
1258 bh->b_this_page = NULL;
1259 return bh;
1260 }
1261
1262 /*
1263 * If we need an async buffer, use the reserved buffer heads.
1264 * Non-PF_MEMALLOC tasks can just loop in create_buffers().
1265 */
1266 if (async && (current->flags & PF_MEMALLOC)) {
1267 spin_lock(&unused_list_lock);
1268 if (unused_list) {
1269 bh = unused_list;
1270 unused_list = bh->b_next_free;
1271 nr_unused_buffer_heads--;
1272 spin_unlock(&unused_list_lock);
1273 return bh;
1274 }
1275 spin_unlock(&unused_list_lock);
1276 }
1277
1278 return NULL;
1279 }
1280 EXPORT_SYMBOL(get_unused_buffer_head);
1281
1282 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1283 {
1284 if (offset >= PAGE_SIZE)
1285 BUG();
1286
1287 if (PageHighMem(page)) {
1288 bh->b_data = (char *)offset;
1289 } else {
1290 bh->b_data = page_address(page) + offset;
1291 }
1292 bh->b_page = page;
1293 }
1294 EXPORT_SYMBOL(set_bh_page);
1295
1296 /*
1297 * Create the appropriate buffers when given a page for data area and
1298 * the size of each buffer.. Use the bh->b_this_page linked list to
1299 * follow the buffers created. Return NULL if unable to create more
1300 * buffers.
1301 * The async flag is used to differentiate async IO (paging, swapping)
1302 * from ordinary buffer allocations, and only async requests are allowed
1303 * to sleep waiting for buffer heads.
1304 */
1305 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1306 {
1307 struct buffer_head *bh, *head;
1308 long offset;
1309
1310 try_again:
1311 head = NULL;
1312 offset = PAGE_SIZE;
1313 while ((offset -= size) >= 0) {
1314 bh = get_unused_buffer_head(async);
1315 if (!bh)
1316 goto no_grow;
1317
1318 bh->b_dev = NODEV;
1319 bh->b_this_page = head;
1320 head = bh;
1321
1322 bh->b_state = 0;
1323 bh->b_next_free = NULL;
1324 bh->b_pprev = NULL;
1325 atomic_set(&bh->b_count, 0);
1326 bh->b_size = size;
1327
1328 set_bh_page(bh, page, offset);
1329
1330 bh->b_list = BUF_CLEAN;
1331 bh->b_end_io = NULL;
1332 }
1333 return head;
1334 /*
1335 * In case anything failed, we just free everything we got.
1336 */
1337 no_grow:
1338 if (head) {
1339 spin_lock(&unused_list_lock);
1340 do {
1341 bh = head;
1342 head = head->b_this_page;
1343 __put_unused_buffer_head(bh);
1344 } while (head);
1345 spin_unlock(&unused_list_lock);
1346
1347 /* Wake up any waiters ... */
1348 wake_up(&buffer_wait);
1349 }
1350
1351 /*
1352 * Return failure for non-async IO requests. Async IO requests
1353 * are not allowed to fail, so we have to wait until buffer heads
1354 * become available. But we don't want tasks sleeping with
1355 * partially complete buffers, so all were released above.
1356 */
1357 if (!async)
1358 return NULL;
1359
1360 /* We're _really_ low on memory. Now we just
1361 * wait for old buffer heads to become free due to
1362 * finishing IO. Since this is an async request and
1363 * the reserve list is empty, we're sure there are
1364 * async buffer heads in use.
1365 */
1366 run_task_queue(&tq_disk);
1367
1368 free_more_memory();
1369 goto try_again;
1370 }
1371
1372 /*
1373 * Called when truncating a buffer on a page completely.
1374 */
1375 static void discard_buffer(struct buffer_head * bh)
1376 {
1377 if (buffer_mapped(bh) || buffer_delay(bh)) {
1378 mark_buffer_clean(bh);
1379 lock_buffer(bh);
1380 clear_bit(BH_Uptodate, &bh->b_state);
1381 clear_bit(BH_Mapped, &bh->b_state);
1382 clear_bit(BH_Req, &bh->b_state);
1383 clear_bit(BH_New, &bh->b_state);
1384 clear_bit(BH_Delay, &bh->b_state);
1385 remove_from_queues(bh);
1386 unlock_buffer(bh);
1387 }
1388 }
1389
1390 /**
1391 * try_to_release_page - release old fs-specific metadata on a page
1392 *
1393 */
1394
1395 int try_to_release_page(struct page * page, int gfp_mask)
1396 {
1397 if (!PageLocked(page))
1398 BUG();
1399
1400 if (!page->mapping)
1401 goto try_to_free;
1402 if (!page->mapping->a_ops->releasepage)
1403 goto try_to_free;
1404 if (page->mapping->a_ops->releasepage(page, gfp_mask))
1405 goto try_to_free;
1406 /*
1407 * We couldn't release buffer metadata; don't even bother trying
1408 * to release buffers.
1409 */
1410 return 0;
1411 try_to_free:
1412 return try_to_free_buffers(page, gfp_mask);
1413 }
1414
1415 /*
1416 * We don't have to release all buffers here, but
1417 * we have to be sure that no dirty buffer is left
1418 * and no IO is going on (no buffer is locked), because
1419 * we have truncated the file and are going to free the
1420 * blocks on-disk..
1421 */
1422 int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
1423 {
1424 struct buffer_head *head, *bh, *next;
1425 unsigned int curr_off = 0;
1426
1427 if (!PageLocked(page))
1428 BUG();
1429 if (!page->buffers)
1430 return 1;
1431
1432 head = page->buffers;
1433 bh = head;
1434 do {
1435 unsigned int next_off = curr_off + bh->b_size;
1436 next = bh->b_this_page;
1437
1438 /*
1439 * is this block fully flushed?
1440 */
1441 if (offset <= curr_off)
1442 discard_buffer(bh);
1443 curr_off = next_off;
1444 bh = next;
1445 } while (bh != head);
1446
1447 /*
1448 * subtle. We release buffer-heads only if this is
1449 * the 'final' flushpage. We have invalidated the get_block
1450 * cached value unconditionally, so real IO is not
1451 * possible anymore.
1452 *
1453 * If the free doesn't work out, the buffers can be
1454 * left around - they just turn into anonymous buffers
1455 * instead.
1456 */
1457 if (!offset) {
1458 if (!try_to_release_page(page, 0))
1459 return 0;
1460 }
1461
1462 return 1;
1463 }
1464
1465 void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
1466 {
1467 struct buffer_head *bh, *head, *tail;
1468
1469 /* FIXME: create_buffers should fail if there's not enough memory */
1470 head = create_buffers(page, blocksize, 1);
1471 if (page->buffers)
1472 BUG();
1473
1474 bh = head;
1475 do {
1476 bh->b_dev = dev;
1477 bh->b_blocknr = 0;
1478 bh->b_end_io = NULL;
1479 tail = bh;
1480 bh = bh->b_this_page;
1481 } while (bh);
1482 tail->b_this_page = head;
1483 page->buffers = head;
1484 page_cache_get(page);
1485 }
1486 EXPORT_SYMBOL(create_empty_buffers);
1487
1488 /*
1489 * We are taking a block for data and we don't want any output from any
1490 * buffer-cache aliases starting from the return from this function and
1491 * until the moment when something explicitly marks the buffer
1492 * dirty (hopefully that will not happen until we free that block ;-)
1493 * We don't even need to mark it not-uptodate - nobody can expect
1494 * anything from a newly allocated buffer anyway. We used to use
1495 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1496 * don't want to mark the alias unmapped, for example - it would confuse
1497 * anyone who might pick it with bread() afterwards...
1498 */
1499
1500 static void unmap_underlying_metadata(struct buffer_head * bh)
1501 {
1502 struct buffer_head *old_bh;
1503
1504 old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1505 if (old_bh) {
1506 mark_buffer_clean(old_bh);
1507 wait_on_buffer(old_bh);
1508 clear_bit(BH_Req, &old_bh->b_state);
1509 __brelse(old_bh);
1510 }
1511 }
1512
1513 /*
1514 * NOTE! All mapped/uptodate combinations are valid:
1515 *
1516 * Mapped Uptodate Meaning
1517 *
1518 * No No "unknown" - must do get_block()
1519 * No Yes "hole" - zero-filled
1520 * Yes No "allocated" - allocated on disk, not read in
1521 * Yes Yes "valid" - allocated and up-to-date in memory.
1522 *
1523 * "Dirty" is valid only with the last case (mapped+uptodate).
1524 */
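/*
 * Added example of the table above: a hole read in by
 * block_read_full_page() ends up unmapped but uptodate (it is simply
 * zero-filled in memory), while a block freshly allocated by
 * get_block(..., create) in __block_prepare_write() is mapped but not
 * uptodate until the new data has been copied in and committed.
 */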
1525
1526 /*
1527 * block_write_full_page() is SMP threaded - the kernel lock is not held.
1528 */
1529 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1530 {
1531 int err, i;
1532 unsigned long block;
1533 struct buffer_head *bh, *head;
1534 int need_unlock;
1535
1536 if (!PageLocked(page))
1537 BUG();
1538
1539 if (!page->buffers)
1540 create_empty_buffers(page, inode->i_dev, 1 << inode->i_blkbits);
1541 head = page->buffers;
1542
1543 block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1544
1545 bh = head;
1546 i = 0;
1547
1548 /* Stage 1: make sure we have all the buffers mapped! */
1549 do {
1550 /*
1551 * If the buffer isn't up-to-date, we can't be sure
1552 * that the buffer has been initialized with the proper
1553 * block number information etc..
1554 *
1555 * Leave it to the low-level FS to make all those
1556 * decisions (block #0 may actually be a valid block)
1557 */
1558 if (!buffer_mapped(bh)) {
1559 err = get_block(inode, block, bh, 1);
1560 if (err)
1561 goto out;
1562 if (buffer_new(bh))
1563 unmap_underlying_metadata(bh);
1564 }
1565 bh = bh->b_this_page;
1566 block++;
1567 } while (bh != head);
1568
1569 /* Stage 2: lock the buffers, mark them clean */
1570 do {
1571 lock_buffer(bh);
1572 set_buffer_async_io(bh);
1573 set_bit(BH_Uptodate, &bh->b_state);
1574 clear_bit(BH_Dirty, &bh->b_state);
1575 bh = bh->b_this_page;
1576 } while (bh != head);
1577
1578 /* Stage 3: submit the IO */
1579 do {
1580 struct buffer_head *next = bh->b_this_page;
1581 submit_bh(WRITE, bh);
1582 bh = next;
1583 } while (bh != head);
1584
1585 /* Done - end_buffer_io_async will unlock */
1586 SetPageUptodate(page);
1587
1588 wakeup_page_waiters(page);
1589
1590 return 0;
1591
1592 out:
1593 /*
1594 * ENOSPC, or some other error. We may already have added some
1595 * blocks to the file, so we need to write these out to avoid
1596 * exposing stale data.
1597 */
1598 ClearPageUptodate(page);
1599 bh = head;
1600 need_unlock = 1;
1601 /* Recovery: lock and submit the mapped buffers */
1602 do {
1603 if (buffer_mapped(bh)) {
1604 lock_buffer(bh);
1605 set_buffer_async_io(bh);
1606 need_unlock = 0;
1607 }
1608 bh = bh->b_this_page;
1609 } while (bh != head);
1610 do {
1611 struct buffer_head *next = bh->b_this_page;
1612 if (buffer_mapped(bh)) {
1613 set_bit(BH_Uptodate, &bh->b_state);
1614 clear_bit(BH_Dirty, &bh->b_state);
1615 submit_bh(WRITE, bh);
1616 }
1617 bh = next;
1618 } while (bh != head);
1619 if (need_unlock)
1620 UnlockPage(page);
1621 wakeup_page_waiters(page);
1622 return err;
1623 }
1624
1625 static int __block_prepare_write(struct inode *inode, struct page *page,
1626 unsigned from, unsigned to, get_block_t *get_block)
1627 {
1628 unsigned block_start, block_end;
1629 unsigned long block;
1630 int err = 0;
1631 unsigned blocksize, bbits;
1632 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1633 char *kaddr = kmap(page);
1634
1635 blocksize = 1 << inode->i_blkbits;
1636 if (!page->buffers)
1637 create_empty_buffers(page, inode->i_dev, blocksize);
1638 head = page->buffers;
1639
1640 bbits = inode->i_blkbits;
1641 block = page->index << (PAGE_CACHE_SHIFT - bbits);
1642
1643 for(bh = head, block_start = 0; bh != head || !block_start;
1644 block++, block_start=block_end, bh = bh->b_this_page) {
1645 if (!bh)
1646 BUG();
1647 block_end = block_start+blocksize;
1648 if (block_end <= from)
1649 continue;
1650 if (block_start >= to)
1651 break;
1652 clear_bit(BH_New, &bh->b_state);
1653 if (!buffer_mapped(bh)) {
1654 err = get_block(inode, block, bh, 1);
1655 if (err)
1656 goto out;
1657 if (buffer_new(bh)) {
1658 unmap_underlying_metadata(bh);
1659 if (Page_Uptodate(page)) {
1660 set_bit(BH_Uptodate, &bh->b_state);
1661 continue;
1662 }
1663 if (block_end > to)
1664 memset(kaddr+to, 0, block_end-to);
1665 if (block_start < from)
1666 memset(kaddr+block_start, 0, from-block_start);
1667 if (block_end > to || block_start < from)
1668 flush_dcache_page(page);
1669 continue;
1670 }
1671 }
1672 if (Page_Uptodate(page)) {
1673 set_bit(BH_Uptodate, &bh->b_state);
1674 continue;
1675 }
1676 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1677 (block_start < from || block_end > to)) {
1678 ll_rw_block(READ, 1, &bh);
1679 *wait_bh++=bh;
1680 }
1681 }
1682 /*
1683 * If we issued read requests - let them complete.
1684 */
1685 while(wait_bh > wait) {
1686 wait_on_buffer(*--wait_bh);
1687 if (!buffer_uptodate(*wait_bh))
1688 return -EIO;
1689 }
1690 return 0;
1691 out:
1692 /*
1693 * Zero out any newly allocated blocks to avoid exposing stale
1694 * data. If BH_New is set, we know that the block was newly
1695 * allocated in the above loop.
1696 *
1697 * Details: the buffer can be new and uptodate because:
1698 * 1) hole in an uptodate page: get_block(create) allocates the block,
1699 * so the buffer is new and additionally we also mark it uptodate
1700 * 2) The buffer is not mapped and uptodate due to a previous partial read.
1701 *
1702 * We can always ignore uptodate buffers here; if you mark a buffer
1703 * uptodate you must make sure it contains the right data first.
1704 *
1705 * We must stop the "undo/clear" fixup pass not at the caller's "to"
1706 * but at the last block that we successfully reached in the main loop.
1707 */
1708 bh = head;
1709 to = block_start; /* stop at the last successfully handled block */
1710 block_start = 0;
1711 do {
1712 block_end = block_start+blocksize;
1713 if (block_end <= from)
1714 goto next_bh;
1715 if (block_start >= to)
1716 break;
1717 if (buffer_new(bh) && !buffer_uptodate(bh)) {
1718 memset(kaddr+block_start, 0, bh->b_size);
1719 flush_dcache_page(page);
1720 set_bit(BH_Uptodate, &bh->b_state);
1721 mark_buffer_dirty(bh);
1722 }
1723 next_bh:
1724 block_start = block_end;
1725 bh = bh->b_this_page;
1726 } while (bh != head);
1727 return err;
1728 }
1729
1730 static int __block_commit_write(struct inode *inode, struct page *page,
1731 unsigned from, unsigned to)
1732 {
1733 unsigned block_start, block_end;
1734 int partial = 0, need_balance_dirty = 0;
1735 unsigned blocksize;
1736 struct buffer_head *bh, *head;
1737
1738 blocksize = 1 << inode->i_blkbits;
1739
1740 for(bh = head = page->buffers, block_start = 0;
1741 bh != head || !block_start;
1742 block_start=block_end, bh = bh->b_this_page) {
1743 block_end = block_start + blocksize;
1744 if (block_end <= from || block_start >= to) {
1745 if (!buffer_uptodate(bh))
1746 partial = 1;
1747 } else {
1748 set_bit(BH_Uptodate, &bh->b_state);
1749 if (!atomic_set_buffer_dirty(bh)) {
1750 __mark_dirty(bh);
1751 buffer_insert_inode_data_queue(bh, inode);
1752 need_balance_dirty = 1;
1753 }
1754 }
1755 }
1756
1757 if (need_balance_dirty)
1758 balance_dirty();
1759 /*
1760 * If this is a partial write that happened to make all buffers
1761 * uptodate then we can optimize away a bogus readpage() for
1762 * the next read(). Here we 'discover' whether the page went
1763 * uptodate as a result of this (potentially partial) write.
1764 */
1765 if (!partial)
1766 SetPageUptodate(page);
1767 return 0;
1768 }
1769
1770 /*
1771 * Generic "read page" function for block devices that have the normal
1772 * get_block functionality. This is most of the block device filesystems.
1773 * Reads the page asynchronously --- the unlock_buffer() and
1774 * mark_buffer_uptodate() functions propagate buffer state into the
1775 * page struct once IO has completed.
1776 */
1777 int block_read_full_page(struct page *page, get_block_t *get_block)
1778 {
1779 struct inode *inode = page->mapping->host;
1780 unsigned long iblock, lblock;
1781 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1782 unsigned int blocksize, blocks;
1783 int nr, i;
1784
1785 if (!PageLocked(page))
1786 PAGE_BUG(page);
1787 blocksize = 1 << inode->i_blkbits;
1788 if (!page->buffers)
1789 create_empty_buffers(page, inode->i_dev, blocksize);
1790 head = page->buffers;
1791
1792 blocks = PAGE_CACHE_SIZE >> inode->i_blkbits;
1793 iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1794 lblock = (inode->i_size+blocksize-1) >> inode->i_blkbits;
1795 bh = head;
1796 nr = 0;
1797 i = 0;
1798
1799 do {
1800 if (buffer_uptodate(bh))
1801 continue;
1802
1803 if (!buffer_mapped(bh)) {
1804 if (iblock < lblock) {
1805 if (get_block(inode, iblock, bh, 0))
1806 SetPageError(page);
1807 }
1808 if (!buffer_mapped(bh)) {
1809 memset(kmap(page) + i*blocksize, 0, blocksize);
1810 flush_dcache_page(page);
1811 kunmap(page);
1812 set_bit(BH_Uptodate, &bh->b_state);
1813 continue;
1814 }
1815 /* get_block() might have updated the buffer synchronously */
1816 if (buffer_uptodate(bh))
1817 continue;
1818 }
1819
1820 arr[nr] = bh;
1821 nr++;
1822 } while (i++, iblock++, (bh = bh->b_this_page) != head);
1823
1824 if (!nr) {
1825 /*
1826 * All buffers are uptodate - we can set the page uptodate
1827 * as well. But not if get_block() returned an error.
1828 */
1829 if (!PageError(page))
1830 SetPageUptodate(page);
1831 UnlockPage(page);
1832 return 0;
1833 }
1834
1835 /* Stage two: lock the buffers */
1836 for (i = 0; i < nr; i++) {
1837 struct buffer_head * bh = arr[i];
1838 lock_buffer(bh);
1839 set_buffer_async_io(bh);
1840 }
1841
1842 /* Stage 3: start the IO */
1843 for (i = 0; i < nr; i++) {
1844 struct buffer_head * bh = arr[i];
1845 if (buffer_uptodate(bh))
1846 end_buffer_io_async(bh, 1);
1847 else
1848 submit_bh(READ, bh);
1849 }
1850
1851 wakeup_page_waiters(page);
1852
1853 return 0;
1854 }
1855
1856 /* utility function for filesystems that need to do work on expanding
1857 * truncates. Uses prepare/commit_write to allow the filesystem to
1858 * deal with the hole.
1859 */
1860 int generic_cont_expand(struct inode *inode, loff_t size)
1861 {
1862 struct address_space *mapping = inode->i_mapping;
1863 struct page *page;
1864 unsigned long index, offset, limit;
1865 int err;
1866
1867 err = -EFBIG;
1868 limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1869 if (limit != RLIM_INFINITY && size > (loff_t)limit) {
1870 send_sig(SIGXFSZ, current, 0);
1871 goto out;
1872 }
1873 if (size > inode->i_sb->s_maxbytes)
1874 goto out;
1875
1876 offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
1877
1878 /* ugh. in prepare/commit_write, if from==to==start of block, we
1879 ** skip the prepare. make sure we never send an offset for the start
1880 ** of a block
1881 */
1882 if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
1883 offset++;
1884 }
1885 index = size >> PAGE_CACHE_SHIFT;
1886 err = -ENOMEM;
1887 page = grab_cache_page(mapping, index);
1888 if (!page)
1889 goto out;
1890 err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
1891 if (!err) {
1892 err = mapping->a_ops->commit_write(NULL, page, offset, offset);
1893 }
1894 UnlockPage(page);
1895 page_cache_release(page);
1896 if (err > 0)
1897 err = 0;
1898 out:
1899 return err;
1900 }
1901
1902 /*
1903 * For moronic filesystems that do not allow holes in files.
1904 * We may have to extend the file.
1905 */
1906
1907 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1908 {
1909 struct address_space *mapping = page->mapping;
1910 struct inode *inode = mapping->host;
1911 struct page *new_page;
1912 unsigned long pgpos;
1913 long status;
1914 unsigned zerofrom;
1915 unsigned blocksize = 1 << inode->i_blkbits;
1916 char *kaddr;
1917
1918 while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1919 status = -ENOMEM;
1920 new_page = grab_cache_page(mapping, pgpos);
1921 if (!new_page)
1922 goto out;
1923 /* we might sleep */
1924 if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1925 UnlockPage(new_page);
1926 page_cache_release(new_page);
1927 continue;
1928 }
1929 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1930 if (zerofrom & (blocksize-1)) {
1931 *bytes |= (blocksize-1);
1932 (*bytes)++;
1933 }
1934 status = __block_prepare_write(inode, new_page, zerofrom,
1935 PAGE_CACHE_SIZE, get_block);
1936 if (status)
1937 goto out_unmap;
1938 kaddr = page_address(new_page);
1939 memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1940 flush_dcache_page(new_page);
1941 __block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1942 kunmap(new_page);
1943 UnlockPage(new_page);
1944 page_cache_release(new_page);
1945 }
1946
1947 if (page->index < pgpos) {
1948 /* completely inside the area */
1949 zerofrom = offset;
1950 } else {
1951 /* page covers the boundary, find the boundary offset */
1952 zerofrom = *bytes & ~PAGE_CACHE_MASK;
1953
1954 /* if we are expanding the file, the last block will be filled */
1955 if (to > zerofrom && (zerofrom & (blocksize-1))) {
1956 *bytes |= (blocksize-1);
1957 (*bytes)++;
1958 }
1959
1960 /* starting below the boundary? Nothing to zero out */
1961 if (offset <= zerofrom)
1962 zerofrom = offset;
1963 }
1964 status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1965 if (status)
1966 goto out1;
1967 kaddr = page_address(page);
1968 if (zerofrom < offset) {
1969 memset(kaddr+zerofrom, 0, offset-zerofrom);
1970 flush_dcache_page(page);
1971 __block_commit_write(inode, page, zerofrom, offset);
1972 }
1973 return 0;
1974 out1:
1975 ClearPageUptodate(page);
1976 kunmap(page);
1977 return status;
1978
1979 out_unmap:
1980 ClearPageUptodate(new_page);
1981 kunmap(new_page);
1982 UnlockPage(new_page);
1983 page_cache_release(new_page);
1984 out:
1985 return status;
1986 }
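/*
 * Illustrative sketch (not part of this file): a filesystem that cannot
 * represent holes would point its ->prepare_write at a wrapper like the
 * one below, passing its own get_block routine and a per-inode byte
 * counter recording how far the file has been allocated and zeroed.
 * "examplefs_get_block", "EXAMPLEFS_I" and "i_allocated_bytes" are
 * hypothetical names.
 */
#if 0
static int examplefs_prepare_write(struct file *file, struct page *page,
				   unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;

	/* cont_prepare_write() zero-fills every page between the current
	 * end of allocation and this one before preparing the write */
	return cont_prepare_write(page, from, to, examplefs_get_block,
				  &EXAMPLEFS_I(inode)->i_allocated_bytes);
}
#endif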
1987
1988 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1989 get_block_t *get_block)
1990 {
1991 struct inode *inode = page->mapping->host;
1992 int err = __block_prepare_write(inode, page, from, to, get_block);
1993 if (err) {
1994 ClearPageUptodate(page);
1995 kunmap(page);
1996 }
1997 return err;
1998 }
1999
2000 int block_commit_write(struct page *page, unsigned from, unsigned to)
2001 {
2002 struct inode *inode = page->mapping->host;
2003 __block_commit_write(inode,page,from,to);
2004 kunmap(page);
2005 return 0;
2006 }
2007
2008 int generic_commit_write(struct file *file, struct page *page,
2009 unsigned from, unsigned to)
2010 {
2011 struct inode *inode = page->mapping->host;
2012 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2013 __block_commit_write(inode,page,from,to);
2014 kunmap(page);
2015 if (pos > inode->i_size) {
2016 inode->i_size = pos;
2017 mark_inode_dirty(inode);
2018 }
2019 return 0;
2020 }
2021
2022 int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
2023 {
2024 unsigned long index = from >> PAGE_CACHE_SHIFT;
2025 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2026 unsigned blocksize, iblock, length, pos;
2027 struct inode *inode = mapping->host;
2028 struct page *page;
2029 struct buffer_head *bh;
2030 int err;
2031
2032 blocksize = 1 << inode->i_blkbits;
2033 length = offset & (blocksize - 1);
2034
2035 /* Block boundary? Nothing to do */
2036 if (!length)
2037 return 0;
2038
2039 length = blocksize - length;
2040 iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2041
2042 page = grab_cache_page(mapping, index);
2043 err = -ENOMEM;
2044 if (!page)
2045 goto out;
2046
2047 if (!page->buffers)
2048 create_empty_buffers(page, inode->i_dev, blocksize);
2049
2050 /* Find the buffer that contains "offset" */
2051 bh = page->buffers;
2052 pos = blocksize;
2053 while (offset >= pos) {
2054 bh = bh->b_this_page;
2055 iblock++;
2056 pos += blocksize;
2057 }
2058
2059 err = 0;
2060 if (!buffer_mapped(bh)) {
2061 /* Hole? Nothing to do */
2062 if (buffer_uptodate(bh))
2063 goto unlock;
2064 get_block(inode, iblock, bh, 0);
2065 /* Still unmapped? Nothing to do */
2066 if (!buffer_mapped(bh))
2067 goto unlock;
2068 }
2069
2070 /* Ok, it's mapped. Make sure it's up-to-date */
2071 if (Page_Uptodate(page))
2072 set_bit(BH_Uptodate, &bh->b_state);
2073
2074 if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2075 err = -EIO;
2076 ll_rw_block(READ, 1, &bh);
2077 wait_on_buffer(bh);
2078 /* Uhhuh. Read error. Complain and punt. */
2079 if (!buffer_uptodate(bh))
2080 goto unlock;
2081 }
2082
2083 memset(kmap(page) + offset, 0, length);
2084 flush_dcache_page(page);
2085 kunmap(page);
2086
2087 if (!atomic_set_buffer_dirty(bh)) {
2088 __mark_dirty(bh);
2089 buffer_insert_inode_data_queue(bh, inode);
2090 balance_dirty();
2091 }
2092
2093 err = 0;
2094
2095 unlock:
2096 UnlockPage(page);
2097 page_cache_release(page);
2098 out:
2099 return err;
2100 }
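/*
 * Illustrative sketch (not part of this file): a typical ->truncate
 * method zeroes the tail of the final block with block_truncate_page()
 * before releasing the filesystem's block mappings, so stale data cannot
 * reappear if the file later grows again.  "examplefs_get_block" and
 * "examplefs_free_blocks" are hypothetical.
 */
#if 0
static void examplefs_truncate(struct inode *inode)
{
	/* zero the part of the last block beyond the new i_size */
	block_truncate_page(inode->i_mapping, inode->i_size,
			    examplefs_get_block);

	/* then drop the on-disk blocks past the new end of file */
	examplefs_free_blocks(inode, inode->i_size);
}
#endif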
2101
2102 int block_write_full_page(struct page *page, get_block_t *get_block)
2103 {
2104 struct inode *inode = page->mapping->host;
2105 unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2106 unsigned offset;
2107 int err;
2108
2109 /* easy case */
2110 if (page->index < end_index)
2111 return __block_write_full_page(inode, page, get_block);
2112
2113 /* things got complicated... */
2114 offset = inode->i_size & (PAGE_CACHE_SIZE-1);
2115 /* OK, are we completely out? */
2116 if (page->index >= end_index+1 || !offset) {
2117 UnlockPage(page);
2118 return -EIO;
2119 }
2120
2121 /* Sigh... will have to work, then... */
2122 err = __block_prepare_write(inode, page, 0, offset, get_block);
2123 if (!err) {
2124 memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
2125 flush_dcache_page(page);
2126 __block_commit_write(inode,page,0,offset);
2127 done:
2128 kunmap(page);
2129 UnlockPage(page);
2130 return err;
2131 }
2132 ClearPageUptodate(page);
2133 goto done;
2134 }
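/*
 * Illustrative sketch (not part of this file): the usual way a simple
 * block-based filesystem wires the helpers above into its
 * address_space_operations.  "examplefs_get_block" is hypothetical, and
 * block_read_full_page() is assumed to take the same (page, get_block)
 * arguments as block_write_full_page().
 */
#if 0
static int examplefs_writepage(struct page *page)
{
	return block_write_full_page(page, examplefs_get_block);
}
static int examplefs_readpage(struct file *file, struct page *page)
{
	return block_read_full_page(page, examplefs_get_block);
}
static int examplefs_prepare_write(struct file *file, struct page *page,
				   unsigned from, unsigned to)
{
	return block_prepare_write(page, from, to, examplefs_get_block);
}
static int examplefs_bmap(struct address_space *mapping, long block)
{
	return generic_block_bmap(mapping, block, examplefs_get_block);
}

static struct address_space_operations examplefs_aops = {
	readpage:	examplefs_readpage,
	writepage:	examplefs_writepage,
	sync_page:	block_sync_page,
	prepare_write:	examplefs_prepare_write,
	commit_write:	generic_commit_write,
	bmap:		examplefs_bmap,
};
#endif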
2135
2136 /*
2137 * Commence writeout of all the buffers against a page. The
2138 * page must be locked. Returns zero on success or a negative
2139 * errno.
2140 */
2141 int writeout_one_page(struct page *page)
2142 {
2143 struct buffer_head *bh, *head = page->buffers;
2144
2145 if (!PageLocked(page))
2146 BUG();
2147 bh = head;
2148 do {
2149 if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
2150 continue;
2151
2152 bh->b_flushtime = jiffies;
2153 ll_rw_block(WRITE, 1, &bh);
2154 } while ((bh = bh->b_this_page) != head);
2155 return 0;
2156 }
2157 EXPORT_SYMBOL(writeout_one_page);
2158
2159 /*
2160 * Wait for completion of I/O of all buffers against a page. The page
2161 * must be locked. Returns zero on success or a negative errno.
2162 */
2163 int waitfor_one_page(struct page *page)
2164 {
2165 int error = 0;
2166 struct buffer_head *bh, *head = page->buffers;
2167
2168 bh = head;
2169 do {
2170 wait_on_buffer(bh);
2171 if (buffer_req(bh) && !buffer_uptodate(bh))
2172 error = -EIO;
2173 } while ((bh = bh->b_this_page) != head);
2174 return error;
2175 }
2176 EXPORT_SYMBOL(waitfor_one_page);
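/*
 * Illustrative sketch (not part of this file): the two helpers above are
 * normally used as a pair to synchronously flush one page's buffers.
 * "flush_page_sync" is a hypothetical caller; it assumes the caller
 * already holds a reference on the page.
 */
#if 0
static int flush_page_sync(struct page *page)
{
	int err;

	lock_page(page);		/* both helpers require the page lock */
	err = writeout_one_page(page);	/* queue WRITEs for dirty buffers */
	if (!err)
		err = waitfor_one_page(page);	/* wait, collecting any -EIO */
	UnlockPage(page);
	return err;
}
#endif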
2177
2178 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
2179 {
2180 struct buffer_head tmp;
2181 struct inode *inode = mapping->host;
2182 tmp.b_state = 0;
2183 tmp.b_blocknr = 0;
2184 get_block(inode, block, &tmp, 0);
2185 return tmp.b_blocknr;
2186 }
2187
2188 int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
2189 {
2190 int i, nr_blocks, retval;
2191 unsigned long * blocks = iobuf->blocks;
2192 int length;
2193 int beyond_eof = 0;
2194
2195 length = iobuf->length;
2196 nr_blocks = length / blocksize;
2197 /* build the blocklist */
2198 for (i = 0; i < nr_blocks; i++, blocknr++) {
2199 struct buffer_head bh;
2200
2201 bh.b_state = 0;
2202 bh.b_dev = inode->i_dev;
2203 bh.b_size = blocksize;
2204 bh.b_page = NULL;
2205
2206 if (((loff_t) blocknr) * blocksize >= inode->i_size)
2207 beyond_eof = 1;
2208
2209 /* Only allow get_block to create new blocks if we are safely
2210 beyond EOF. O_DIRECT is unsafe inside sparse files. */
2211 retval = get_block(inode, blocknr, &bh,
2212 ((rw != READ) && beyond_eof));
2213
2214 if (retval) {
2215 if (!i)
2216 /* report error to userspace */
2217 goto out;
2218 else
2219 /* do short I/O until 'i' */
2220 break;
2221 }
2222
2223 if (rw == READ) {
2224 if (buffer_new(&bh))
2225 BUG();
2226 if (!buffer_mapped(&bh)) {
2227 /* there was a hole in the filesystem */
2228 blocks[i] = -1UL;
2229 continue;
2230 }
2231 } else {
2232 if (buffer_new(&bh))
2233 unmap_underlying_metadata(&bh);
2234 if (!buffer_mapped(&bh))
2235 /* upper layers need to pass the error on or
2236 * fall back to buffered IO. */
2237 return -ENOTBLK;
2238 }
2239 blocks[i] = bh.b_blocknr;
2240 }
2241
2242 /* patch length to handle short I/O */
2243 iobuf->length = i * blocksize;
2244 if (!beyond_eof)
2245 up(&inode->i_sem);
2246 retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize);
2247 if (!beyond_eof)
2248 down(&inode->i_sem);
2249 /* restore orig length */
2250 iobuf->length = length;
2251 out:
2252
2253 return retval;
2254 }
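/*
 * Illustrative sketch (not part of this file): a filesystem's ->direct_IO
 * method is normally just a thin wrapper that supplies its get_block
 * routine to generic_direct_IO().  "examplefs_get_block" is hypothetical.
 */
#if 0
static int examplefs_direct_IO(int rw, struct inode *inode,
			       struct kiobuf *iobuf,
			       unsigned long blocknr, int blocksize)
{
	return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize,
				 examplefs_get_block);
}
#endif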
2255
2256 /*
2257 * IO completion routine for a buffer_head being used for kiobuf IO: we
2258 * can't dispatch the kiobuf callback until io_count reaches 0.
2259 */
2260
2261 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
2262 {
2263 struct kiobuf *kiobuf;
2264
2265 mark_buffer_uptodate(bh, uptodate);
2266
2267 kiobuf = bh->b_private;
2268 end_kio_request(kiobuf, uptodate);
2269 unlock_buffer(bh);
2270 }
2271
2272 /*
2273 * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
2274 * for them to complete. Clean up the buffer_heads afterwards.
2275 */
2276
2277 static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
2278 {
2279 int iosize, err;
2280 int i;
2281 struct buffer_head *tmp;
2282
2283 iosize = 0;
2284 err = 0;
2285
2286 for (i = nr; --i >= 0; ) {
2287 iosize += size;
2288 tmp = bh[i];
2289 wait_on_buffer(tmp);
2290
2291 if (!buffer_uptodate(tmp)) {
2292 /* We are traversing bh'es in reverse order so
2293 clearing iosize on error calculates the
2294 amount of IO before the first error. */
2295 iosize = 0;
2296 err = -EIO;
2297 }
2298 }
2299
2300 if (iosize)
2301 return iosize;
2302 return err;
2303 }
2304
2305 /*
2306 * Start I/O on a physical range of kernel memory, defined by a vector
2307 * of kiobuf structs (much like a user-space iovec list).
2308 *
2309 * The kiobuf must already be locked for IO. IO is submitted
2310 * asynchronously: you need to check page->locked and page->uptodate.
2311 *
2312 * It is up to the caller to make sure that there are enough blocks
2313 * passed in to completely map the iobufs to disk.
2314 */
2315
2316 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
2317 kdev_t dev, unsigned long b[], int size)
2318 {
2319 int err;
2320 int length;
2321 int transferred;
2322 int i;
2323 int bufind;
2324 int pageind;
2325 int bhind;
2326 int offset;
2327 unsigned long blocknr;
2328 struct kiobuf * iobuf = NULL;
2329 struct page * map;
2330 struct buffer_head *tmp, **bhs = NULL;
2331
2332 if (!nr)
2333 return 0;
2334
2335 /*
2336 * First, do some alignment and validity checks
2337 */
2338 for (i = 0; i < nr; i++) {
2339 iobuf = iovec[i];
2340 if ((iobuf->offset & (size-1)) ||
2341 (iobuf->length & (size-1)))
2342 return -EINVAL;
2343 if (!iobuf->nr_pages)
2344 panic("brw_kiovec: iobuf not initialised");
2345 }
2346
2347 /*
2348 * OK to walk down the iovec doing page IO on each page we find.
2349 */
2350 bufind = bhind = transferred = err = 0;
2351 for (i = 0; i < nr; i++) {
2352 iobuf = iovec[i];
2353 offset = iobuf->offset;
2354 length = iobuf->length;
2355 iobuf->errno = 0;
2356 if (!bhs)
2357 bhs = iobuf->bh;
2358
2359 for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
2360 map = iobuf->maplist[pageind];
2361 if (!map) {
2362 err = -EFAULT;
2363 goto finished;
2364 }
2365
2366 while (length > 0) {
2367 blocknr = b[bufind++];
2368 if (blocknr == -1UL) {
2369 if (rw == READ) {
2370 /* there was a hole in the filesystem */
2371 memset(kmap(map) + offset, 0, size);
2372 flush_dcache_page(map);
2373 kunmap(map);
2374
2375 transferred += size;
2376 goto skip_block;
2377 } else
2378 BUG();
2379 }
2380 tmp = bhs[bhind++];
2381
2382 tmp->b_size = size;
2383 set_bh_page(tmp, map, offset);
2384 tmp->b_this_page = tmp;
2385
2386 init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
2387 tmp->b_dev = dev;
2388 tmp->b_blocknr = blocknr;
2389 tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
2390
2391 if (rw == WRITE) {
2392 set_bit(BH_Uptodate, &tmp->b_state);
2393 clear_bit(BH_Dirty, &tmp->b_state);
2394 } else
2395 set_bit(BH_Uptodate, &tmp->b_state);
2396
2397 atomic_inc(&iobuf->io_count);
2398 submit_bh(rw, tmp);
2399 /*
2400 * Wait for IO if we have got too much
2401 */
2402 if (bhind >= KIO_MAX_SECTORS) {
2403 kiobuf_wait_for_io(iobuf); /* wake-one */
2404 err = wait_kio(rw, bhind, bhs, size);
2405 if (err >= 0)
2406 transferred += err;
2407 else
2408 goto finished;
2409 bhind = 0;
2410 }
2411
2412 skip_block:
2413 length -= size;
2414 offset += size;
2415
2416 if (offset >= PAGE_SIZE) {
2417 offset = 0;
2418 break;
2419 }
2420 } /* End of block loop */
2421 } /* End of page loop */
2422 } /* End of iovec loop */
2423
2424 /* Is there any IO still left to submit? */
2425 if (bhind) {
2426 kiobuf_wait_for_io(iobuf); /* wake-one */
2427 err = wait_kio(rw, bhind, bhs, size);
2428 if (err >= 0)
2429 transferred += err;
2430 else
2431 goto finished;
2432 }
2433
2434 finished:
2435 if (transferred)
2436 return transferred;
2437 return err;
2438 }
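/*
 * Illustrative sketch (not part of this file): driving brw_kiovec()
 * directly for unbuffered device I/O, roughly the way the raw-device
 * driver does.  "example_raw_read" is hypothetical, error handling is
 * trimmed, and the caller is assumed to have validated dev, first_block
 * and len (len must be a multiple of blocksize).
 */
#if 0
static ssize_t example_raw_read(kdev_t dev, unsigned long first_block,
				unsigned long user_addr, size_t len,
				int blocksize)
{
	struct kiobuf *iobuf;
	int i, nr_blocks = len / blocksize;
	ssize_t ret;

	if (nr_blocks > KIO_MAX_SECTORS)
		return -EINVAL;	/* iobuf->blocks[] holds KIO_MAX_SECTORS entries */

	if (alloc_kiovec(1, &iobuf))
		return -ENOMEM;

	/* pin the user pages so the IO can go straight into them */
	ret = map_user_kiobuf(READ, iobuf, user_addr, len);
	if (ret)
		goto out_free;

	/* one block number per 'blocksize' sized chunk of the kiobuf */
	for (i = 0; i < nr_blocks; i++)
		iobuf->blocks[i] = first_block + i;

	/* returns the number of bytes transferred, or a negative errno */
	ret = brw_kiovec(READ, 1, &iobuf, dev, iobuf->blocks, blocksize);

	unmap_kiobuf(iobuf);
out_free:
	free_kiovec(1, &iobuf);
	return ret;
}
#endif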
2439
2440 /*
2441 * Start I/O on a page.
2442 * This function expects the page to be locked and may return
2443 * before I/O is complete. You then have to check page->locked
2444 * and page->uptodate.
2445 *
2446 * brw_page() is SMP-safe, although it's being called with the
2447 * kernel lock held - but the code is ready.
2448 *
2449 * FIXME: we need a swapper_inode->get_block function to remove
2450 * some of the bmap kludges and interface ugliness here.
2451 */
2452 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2453 {
2454 struct buffer_head *head, *bh;
2455
2456 if (!PageLocked(page))
2457 panic("brw_page: page not locked for I/O");
2458
2459 if (!page->buffers)
2460 create_empty_buffers(page, dev, size);
2461 head = bh = page->buffers;
2462
2463 /* Stage 1: lock all the buffers */
2464 do {
2465 lock_buffer(bh);
2466 bh->b_blocknr = *(b++);
2467 set_bit(BH_Mapped, &bh->b_state);
2468 set_buffer_async_io(bh);
2469 bh = bh->b_this_page;
2470 } while (bh != head);
2471
2472 /* Stage 2: start the IO */
2473 do {
2474 struct buffer_head *next = bh->b_this_page;
2475 submit_bh(rw, bh);
2476 bh = next;
2477 } while (bh != head);
2478 wakeup_page_waiters(page);
2479 return 0;
2480 }
2481
2482 int block_symlink(struct inode *inode, const char *symname, int len)
2483 {
2484 struct address_space *mapping = inode->i_mapping;
2485 struct page *page = grab_cache_page(mapping, 0);
2486 int err = -ENOMEM;
2487 char *kaddr;
2488
2489 if (!page)
2490 goto fail;
2491 err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2492 if (err)
2493 goto fail_map;
2494 kaddr = page_address(page);
2495 memcpy(kaddr, symname, len-1);
2496 mapping->a_ops->commit_write(NULL, page, 0, len-1);
2497 /*
2498 * Notice that we are _not_ going to block here - the end of the page
2499 * is unmapped, so this will only try to map the rest of the page, see
2500 * that it is unmapped (typically it will not even look into the inode -
2501 * ->i_size will be enough for everything) and zero it out.
2502 * OTOH it's obviously correct and should make the page up-to-date.
2503 */
2504 err = mapping->a_ops->readpage(NULL, page);
2505 wait_on_page(page);
2506 page_cache_release(page);
2507 if (err < 0)
2508 goto fail;
2509 mark_inode_dirty(inode);
2510 return 0;
2511 fail_map:
2512 UnlockPage(page);
2513 page_cache_release(page);
2514 fail:
2515 return err;
2516 }
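/*
 * Illustrative sketch (not part of this file): how a filesystem's
 * ->symlink method typically stores the target through block_symlink().
 * "examplefs_new_inode", "examplefs_aops" and "examplefs_add_link" are
 * hypothetical; note that the length passed in includes the trailing NUL.
 */
#if 0
static int examplefs_symlink(struct inode *dir, struct dentry *dentry,
			     const char *symname)
{
	struct inode *inode;
	int err, l = strlen(symname) + 1;

	inode = examplefs_new_inode(dir, S_IFLNK | S_IRWXUGO);
	if (!inode)
		return -ENOSPC;

	inode->i_op = &page_symlink_inode_operations;
	inode->i_mapping->a_ops = &examplefs_aops;

	err = block_symlink(inode, symname, l);
	if (err) {
		inode->i_nlink--;
		mark_inode_dirty(inode);
		iput(inode);
		return err;
	}
	return examplefs_add_link(dentry, inode);
}
#endif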
2517
2518 static inline void link_dev_buffers(struct page * page, struct buffer_head *head)
2519 {
2520 struct buffer_head *bh, *tail;
2521
2522 bh = head;
2523 do {
2524 tail = bh;
2525 bh = bh->b_this_page;
2526 } while (bh);
2527 tail->b_this_page = head;
2528 page->buffers = head;
2529 page_cache_get(page);
2530 }
2531
2532 /*
2533 * Create the page-cache page that contains the requested block
2534 */
2535 static struct page * grow_dev_page(struct block_device *bdev, unsigned long index, int size)
2536 {
2537 struct page * page;
2538 struct buffer_head *bh;
2539
2540 page = find_or_create_page(bdev->bd_inode->i_mapping, index, GFP_NOFS);
2541 if (!page)
2542 return NULL;
2543
2544 if (!PageLocked(page))
2545 BUG();
2546
2547 bh = page->buffers;
2548 if (bh) {
2549 if (bh->b_size == size)
2550 return page;
2551 if (!try_to_free_buffers(page, GFP_NOFS))
2552 goto failed;
2553 }
2554
2555 bh = create_buffers(page, size, 0);
2556 if (!bh)
2557 goto failed;
2558 link_dev_buffers(page, bh);
2559 return page;
2560
2561 failed:
2562 UnlockPage(page);
2563 page_cache_release(page);
2564 return NULL;
2565 }
2566
2567 static void hash_page_buffers(struct page *page, kdev_t dev, int block, int size)
2568 {
2569 struct buffer_head *head = page->buffers;
2570 struct buffer_head *bh = head;
2571 unsigned int uptodate;
2572
2573 uptodate = 1 << BH_Mapped;
2574 if (Page_Uptodate(page))
2575 uptodate |= 1 << BH_Uptodate;
2576
2577 write_lock(&hash_table_lock);
2578 do {
2579 if (!(bh->b_state & (1 << BH_Mapped))) {
2580 init_buffer(bh, NULL, NULL);
2581 bh->b_dev = dev;
2582 bh->b_blocknr = block;
2583 bh->b_state = uptodate;
2584 }
2585
2586 /* Insert the buffer into the hash lists if necessary */
2587 if (!bh->b_pprev)
2588 __insert_into_hash_list(bh);
2589
2590 block++;
2591 bh = bh->b_this_page;
2592 } while (bh != head);
2593 write_unlock(&hash_table_lock);
2594 }
2595
2596 /*
2597 * Try to increase the number of buffers available: the size argument
2598 * is used to determine what kind of buffers we want.
2599 */
2600 static int grow_buffers(kdev_t dev, unsigned long block, int size)
2601 {
2602 struct page * page;
2603 struct block_device *bdev;
2604 unsigned long index;
2605 int sizebits;
2606
2607 /* Size must be multiple of hard sectorsize */
2608 if (size & (get_hardsect_size(dev)-1))
2609 BUG();
2610 /* Size must be between 512 bytes and PAGE_SIZE */
2611 if (size < 512 || size > PAGE_SIZE)
2612 BUG();
2613
2614 sizebits = -1;
2615 do {
2616 sizebits++;
2617 } while ((size << sizebits) < PAGE_SIZE);
2618
2619 index = block >> sizebits;
2620 block = index << sizebits;
2621
2622 bdev = bdget(kdev_t_to_nr(dev));
2623 if (!bdev) {
2624 printk("No block device for %s\n", kdevname(dev));
2625 BUG();
2626 }
2627
2628 /* Create a page with the proper size buffers.. */
2629 page = grow_dev_page(bdev, index, size);
2630
2631 /* This is "wrong" - talk to Al Viro */
2632 atomic_dec(&bdev->bd_count);
2633 if (!page)
2634 return 0;
2635
2636 /* Hash in the buffers on the hash list */
2637 hash_page_buffers(page, dev, block, size);
2638 UnlockPage(page);
2639 page_cache_release(page);
2640
2641 /* We hashed up this page, so increment buffermem */
2642 atomic_inc(&buffermem_pages);
2643 return 1;
2644 }
2645
2646 /*
2647 * The first time the VM inspects a page which has locked buffers, it
2648 * will just mark it as needing to be waited upon during the scan of the page LRU.
2649 * BH_Wait_IO is used for this.
2650 *
2651 * The second time the VM visits the page, if it still has locked
2652 * buffers, it is time to start writing them out. (BH_Wait_IO was set).
2653 *
2654 * The third time the VM visits the page, if the I/O hasn't completed
2655 * then it's time to wait upon writeout. BH_Lock and BH_Launder are
2656 * used for this.
2657 *
2658 * There is also the case of buffers which were locked by someone else
2659 * - write(2) callers, bdflush, etc. There can be a huge number of these
2660 * and we don't want to just skip them all and fail the page allocation.
2661 * We want to be able to wait on these buffers as well.
2662 *
2663 * The BH_Launder bit is set in submit_bh() to indicate that I/O is
2664 * underway against the buffer, no matter who started it - we know
2665 * that the buffer will eventually come unlocked, and so it's safe to
2666 * wait on it.
2667 *
2668 * The caller holds the page lock and the caller will free this page
2669 * into current->local_page, so by waiting on the page's buffers the
2670 * caller is guaranteed to obtain this page.
2671 *
2672 * sync_page_buffers() will sort-of return true if all the buffers
2673 * against this page are freeable, so try_to_free_buffers() should
2674 * try to free the page's buffers a second time. This is a bit
2675 * broken for blocksize < PAGE_CACHE_SIZE, but not very importantly.
2676 */
2677 static int sync_page_buffers(struct buffer_head *head)
2678 {
2679 struct buffer_head * bh = head;
2680 int tryagain = 1;
2681
2682 do {
2683 if (!buffer_dirty(bh) && !buffer_locked(bh))
2684 continue;
2685
2686 /* Don't start IO first time around.. */
2687 if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) {
2688 tryagain = 0;
2689 continue;
2690 }
2691
2692 /* Second time through we start actively writing out.. */
2693 if (test_and_set_bit(BH_Lock, &bh->b_state)) {
2694 if (unlikely(!buffer_launder(bh))) {
2695 tryagain = 0;
2696 continue;
2697 }
2698 wait_on_buffer(bh);
2699 tryagain = 1;
2700 continue;
2701 }
2702
2703 if (!atomic_set_buffer_clean(bh)) {
2704 unlock_buffer(bh);
2705 continue;
2706 }
2707
2708 __mark_buffer_clean(bh);
2709 get_bh(bh);
2710 bh->b_end_io = end_buffer_io_sync;
2711 submit_bh(WRITE, bh);
2712 tryagain = 0;
2713 } while ((bh = bh->b_this_page) != head);
2714
2715 return tryagain;
2716 }
2717
2718 /*
2719 * Can the buffer be thrown out?
2720 */
2721 #define BUFFER_BUSY_BITS ((1<<BH_Dirty) | (1<<BH_Lock))
2722 #define buffer_busy(bh) (atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2723
2724 /*
2725 * try_to_free_buffers() checks if all the buffers on this particular page
2726 * are unused, and frees the page if so.
2727 *
2728 * Wake up bdflush() if this fails - if we're running low on memory due
2729 * to dirty buffers, we need to flush them out as quickly as possible.
2730 *
2731 * NOTE: There are quite a number of ways that threads of control can
2732 * obtain a reference to a buffer head within a page. So we must
2733 * lock out all of these paths to cleanly toss the page.
2734 */
2735 int fastcall try_to_free_buffers(struct page * page, unsigned int gfp_mask)
2736 {
2737 struct buffer_head * tmp, * bh = page->buffers;
2738
2739 cleaned_buffers_try_again:
2740 spin_lock(&lru_list_lock);
2741 write_lock(&hash_table_lock);
2742 tmp = bh;
2743 do {
2744 if (buffer_busy(tmp))
2745 goto busy_buffer_page;
2746 tmp = tmp->b_this_page;
2747 } while (tmp != bh);
2748
2749 spin_lock(&unused_list_lock);
2750 tmp = bh;
2751
2752 /* if this buffer was hashed, this page counts as buffermem */
2753 if (bh->b_pprev)
2754 atomic_dec(&buffermem_pages);
2755 do {
2756 struct buffer_head * p = tmp;
2757 tmp = tmp->b_this_page;
2758
2759 if (p->b_dev == B_FREE) BUG();
2760
2761 remove_inode_queue(p);
2762 __remove_from_queues(p);
2763 __put_unused_buffer_head(p);
2764 } while (tmp != bh);
2765 spin_unlock(&unused_list_lock);
2766
2767 /* Wake up anyone waiting for buffer heads */
2768 wake_up(&buffer_wait);
2769
2770 /* And free the page */
2771 page->buffers = NULL;
2772 page_cache_release(page);
2773 write_unlock(&hash_table_lock);
2774 spin_unlock(&lru_list_lock);
2775 return 1;
2776
2777 busy_buffer_page:
2778 /* Uhhuh, start writeback so that we don't end up with all dirty pages */
2779 write_unlock(&hash_table_lock);
2780 spin_unlock(&lru_list_lock);
2781 gfp_mask = pf_gfp_mask(gfp_mask);
2782 if (gfp_mask & __GFP_IO) {
2783 if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
2784 if (sync_page_buffers(bh)) {
2785 /* no IO or waiting next time */
2786 gfp_mask = 0;
2787 goto cleaned_buffers_try_again;
2788 }
2789 }
2790 }
2791 if (balance_dirty_state() >= 0)
2792 wakeup_bdflush();
2793 return 0;
2794 }
2795 EXPORT_SYMBOL(try_to_free_buffers);
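/*
 * Illustrative sketch (not part of this file): dropping the buffers of a
 * page the caller owns, in the same way grow_dev_page() above does when
 * it has to change a page's buffer size.  The page must be locked, and
 * the caller must cope with failure (some buffer still dirty, locked or
 * referenced).  "example_drop_page_buffers" is hypothetical.
 */
#if 0
static int example_drop_page_buffers(struct page *page)
{
	int freed = 1;

	lock_page(page);
	if (page->buffers)
		/* returns 0 if any buffer is still busy */
		freed = try_to_free_buffers(page, GFP_NOFS);
	UnlockPage(page);
	return freed;
}
#endif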
2796
2797 /* ================== Debugging =================== */
2798
2799 void show_buffers(void)
2800 {
2801 #ifdef CONFIG_SMP
2802 struct buffer_head * bh;
2803 int delalloc = 0, found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2804 int nlist;
2805 static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", };
2806 #endif
2807
2808 printk("Buffer memory: %6dkB\n",
2809 atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2810
2811 printk("Cache memory: %6ldkB\n",
2812 (page_cache_size - atomic_read(&buffermem_pages)) << (PAGE_SHIFT-10));
2813
2814 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2815 if (!spin_trylock(&lru_list_lock))
2816 return;
2817 for(nlist = 0; nlist < NR_LIST; nlist++) {
2818 delalloc = found = locked = dirty = used = lastused = 0;
2819 bh = lru_list[nlist];
2820 if(!bh) continue;
2821
2822 do {
2823 found++;
2824 if (buffer_locked(bh))
2825 locked++;
2826 if (buffer_dirty(bh))
2827 dirty++;
2828 if (buffer_delay(bh))
2829 delalloc++;
2830 if (atomic_read(&bh->b_count))
2831 used++, lastused = found;
2832 bh = bh->b_next_free;
2833 } while (bh != lru_list[nlist]);
2834 {
2835 int tmp = nr_buffers_type[nlist];
2836 if (found != tmp)
2837 printk("%9s: BUG -> found %d, reported %d\n",
2838 buf_types[nlist], found, tmp);
2839 }
2840 printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2841 "%d locked, %d dirty, %d delay\n",
2842 buf_types[nlist], found, size_buffers_type[nlist]>>(10-9),
2843 used, lastused, locked, dirty, delalloc);
2844 }
2845 spin_unlock(&lru_list_lock);
2846 #endif
2847 }
2848
2849 /* ===================== Init ======================= */
2850
2851 /*
2852 * allocate the hash table and init the free list
2853 * Use gfp() for the hash table to decrease TLB misses, use
2854 * SLAB cache for buffer heads.
2855 */
2856 void __init buffer_init(unsigned long mempages)
2857 {
2858 int order, i;
2859 unsigned int nr_hash;
2860
2861 /* The buffer cache hash table is less important these days,
2862 * trim it a bit.
2863 */
2864 mempages >>= 14;
2865
2866 mempages *= sizeof(struct buffer_head *);
2867
2868 for (order = 0; (1 << order) < mempages; order++)
2869 ;
2870
2871 /* try to allocate something until we get it or we're asking
2872 for something that is really too small */
2873
2874 do {
2875 unsigned long tmp;
2876
2877 nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2878 bh_hash_mask = (nr_hash - 1);
2879
2880 tmp = nr_hash;
2881 bh_hash_shift = 0;
2882 while((tmp >>= 1UL) != 0UL)
2883 bh_hash_shift++;
2884
2885 hash_table = (struct buffer_head **)
2886 __get_free_pages(GFP_ATOMIC, order);
2887 } while (hash_table == NULL && --order > 0);
2888 printk(KERN_INFO "Buffer cache hash table entries: %d (order: %d, %ld bytes)\n",
2889 nr_hash, order, (PAGE_SIZE << order));
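	/*
	 * Worked example of the sizing above (assuming 4 KiB pages and
	 * 4-byte pointers): with 128 MiB of memory, mempages starts at
	 * 32768, becomes 32768 >> 14 = 2, then 2 * 4 = 8 bytes, which
	 * gives order 3.  nr_hash = (4096 << 3) / 4 = 8192 buckets, so
	 * bh_hash_mask = 8191 and bh_hash_shift = 13.
	 */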
2890
2891 if (!hash_table)
2892 panic("Failed to allocate buffer hash table\n");
2893
2894 /* Setup hash chains. */
2895 for(i = 0; i < nr_hash; i++)
2896 hash_table[i] = NULL;
2897
2898 /* Setup lru lists. */
2899 for(i = 0; i < NR_LIST; i++)
2900 lru_list[i] = NULL;
2901
2902 }
2903
2904
2905 /* ====================== bdflush support =================== */
2906
2907 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2908 * response to dirty buffers. Once this process is activated, we write back
2909 * a limited number of buffers to the disks and then go back to sleep again.
2910 */
2911
2912 DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
2913
2914 void wakeup_bdflush(void)
2915 {
2916 wake_up_interruptible(&bdflush_wait);
2917 }
2918
2919 void wakeup_kupdate(void)
2920 {
2921 if (waitqueue_active(&kupdate_wait))
2922 wake_up(&kupdate_wait);
2923 }
2924
2925 /*
2926 * Here we attempt to write back old buffers. We also try to flush inodes
2927 * and supers, since this function is essentially "update", and
2928 * otherwise there would be no way of ensuring that these quantities ever
2929 * get written back. Ideally, we would have a timestamp on the inodes
2930 * and superblocks so that we could write back only the old ones as well.
2931 */
2932
2933 static int sync_old_buffers(void)
2934 {
2935 lock_kernel();
2936 sync_unlocked_inodes();
2937 sync_supers(0, 0);
2938 unlock_kernel();
2939
2940 for (;;) {
2941 struct buffer_head *bh;
2942
2943 spin_lock(&lru_list_lock);
2944 bh = lru_list[BUF_DIRTY];
2945 if (!bh)
2946 break;
2947 if (time_before(jiffies, bh->b_flushtime) && !laptop_mode)
2948 break;
2949 if (write_some_buffers(NODEV))
2950 continue;
2951 return 0;
2952 }
2953 spin_unlock(&lru_list_lock);
2954 return 0;
2955 }
2956
2957 int block_sync_page(struct page *page)
2958 {
2959 run_task_queue(&tq_disk);
2960 return 0;
2961 }
2962
2963 /* This is the interface to bdflush. As we get more sophisticated, we can
2964 * pass tuning parameters to this "process", to adjust how it behaves.
2965 * We would want to verify each parameter, however, to make sure that it
2966 * is reasonable. */
2967
2968 asmlinkage long sys_bdflush(int func, long data)
2969 {
2970 if (!capable(CAP_SYS_ADMIN))
2971 return -EPERM;
2972
2973 if (func == 1) {
2974 /* do_exit directly and let kupdate do its work alone. */
2975 do_exit(0);
2976 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2977 a syscall that doesn't care about the current mm context. */
2978 int error;
2979 struct mm_struct *user_mm;
2980
2981 /*
2982 * bdflush will spend all of its time in kernel-space,
2983 * without touching user-space, so we can switch it into
2984 * 'lazy TLB mode' to reduce the cost of context-switches
2985 * to and from bdflush.
2986 */
2987 user_mm = start_lazy_tlb();
2988 error = sync_old_buffers();
2989 end_lazy_tlb(user_mm);
2990 return error;
2991 #endif
2992 }
2993
2994 /* Basically func 2 means read param 1, func 3 means write param 1, etc */
2995 if (func >= 2) {
2996 int i = (func-2) >> 1;
2997 if (i >= 0 && i < N_PARAM) {
2998 if ((func & 1) == 0)
2999 return put_user(bdf_prm.data[i], (int*)data);
3000
3001 if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
3002 bdf_prm.data[i] = data;
3003 return 0;
3004 }
3005 }
3006 return -EINVAL;
3007 }
3008
3009 /* Func 0 used to launch the actual bdflush and then never
3010 * return (unless it was explicitly killed). We return zero here to
3011 * remain semi-compatible with present update(8) programs.
3012 */
3013 return 0;
3014 }
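/*
 * Illustrative userspace sketch (not part of this file): reading and
 * writing the first tuning parameter through this syscall, the way old
 * update(8)/bdflush(8) utilities did.  Requires CAP_SYS_ADMIN; the
 * accepted ranges are enforced by bdflush_min/bdflush_max, and the
 * SYS_bdflush number is assumed to be provided by the libc headers.
 */
#if 0
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	int param1;

	/* func 2 reads parameter 1 into the int that 'data' points to */
	if (syscall(SYS_bdflush, 2, (long)&param1) == 0)
		printf("param 1 = %d\n", param1);

	/* func 3 writes parameter 1; the new value is passed in 'data' */
	if (syscall(SYS_bdflush, 3, 40L) != 0)
		perror("bdflush");

	return 0;
}
#endif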
3015
3016 /*
3017 * This is the actual bdflush daemon itself. It used to be started from
3018 * the syscall above, but now we launch it ourselves internally with
3019 * kernel_thread(...) directly after the first thread in init/main.c
3020 */
3021 int bdflush(void *startup)
3022 {
3023 struct task_struct *tsk = current;
3024
3025 /*
3026 * We have a bare-bones task_struct, and really should fill
3027 * in a few more things so "top" and /proc/2/{exe,root,cwd}
3028 * display semi-sane things. Not really crucial though...
3029 */
3030
3031 tsk->session = 1;
3032 tsk->pgrp = 1;
3033 strcpy(tsk->comm, "bdflush");
3034
3035 /* avoid getting signals */
3036 spin_lock_irq(&tsk->sigmask_lock);
3037 flush_signals(tsk);
3038 sigfillset(&tsk->blocked);
3039 recalc_sigpending(tsk);
3040 spin_unlock_irq(&tsk->sigmask_lock);
3041
3042 complete((struct completion *)startup);
3043
3044 /*
3045 * FIXME: The ndirty logic here is wrong. It's supposed to
3046 * send bdflush back to sleep after writing ndirty buffers.
3047 * As it stands, the test is wrong, so bdflush will in fact sleep when
3048 * sleep when bdflush_stop() returns true.
3049 *
3050 * FIXME: If it proves useful to implement ndirty properly,
3051 * then perhaps the value of ndirty should be scaled by the
3052 * amount of memory in the machine.
3053 */
3054 for (;;) {
3055 int ndirty = bdf_prm.b_un.ndirty;
3056
3057 CHECK_EMERGENCY_SYNC
3058
3059 while (ndirty > 0) {
3060 spin_lock(&lru_list_lock);
3061 if (!write_some_buffers(NODEV))
3062 break;
3063 ndirty -= NRSYNC;
3064 }
3065 if (ndirty > 0 || bdflush_stop())
3066 interruptible_sleep_on(&bdflush_wait);
3067 }
3068 }
3069
3070 /*
3071 * This is the kernel update daemon. It used to live in userspace,
3072 * but since it needs to run safely we don't want it killed by mistake.
3073 * You don't need to change your userspace configuration since
3074 * the userspace `update` will do_exit(0) at the first sys_bdflush().
3075 */
3076 int kupdate(void *startup)
3077 {
3078 struct task_struct * tsk = current;
3079 int interval;
3080
3081 tsk->session = 1;
3082 tsk->pgrp = 1;
3083 strcpy(tsk->comm, "kupdated");
3084
3085 /* SIGSTOP and SIGCONT will stop and wake up kupdate */
3086 spin_lock_irq(&tsk->sigmask_lock);
3087 sigfillset(&tsk->blocked);
3088 siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
3089 recalc_sigpending(tsk);
3090 spin_unlock_irq(&tsk->sigmask_lock);
3091
3092 complete((struct completion *)startup);
3093
3094 for (;;) {
3095 DECLARE_WAITQUEUE(wait, tsk);
3096
3097 add_wait_queue(&kupdate_wait, &wait);
3098
3099 /* update interval */
3100 interval = bdf_prm.b_un.interval;
3101 if (interval) {
3102 tsk->state = TASK_INTERRUPTIBLE;
3103 schedule_timeout(interval);
3104 } else {
3105 tsk->state = TASK_STOPPED;
3106 schedule(); /* wait for SIGCONT */
3107 }
3108 remove_wait_queue(&kupdate_wait, &wait);
3109 /* check for sigstop */
3110 if (signal_pending(tsk)) {
3111 int sig, stopped = 0;
3112 struct siginfo info;
3113
3114 spin_lock_irq(&tsk->sigmask_lock);
3115 sig = dequeue_signal(&current->blocked, &info);
3116 if (sig == SIGSTOP)
3117 stopped = 1;
3118 spin_unlock_irq(&tsk->sigmask_lock);
3119 if (stopped) {
3120 tsk->state = TASK_STOPPED;
3121 schedule(); /* wait for SIGCONT */
3122 }
3123 }
3124 #ifdef DEBUG
3125 printk(KERN_DEBUG "kupdate() activated...\n");
3126 #endif
3127 sync_old_buffers();
3128 if (laptop_mode)
3129 fsync_dev(NODEV);
3130 run_task_queue(&tq_disk);
3131 }
3132 }
3133
3134 static int __init bdflush_init(void)
3135 {
3136 static struct completion startup __initdata = COMPLETION_INITIALIZER(startup);
3137
3138 kernel_thread(bdflush, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
3139 wait_for_completion(&startup);
3140 kernel_thread(kupdate, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
3141 wait_for_completion(&startup);
3142 return 0;
3143 }
3144
3145 module_init(bdflush_init)
3146
3147