1 /*
2  *  linux/fs/buffer.c
3  *
4  *  Copyright (C) 1991, 1992  Linus Torvalds
5  */
6 
7 /*
8  *  'buffer.c' implements the buffer-cache functions. Race-conditions have
9  * been avoided by NEVER letting an interrupt change a buffer (except for the
10  * data, of course), but instead letting the caller do it.
11  */
12 
13 /* Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95 */
14 
15 /* Removed a lot of unnecessary code and simplified things now that
16  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
17  */
18 
19 /* Speed up hash, lru, and free list operations.  Use gfp() for allocating
20  * hash table, use SLAB cache for buffer heads. -DaveM
21  */
22 
23 /* Added 32k buffer block sizes - these are required by older ARM systems.
24  * - RMK
25  */
26 
27 /* Thread it... -DaveM */
28 
29 /* async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de> */
30 
31 #include <linux/config.h>
32 #include <linux/sched.h>
33 #include <linux/fs.h>
34 #include <linux/slab.h>
35 #include <linux/locks.h>
36 #include <linux/errno.h>
37 #include <linux/swap.h>
38 #include <linux/swapctl.h>
39 #include <linux/smp_lock.h>
40 #include <linux/vmalloc.h>
41 #include <linux/blkdev.h>
42 #include <linux/sysrq.h>
43 #include <linux/file.h>
44 #include <linux/init.h>
45 #include <linux/quotaops.h>
46 #include <linux/iobuf.h>
47 #include <linux/highmem.h>
48 #include <linux/module.h>
49 #include <linux/completion.h>
50 
51 #include <asm/uaccess.h>
52 #include <asm/io.h>
53 #include <asm/bitops.h>
54 #include <asm/mmu_context.h>
55 
56 #define NR_RESERVED (10*MAX_BUF_PER_PAGE)
57 #define MAX_UNUSED_BUFFERS (NR_RESERVED+20) /* don't ever have more than this
58 					     number of unused buffer heads */
59 
60 /* Anti-deadlock ordering:
61  *	lru_list_lock > hash_table_lock > unused_list_lock
62  */
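/*
 * Illustrative sketch (not in the original source): when more than one of
 * these locks is needed they must be nested in the order above, e.g.
 *
 *	spin_lock(&lru_list_lock);
 *	write_lock(&hash_table_lock);
 *	... manipulate the buffer ...
 *	write_unlock(&hash_table_lock);
 *	spin_unlock(&lru_list_lock);
 *
 * as remove_from_queues() below does.
 */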
63 
64 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers)
65 
66 /*
67  * Hash table gook..
68  */
69 static unsigned int bh_hash_mask;
70 static unsigned int bh_hash_shift;
71 static struct buffer_head **hash_table;
72 static rwlock_t hash_table_lock = RW_LOCK_UNLOCKED;
73 
74 static struct buffer_head *lru_list[NR_LIST];
75 
76 static spinlock_cacheline_t lru_list_lock_cacheline = {SPIN_LOCK_UNLOCKED};
77 #define lru_list_lock  lru_list_lock_cacheline.lock
78 
79 static int nr_buffers_type[NR_LIST];
80 static unsigned long size_buffers_type[NR_LIST];
81 
82 static struct buffer_head * unused_list;
83 static int nr_unused_buffer_heads;
84 static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED;
85 static DECLARE_WAIT_QUEUE_HEAD(buffer_wait);
86 
87 static int grow_buffers(kdev_t dev, unsigned long block, int size);
88 static int osync_buffers_list(struct list_head *);
89 static void __refile_buffer(struct buffer_head *);
90 
91 /*
92  * A global sysctl-controlled flag which puts the machine into "laptop mode"
93  */
94 int laptop_mode;
95 
96 static DECLARE_WAIT_QUEUE_HEAD(kupdate_wait);
97 
98 /* This is used by some architectures to estimate available memory. */
99 atomic_t buffermem_pages = ATOMIC_INIT(0);
100 
101 /* Here is the parameter block for the bdflush process. If you add or
102  * remove any of the parameters, make sure to update kernel/sysctl.c
103  * and the documentation at linux/Documentation/sysctl/vm.txt.
104  */
105 
106 #define N_PARAM 9
107 
108 /* The dummy values in this structure are left in there for compatibility
109  * with old programs that play with the /proc entries.
110  */
111 union bdflush_param {
112 	struct {
113 		int nfract;	/* Percentage of buffer cache dirty to
114 				   activate bdflush */
115 		int ndirty;	/* Maximum number of dirty blocks to write out per
116 				   wake-cycle */
117 		int dummy2;	/* old "nrefill" */
118 		int dummy3;	/* unused */
119 		int interval;	/* jiffies delay between kupdate flushes */
120 		int age_buffer;	/* Time for normal buffer to age before we flush it */
121 		int nfract_sync;/* Percentage of buffer cache dirty to
122 				   activate bdflush synchronously */
123 		int nfract_stop_bdflush; /* Percentage of buffer cache dirty to stop bdflush */
124 		int dummy5;	/* unused */
125 	} b_un;
126 	unsigned int data[N_PARAM];
127 } bdf_prm = {{30, 500, 0, 0, 5*HZ, 30*HZ, 60, 20, 0}};
128 
129 /* These are the min and max parameter values that we will allow to be assigned */
130 int bdflush_min[N_PARAM] = {  0,  1,    0,   0,  0,   1*HZ,   0, 0, 0};
131 int bdflush_max[N_PARAM] = {100,50000, 20000, 20000,10000*HZ, 10000*HZ, 100, 100, 0};
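/* Worked example (illustrative only, using the defaults above): if
 * nr_free_buffer_pages() reports 100000 pages, balance_dirty_state() below
 * compares dirty_pages * 100 against these percentages, so bdflush is woken
 * once more than 30000 pages of dirty buffers accumulate (nfract = 30),
 * writers start flushing buffers themselves above 60000 pages
 * (nfract_sync = 60), and, outside laptop_mode, bdflush_stop() lets bdflush
 * go back to sleep once the total drops below 20000 pages
 * (nfract_stop_bdflush = 20).  kupdate runs every interval = 5*HZ jiffies
 * (5 seconds) and a dirty buffer is normally flushed once it is
 * age_buffer = 30*HZ jiffies old.
 */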
132 
133 static inline int write_buffer_delay(struct buffer_head *bh)
134 {
135 	struct page *page = bh->b_page;
136 
137 	if (!TryLockPage(page)) {
138 		spin_unlock(&lru_list_lock);
139 		unlock_buffer(bh);
140 		page->mapping->a_ops->writepage(page);
141 		return 1;
142 	}
143 
144 	return 0;
145 }
146 
147 static inline void write_buffer(struct buffer_head *bh)
148 {
149 	if (buffer_delay(bh)) {
150 		struct page *page = bh->b_page;
151 
152 		lock_page(page);
153 		if (buffer_delay(bh)) {
154 			page->mapping->a_ops->writepage(page);
155 			return;
156 		}
157 		unlock_page(page);
158 	}
159 
160 	ll_rw_block(WRITE, 1, &bh);
161 }
162 
163 void fastcall unlock_buffer(struct buffer_head *bh)
164 {
165 	clear_bit(BH_Wait_IO, &bh->b_state);
166 	clear_bit(BH_Launder, &bh->b_state);
167 	/*
168 	 * When a locked buffer is visible to the I/O layer BH_Launder
169 	 * is set. This means that before unlocking we must clear BH_Launder,
170 	 * issue a memory barrier (mb() on alpha) and only then clear BH_Lock, so no
171 	 * reader can see BH_Launder set on an unlocked buffer and risk a deadlock.
172 	 */
173 	smp_mb__after_clear_bit();
174 	clear_bit(BH_Lock, &bh->b_state);
175 	smp_mb__after_clear_bit();
176 	if (waitqueue_active(&bh->b_wait))
177 		wake_up(&bh->b_wait);
178 }
179 
180 /*
181  * Note that the real wait_on_buffer() is an inline function that checks
182  * that the buffer is locked before calling this, so that unnecessary disk
183  * unplugging does not occur.
184  */
185 void __wait_on_buffer(struct buffer_head * bh)
186 {
187 	struct task_struct *tsk = current;
188 	DECLARE_WAITQUEUE(wait, tsk);
189 
190 	get_bh(bh);
191 	add_wait_queue(&bh->b_wait, &wait);
192 	do {
193 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
194 		if (!buffer_locked(bh))
195 			break;
196 		/*
197 		 * We must read tq_disk in TQ_ACTIVE after the
198 		 * add_wait_queue effect is visible to other cpus.
199 		 * We could unplug a few lines above and it wouldn't matter,
200 		 * but we can't do that right after add_wait_queue
201 		 * without an smp_mb() in between, because spin_unlock
202 		 * has inclusive semantics.
203 		 * Doing it here is the most efficient place: we avoid
204 		 * a spurious unplug if we get a racy wakeup that makes
205 		 * buffer_locked() return 0, and we avoid an explicit
206 		 * smp_mb() by relying on the implicit one in
207 		 * set_task_state().
208 		 */
209 		run_task_queue(&tq_disk);
210 		schedule();
211 	} while (buffer_locked(bh));
212 	tsk->state = TASK_RUNNING;
213 	remove_wait_queue(&bh->b_wait, &wait);
214 	put_bh(bh);
215 }
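/*
 * Illustrative sketch (not part of this file): the inline wait_on_buffer()
 * wrapper mentioned in the comment above __wait_on_buffer() looks roughly
 * like this (the real definition lives in a header, <linux/locks.h> in this
 * kernel generation):
 *
 *	static inline void wait_on_buffer(struct buffer_head *bh)
 *	{
 *		if (test_bit(BH_Lock, &bh->b_state))
 *			__wait_on_buffer(bh);
 *	}
 */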
216 
217 /*
218  * Default synchronous end-of-IO handler..  Just mark it up-to-date and
219  * unlock the buffer. This is what ll_rw_block uses too.
220  */
221 void end_buffer_io_sync(struct buffer_head *bh, int uptodate)
222 {
223 	mark_buffer_uptodate(bh, uptodate);
224 	unlock_buffer(bh);
225 	put_bh(bh);
226 }
227 
228 /*
229  * The buffers have been marked clean and locked.  Just submit the dang
230  * things..
231  */
232 static void write_locked_buffers(struct buffer_head **array, unsigned int count)
233 {
234 	do {
235 		struct buffer_head * bh = *array++;
236 		bh->b_end_io = end_buffer_io_sync;
237 		submit_bh(WRITE, bh);
238 	} while (--count);
239 }
240 
241 /*
242  * Write some buffers from the head of the dirty queue.
243  *
244  * This must be called with the LRU lock held, and will
245  * return without it!
246  */
247 #define NRSYNC (32)
248 static int write_some_buffers(kdev_t dev)
249 {
250 	struct buffer_head *next;
251 	struct buffer_head *array[NRSYNC];
252 	unsigned int count;
253 	int nr;
254 
255 	next = lru_list[BUF_DIRTY];
256 	nr = nr_buffers_type[BUF_DIRTY];
257 	count = 0;
258 	while (next && --nr >= 0) {
259 		struct buffer_head * bh = next;
260 		next = bh->b_next_free;
261 
262 		if (dev != NODEV && bh->b_dev != dev)
263 			continue;
264 		if (test_and_set_bit(BH_Lock, &bh->b_state))
265 			continue;
266 		if (buffer_delay(bh)) {
267 			if (write_buffer_delay(bh)) {
268 				if (count)
269 					write_locked_buffers(array, count);
270 				return -EAGAIN;
271 			}
272 		} else if (atomic_set_buffer_clean(bh)) {
273 			__refile_buffer(bh);
274 			get_bh(bh);
275 			array[count++] = bh;
276 			if (count < NRSYNC)
277 				continue;
278 
279 			spin_unlock(&lru_list_lock);
280 			write_locked_buffers(array, count);
281 			return -EAGAIN;
282 		}
283 		unlock_buffer(bh);
284 		__refile_buffer(bh);
285 	}
286 	spin_unlock(&lru_list_lock);
287 
288 	if (count)
289 		write_locked_buffers(array, count);
290 	return 0;
291 }
292 
293 /*
294  * Write out all buffers on the dirty list.
295  */
296 static void write_unlocked_buffers(kdev_t dev)
297 {
298 	do
299 		spin_lock(&lru_list_lock);
300 	while (write_some_buffers(dev));
301 }
302 
303 /*
304  * Wait for a buffer on the proper list.
305  *
306  * This must be called with the LRU lock held, and
307  * will return with it released.
308  */
309 static int wait_for_buffers(kdev_t dev, int index, int refile)
310 {
311 	struct buffer_head * next;
312 	int nr;
313 
314 	next = lru_list[index];
315 	nr = nr_buffers_type[index];
316 	while (next && --nr >= 0) {
317 		struct buffer_head *bh = next;
318 		next = bh->b_next_free;
319 
320 		if (!buffer_locked(bh)) {
321 			if (refile)
322 				__refile_buffer(bh);
323 			continue;
324 		}
325 		if (dev != NODEV && bh->b_dev != dev)
326 			continue;
327 
328 		get_bh(bh);
329 		spin_unlock(&lru_list_lock);
330 		wait_on_buffer (bh);
331 		put_bh(bh);
332 		return -EAGAIN;
333 	}
334 	spin_unlock(&lru_list_lock);
335 	return 0;
336 }
337 
338 static int wait_for_locked_buffers(kdev_t dev, int index, int refile)
339 {
340 	do {
341 		spin_lock(&lru_list_lock);
342 	} while (wait_for_buffers(dev, index, refile));
343 	return 0;
344 }
345 
346 /* Call sync_buffers with wait!=0 to ensure that the call does not
347  * return until all buffer writes have completed.  Sync() may return
348  * before the writes have finished; fsync() may not.
349  */
350 
351 /* Godamity-damn.  Some buffers (bitmaps for filesystems)
352  * spontaneously dirty themselves without ever brelse being called.
353  * We will ultimately want to put these in a separate list, but for
354  * now we search all of the lists for dirty buffers.
355  */
356 int sync_buffers(kdev_t dev, int wait)
357 {
358 	int err = 0;
359 
360 	/* One pass for no-wait, three for wait:
361 	 * 0) write out all dirty, unlocked buffers;
362 	 * 1) wait for all dirty locked buffers;
363 	 * 2) write out all dirty, unlocked buffers;
364 	 * 3) wait for completion by waiting for all buffers to unlock.
365 	 */
366 	write_unlocked_buffers(dev);
367 	if (wait) {
368 		err = wait_for_locked_buffers(dev, BUF_DIRTY, 0);
369 		write_unlocked_buffers(dev);
370 		err |= wait_for_locked_buffers(dev, BUF_LOCKED, 1);
371 	}
372 	return err;
373 }
374 EXPORT_SYMBOL(sync_buffers);
375 
376 int fsync_super(struct super_block *sb)
377 {
378 	kdev_t dev = sb->s_dev;
379 	sync_buffers(dev, 0);
380 
381 	lock_kernel();
382 	sync_inodes_sb(sb);
383 	DQUOT_SYNC_SB(sb);
384 	lock_super(sb);
385 	if (sb->s_dirt && sb->s_op && sb->s_op->write_super)
386 		sb->s_op->write_super(sb);
387 	unlock_super(sb);
388 	if (sb->s_op && sb->s_op->sync_fs)
389 		sb->s_op->sync_fs(sb);
390 	unlock_kernel();
391 
392 	return sync_buffers(dev, 1);
393 }
394 
395 int fsync_no_super(kdev_t dev)
396 {
397 	sync_buffers(dev, 0);
398 	return sync_buffers(dev, 1);
399 }
400 
401 int fsync_dev(kdev_t dev)
402 {
403 	sync_buffers(dev, 0);
404 
405 	lock_kernel();
406 	sync_inodes(dev);
407 	DQUOT_SYNC_DEV(dev);
408 	sync_supers(dev, 1);
409 	unlock_kernel();
410 
411 	return sync_buffers(dev, 1);
412 }
413 
414 /*
415  * There's no real reason to pretend we should
416  * ever do anything differently
417  */
418 void sync_dev(kdev_t dev)
419 {
420 	fsync_dev(dev);
421 }
422 
423 asmlinkage long sys_sync(void)
424 {
425 	fsync_dev(0);
426 	return 0;
427 }
428 
429 /*
430  *	filp may be NULL if called via the msync of a vma.
431  */
432 
433 int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
434 {
435 	struct inode * inode = dentry->d_inode;
436 	struct super_block * sb;
437 	kdev_t dev;
438 	int ret;
439 
440 	lock_kernel();
441 	/* sync the inode to buffers */
442 	write_inode_now(inode, 0);
443 
444 	/* sync the superblock to buffers */
445 	sb = inode->i_sb;
446 	lock_super(sb);
447 	if (sb->s_op && sb->s_op->write_super)
448 		sb->s_op->write_super(sb);
449 	unlock_super(sb);
450 
451 	/* .. finally sync the buffers to disk */
452 	dev = inode->i_dev;
453 	ret = sync_buffers(dev, 1);
454 	unlock_kernel();
455 	return ret;
456 }
457 
458 asmlinkage long sys_fsync(unsigned int fd)
459 {
460 	struct file * file;
461 	struct dentry * dentry;
462 	struct inode * inode;
463 	int ret, err;
464 
465 	ret = -EBADF;
466 	file = fget(fd);
467 	if (!file)
468 		goto out;
469 
470 	dentry = file->f_dentry;
471 	inode = dentry->d_inode;
472 
473 	ret = -EINVAL;
474 	if (!file->f_op || !file->f_op->fsync) {
475 		/* Why?  We can still call filemap_fdatasync */
476 		goto out_putf;
477 	}
478 
479 	/* We need to protect against concurrent writers.. */
480 	down(&inode->i_sem);
481 	ret = filemap_fdatasync(inode->i_mapping);
482 	err = file->f_op->fsync(file, dentry, 0);
483 	if (err && !ret)
484 		ret = err;
485 	err = filemap_fdatawait(inode->i_mapping);
486 	if (err && !ret)
487 		ret = err;
488 	up(&inode->i_sem);
489 
490 out_putf:
491 	fput(file);
492 out:
493 	return ret;
494 }
495 
496 int do_fdatasync(struct file *file)
497 {
498 	int ret, err;
499 	struct dentry *dentry;
500 	struct inode *inode;
501 
502 	if (unlikely(!file->f_op || !file->f_op->fsync))
503 		return -EINVAL;
504 
505 	dentry = file->f_dentry;
506 	inode = dentry->d_inode;
507 
508 	ret = filemap_fdatasync(inode->i_mapping);
509 	err = file->f_op->fsync(file, dentry, 1);
510 	if (err && !ret)
511 		ret = err;
512 	err = filemap_fdatawait(inode->i_mapping);
513 	if (err && !ret)
514 		ret = err;
515 	return ret;
516 }
517 
518 asmlinkage long sys_fdatasync(unsigned int fd)
519 {
520 	struct file * file;
521 	struct inode *inode;
522 	int ret;
523 
524 	ret = -EBADF;
525 	file = fget(fd);
526 	if (!file)
527 		goto out;
528 
529 	inode = file->f_dentry->d_inode;
530 	down(&inode->i_sem);
531 	ret = do_fdatasync(file);
532 	up(&inode->i_sem);
533 
534 	fput(file);
535 out:
536 	return ret;
537 }
538 
539 /* After several hours of tedious analysis, the following hash
540  * function won.  Do not mess with it... -DaveM
541  */
542 #define _hashfn(dev,block)	\
543 	((((dev)<<(bh_hash_shift - 6)) ^ ((dev)<<(bh_hash_shift - 9))) ^ \
544 	 (((block)<<(bh_hash_shift - 6)) ^ ((block) >> 13) ^ \
545 	  ((block) << (bh_hash_shift - 12))))
546 #define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)]
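/* Illustrative note (not in the original source): assuming a hypothetical
 * bh_hash_shift of 12, _hashfn(dev,block) reduces to
 *	(dev<<6) ^ (dev<<3) ^ (block<<6) ^ (block>>13) ^ block
 * and the "& bh_hash_mask" in hash() keeps just enough low bits to index the
 * table, so nearby block numbers on the same device are spread over
 * different buckets.
 */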
547 
548 static inline void __insert_into_hash_list(struct buffer_head *bh)
549 {
550 	struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr);
551 	struct buffer_head *next = *head;
552 
553 	*head = bh;
554 	bh->b_pprev = head;
555 	bh->b_next = next;
556 	if (next != NULL)
557 		next->b_pprev = &bh->b_next;
558 }
559 
560 static __inline__ void __hash_unlink(struct buffer_head *bh)
561 {
562 	struct buffer_head **pprev = bh->b_pprev;
563 	if (pprev) {
564 		struct buffer_head *next = bh->b_next;
565 		if (next)
566 			next->b_pprev = pprev;
567 		*pprev = next;
568 		bh->b_pprev = NULL;
569 	}
570 }
571 
572 static void __insert_into_lru_list(struct buffer_head * bh, int blist)
573 {
574 	struct buffer_head **bhp = &lru_list[blist];
575 
576 	if (bh->b_prev_free || bh->b_next_free) BUG();
577 
578 	if(!*bhp) {
579 		*bhp = bh;
580 		bh->b_prev_free = bh;
581 	}
582 	bh->b_next_free = *bhp;
583 	bh->b_prev_free = (*bhp)->b_prev_free;
584 	(*bhp)->b_prev_free->b_next_free = bh;
585 	(*bhp)->b_prev_free = bh;
586 	nr_buffers_type[blist]++;
587 	size_buffers_type[blist] += bh->b_size >> 9;
588 }
589 
590 static void __remove_from_lru_list(struct buffer_head * bh)
591 {
592 	struct buffer_head *next = bh->b_next_free;
593 	if (next) {
594 		struct buffer_head *prev = bh->b_prev_free;
595 		int blist = bh->b_list;
596 
597 		prev->b_next_free = next;
598 		next->b_prev_free = prev;
599 		if (lru_list[blist] == bh) {
600 			if (next == bh)
601 				next = NULL;
602 			lru_list[blist] = next;
603 		}
604 		bh->b_next_free = NULL;
605 		bh->b_prev_free = NULL;
606 		nr_buffers_type[blist]--;
607 		size_buffers_type[blist] -= bh->b_size >> 9;
608 	}
609 }
610 
611 /* must be called with both the hash_table_lock and the lru_list_lock
612    held */
613 static void __remove_from_queues(struct buffer_head *bh)
614 {
615 	__hash_unlink(bh);
616 	__remove_from_lru_list(bh);
617 }
618 
619 static void remove_from_queues(struct buffer_head *bh)
620 {
621 	spin_lock(&lru_list_lock);
622 	write_lock(&hash_table_lock);
623 	__remove_from_queues(bh);
624 	write_unlock(&hash_table_lock);
625 	spin_unlock(&lru_list_lock);
626 }
627 
628 struct buffer_head * get_hash_table(kdev_t dev, int block, int size)
629 {
630 	struct buffer_head *bh, **p = &hash(dev, block);
631 
632 	read_lock(&hash_table_lock);
633 
634 	for (;;) {
635 		bh = *p;
636 		if (!bh)
637 			break;
638 		p = &bh->b_next;
639 		if (bh->b_blocknr != block)
640 			continue;
641 		if (bh->b_size != size)
642 			continue;
643 		if (bh->b_dev != dev)
644 			continue;
645 		get_bh(bh);
646 		break;
647 	}
648 
649 	read_unlock(&hash_table_lock);
650 	return bh;
651 }
652 
653 void fastcall buffer_insert_list(struct buffer_head *bh, struct list_head *list)
654 {
655 	spin_lock(&lru_list_lock);
656 	if (buffer_attached(bh))
657 		list_del(&bh->b_inode_buffers);
658 	set_buffer_attached(bh);
659 	list_add_tail(&bh->b_inode_buffers, list);
660 	spin_unlock(&lru_list_lock);
661 }
662 
663 /*
664  * The caller must have the lru_list lock before calling the
665  * remove_inode_queue functions.
666  */
667 static void __remove_inode_queue(struct buffer_head *bh)
668 {
669 	list_del(&bh->b_inode_buffers);
670 	clear_buffer_attached(bh);
671 }
672 
673 static inline void remove_inode_queue(struct buffer_head *bh)
674 {
675 	if (buffer_attached(bh))
676 		__remove_inode_queue(bh);
677 }
678 
679 int inode_has_buffers(struct inode *inode)
680 {
681 	int ret;
682 
683 	spin_lock(&lru_list_lock);
684 	ret = !list_empty(&inode->i_dirty_buffers) || !list_empty(&inode->i_dirty_data_buffers);
685 	spin_unlock(&lru_list_lock);
686 
687 	return ret;
688 }
689 
690 /* If invalidate_buffers() trashes dirty buffers, it means some kind
691    of fs corruption is going on. Trashing dirty data always implies losing
692    information that was supposed to be just stored on the physical layer
693    by the user.
694 
695    Thus invalidate_buffers in general usage is not allowed to trash
696    dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
697    be preserved.  These buffers are simply skipped.
698 
699    We also skip buffers which are still in use.  For example this can
700    happen if a userspace program is reading the block device.
701 
702    NOTE: if the user removed a removable-media disk while there was still
703    dirty data not synced to disk (due to a bug in the device driver or to
704    an error of the user), then by not destroying the dirty buffers we could
705    also corrupt the next media inserted; thus a parameter is
706    necessary to handle this case in the safest way possible (trying
707    not to corrupt the newly inserted disk with data belonging to
708    the old, now corrupted, disk). Also for the ramdisk the natural thing
709    to do in order to release the ramdisk memory is to destroy dirty buffers.
710 
711    These are two special cases. Normal usage implies that the device driver
712    issues a sync on the device (without waiting for I/O completion) and
713    then makes an invalidate_buffers call that doesn't trash dirty buffers.
714 
715    For handling cache coherency with the blkdev pagecache the 'update' case
716    has been introduced. It is needed to re-read from disk any pinned
717    buffer. NOTE: re-reading from disk is destructive so we can do it only
718    when we assume nobody is changing the buffercache under our I/O and when
719    we think the disk contains more recent information than the buffercache.
720    The update == 1 pass marks the buffers we need to update, the update == 2
721    pass does the actual I/O. */
722 void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers)
723 {
724 	int i, nlist, slept;
725 	struct buffer_head * bh, * bh_next;
726 	kdev_t dev = to_kdev_t(bdev->bd_dev);	/* will become bdev */
727 
728  retry:
729 	slept = 0;
730 	spin_lock(&lru_list_lock);
731 	for(nlist = 0; nlist < NR_LIST; nlist++) {
732 		bh = lru_list[nlist];
733 		if (!bh)
734 			continue;
735 		for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) {
736 			bh_next = bh->b_next_free;
737 
738 			/* Another device? */
739 			if (bh->b_dev != dev)
740 				continue;
741 			/* Not hashed? */
742 			if (!bh->b_pprev)
743 				continue;
744 			if (buffer_locked(bh)) {
745 				get_bh(bh);
746 				spin_unlock(&lru_list_lock);
747 				wait_on_buffer(bh);
748 				slept = 1;
749 				spin_lock(&lru_list_lock);
750 				put_bh(bh);
751 			}
752 
753 			write_lock(&hash_table_lock);
754 			/* All buffers in the lru lists are mapped */
755 			if (!buffer_mapped(bh))
756 				BUG();
757 			if (buffer_dirty(bh) && destroy_dirty_buffers)
758 				printk("invalidate: dirty buffer\n");
759 			if (!atomic_read(&bh->b_count)) {
760 				if (destroy_dirty_buffers || !buffer_dirty(bh)) {
761 					remove_inode_queue(bh);
762 				}
763 			} else if (!bdev->bd_openers)
764 				printk("invalidate: busy buffer\n");
765 
766 			write_unlock(&hash_table_lock);
767 			if (slept)
768 				goto out;
769 		}
770 	}
771 out:
772 	spin_unlock(&lru_list_lock);
773 	if (slept)
774 		goto retry;
775 
776 	/* Get rid of the page cache */
777 	invalidate_inode_pages(bdev->bd_inode);
778 }
779 
780 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
781 {
782 	struct block_device *bdev = bdget(dev);
783 	if (bdev) {
784 		invalidate_bdev(bdev, destroy_dirty_buffers);
785 		bdput(bdev);
786 	}
787 }
788 
789 static void free_more_memory(void)
790 {
791 	balance_dirty();
792 	wakeup_bdflush();
793 	try_to_free_pages(GFP_NOIO);
794 	run_task_queue(&tq_disk);
795 	yield();
796 }
797 
798 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
799 {
800 	bh->b_list = BUF_CLEAN;
801 	bh->b_end_io = handler;
802 	bh->b_private = private;
803 }
804 
805 void end_buffer_io_async(struct buffer_head * bh, int uptodate)
806 {
807 	static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
808 	unsigned long flags;
809 	struct buffer_head *tmp;
810 	struct page *page;
811 	int fullup = 1;
812 
813 	mark_buffer_uptodate(bh, uptodate);
814 
815 	/* This is a temporary buffer used for page I/O. */
816 	page = bh->b_page;
817 
818 	if (!uptodate)
819 		SetPageError(page);
820 
821 	/*
822 	 * Be _very_ careful from here on. Bad things can happen if
823 	 * two buffer heads end IO at almost the same time and both
824 	 * decide that the page is now completely done.
825 	 *
826 	 * Async buffer_heads are here only as labels for IO, and get
827 	 * thrown away once the IO for this page is complete.  IO is
828 	 * deemed complete once all buffers have been visited
829 	 * (b_count==0) and are now unlocked. We must make sure that
830 	 * only the _last_ buffer that decrements its count is the one
831 	 * that unlocks the page..
832 	 */
833 	spin_lock_irqsave(&page_uptodate_lock, flags);
834 	mark_buffer_async(bh, 0);
835 	unlock_buffer(bh);
836 	tmp = bh->b_this_page;
837 	while (tmp != bh) {
838 		if (buffer_locked(tmp)) {
839 			if (buffer_async(tmp))
840 				goto still_busy;
841 		} else if (!buffer_uptodate(tmp))
842 			fullup = 0;
843 		tmp = tmp->b_this_page;
844 	}
845 
846 	/* OK, the async IO on this page is complete. */
847 	spin_unlock_irqrestore(&page_uptodate_lock, flags);
848 
849 	/*
850 	 * If none of the buffers had errors and all were uptodate
851 	 * then we can set the page uptodate:
852 	 */
853 	if (fullup && !PageError(page))
854 		SetPageUptodate(page);
855 
856 	UnlockPage(page);
857 
858 	return;
859 
860 still_busy:
861 	spin_unlock_irqrestore(&page_uptodate_lock, flags);
862 	return;
863 }
864 
865 inline void set_buffer_async_io(struct buffer_head *bh)
866 {
867 	bh->b_end_io = end_buffer_io_async;
868 	mark_buffer_async(bh, 1);
869 }
870 
871 /*
872  * Synchronise all the inode's dirty buffers to the disk.
873  *
874  * We have conflicting pressures: we want to make sure that all
875  * initially dirty buffers get waited on, but that any subsequently
876  * dirtied buffers don't.  After all, we don't want fsync to last
877  * forever if somebody is actively writing to the file.
878  *
879  * Do this in two main stages: first we copy dirty buffers to a
880  * temporary inode list, queueing the writes as we go.  Then we clean
881  * up, waiting for those writes to complete.
882  *
883  * During this second stage, any subsequent updates to the file may end
884  * up refiling the buffer on the original inode's dirty list again, so
885  * there is a chance we will end up with a buffer queued for write but
886  * not yet completed on that list.  So, as a final cleanup we go through
887  * the osync code to catch these locked, dirty buffers without requeuing
888  * any newly dirty buffers for write.
889  */
890 int fsync_buffers_list(struct list_head *list)
891 {
892 	struct buffer_head *bh;
893 	struct list_head tmp;
894 	int err = 0, err2;
895 
896 	INIT_LIST_HEAD(&tmp);
897 
898 	spin_lock(&lru_list_lock);
899 
900 	while (!list_empty(list)) {
901 		bh = BH_ENTRY(list->next);
902 		list_del(&bh->b_inode_buffers);
903 		if (!buffer_dirty(bh) && !buffer_locked(bh))
904 			clear_buffer_attached(bh);
905 		else {
906 			set_buffer_attached(bh);
907 			list_add(&bh->b_inode_buffers, &tmp);
908 			if (buffer_dirty(bh)) {
909 				get_bh(bh);
910 				spin_unlock(&lru_list_lock);
911 			/*
912 			 * Wait for I/O completion before submitting
913 			 * the buffer, to be sure the write will
914 			 * be effective on the latest data in
915 			 * the buffer. (otherwise - if there's old
916 			 * I/O in flight - write_buffer would become
917 			 * a noop)
918 			 */
919 				wait_on_buffer(bh);
920 				write_buffer(bh);
921 				brelse(bh);
922 				spin_lock(&lru_list_lock);
923 			}
924 		}
925 	}
926 
927 	while (!list_empty(&tmp)) {
928 		bh = BH_ENTRY(tmp.prev);
929 		remove_inode_queue(bh);
930 		get_bh(bh);
931 		spin_unlock(&lru_list_lock);
932 		wait_on_buffer(bh);
933 		if (!buffer_uptodate(bh))
934 			err = -EIO;
935 		brelse(bh);
936 		spin_lock(&lru_list_lock);
937 	}
938 
939 	spin_unlock(&lru_list_lock);
940 	err2 = osync_buffers_list(list);
941 
942 	if (err)
943 		return err;
944 	else
945 		return err2;
946 }
947 
948 /*
949  * osync is designed to support O_SYNC io.  It waits synchronously for
950  * all already-submitted IO to complete, but does not queue any new
951  * writes to the disk.
952  *
953  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
954  * you dirty the buffers, and then use osync_buffers_list to wait for
955  * completion.  Any other dirty buffers which are not yet queued for
956  * write will not be flushed to disk by the osync.
957  */
958 static int osync_buffers_list(struct list_head *list)
959 {
960 	struct buffer_head *bh;
961 	struct list_head *p;
962 	int err = 0;
963 
964 	spin_lock(&lru_list_lock);
965 
966  repeat:
967 	list_for_each_prev(p, list) {
968 		bh = BH_ENTRY(p);
969 		if (buffer_locked(bh)) {
970 			get_bh(bh);
971 			spin_unlock(&lru_list_lock);
972 			wait_on_buffer(bh);
973 			if (!buffer_uptodate(bh))
974 				err = -EIO;
975 			brelse(bh);
976 			spin_lock(&lru_list_lock);
977 			goto repeat;
978 		}
979 	}
980 
981 	spin_unlock(&lru_list_lock);
982 	return err;
983 }
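/*
 * Illustrative sketch (not part of the original code) of the O_SYNC pattern
 * described in the comment above osync_buffers_list(): the writer queues its
 * own buffer writes and then waits only for those.  The list is typically
 * the inode's i_dirty_buffers list:
 *
 *	mark_buffer_dirty(bh);
 *	ll_rw_block(WRITE, 1, &bh);
 *	...
 *	err = osync_buffers_list(&inode->i_dirty_buffers);
 *
 * Dirty buffers that were never queued for write are left alone.
 */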
984 
985 /*
986  * Invalidate any and all dirty buffers on a given inode.  We are
987  * probably unmounting the fs, but that doesn't mean we have already
988  * done a sync().  Just drop the buffers from the inode list.
989  */
990 void invalidate_inode_buffers(struct inode *inode)
991 {
992 	struct list_head * entry;
993 
994 	spin_lock(&lru_list_lock);
995 	while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers)
996 		remove_inode_queue(BH_ENTRY(entry));
997 	while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers)
998 		remove_inode_queue(BH_ENTRY(entry));
999 	spin_unlock(&lru_list_lock);
1000 }
1001 
1002 
1003 /*
1004  * Ok, this is getblk, and it isn't very clear, again to hinder
1005  * race-conditions. Most of the code is seldom used, (ie repeating),
1006  * so it should be much more efficient than it looks.
1007  *
1008  * The algorithm is changed: hopefully better, and an elusive bug removed.
1009  *
1010  * 14.02.92: changed it to sync dirty buffers a bit: better performance
1011  * when the filesystem starts to get full of dirty blocks (I hope).
1012  */
1013 struct buffer_head * getblk(kdev_t dev, int block, int size)
1014 {
1015 	for (;;) {
1016 		struct buffer_head * bh;
1017 
1018 		bh = get_hash_table(dev, block, size);
1019 		if (bh) {
1020 			touch_buffer(bh);
1021 			return bh;
1022 		}
1023 
1024 		if (!grow_buffers(dev, block, size))
1025 			free_more_memory();
1026 	}
1027 }
1028 
1029 /* -1 -> no need to flush
1030     0 -> async flush
1031     1 -> sync flush (wait for I/O completion) */
1032 static int balance_dirty_state(void)
1033 {
1034 	unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
1035 
1036 	dirty = size_buffers_type[BUF_DIRTY] >> (PAGE_SHIFT - 9);
1037 	tot = nr_free_buffer_pages();
1038 
1039 	dirty *= 100;
1040 	soft_dirty_limit = tot * bdf_prm.b_un.nfract;
1041 	hard_dirty_limit = tot * bdf_prm.b_un.nfract_sync;
1042 
1043 	/* First, check for the "real" dirty limit. */
1044 	if (dirty > soft_dirty_limit) {
1045 		if (dirty > hard_dirty_limit && !(current->flags & PF_NOIO))
1046 			return 1;
1047 		return 0;
1048 	}
1049 
1050 	return -1;
1051 }
1052 
1053 static int bdflush_stop(void)
1054 {
1055 	unsigned long dirty, tot, dirty_limit;
1056 
1057 	dirty = size_buffers_type[BUF_DIRTY] >> (PAGE_SHIFT - 9);
1058 	tot = nr_free_buffer_pages();
1059 
1060 	dirty *= 100;
1061 	dirty_limit = tot * bdf_prm.b_un.nfract_stop_bdflush;
1062 
1063 	if (!laptop_mode && dirty > dirty_limit)
1064 		return 0;
1065 	return 1;
1066 }
1067 
1068 /*
1069  * if a new dirty buffer is created we need to balance bdflush.
1070  *
1071  * in the future we might want to make bdflush aware of different
1072  * pressures on different devices - thus the (currently unused)
1073  * 'dev' parameter.
1074  */
1075 void balance_dirty(void)
1076 {
1077 	int state = balance_dirty_state();
1078 
1079 	if (state < 0)
1080 		return;
1081 
1082 	wakeup_bdflush();
1083 
1084 	/*
1085 	 * And if we're _really_ out of balance, wait for
1086 	 * some of the dirty/locked buffers ourselves.
1087 	 * This will throttle heavy writers.
1088 	 */
1089 	if (state > 0) {
1090 		spin_lock(&lru_list_lock);
1091 		write_some_buffers(NODEV);
1092 	}
1093 }
1094 EXPORT_SYMBOL(balance_dirty);
1095 
1096 inline void fastcall __mark_dirty(struct buffer_head *bh)
1097 {
1098 	bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1099 	refile_buffer(bh);
1100 }
1101 
1102 /* atomic version, the user must call balance_dirty() by hand
1103    as soon as it becomes possible to block */
1104 void fastcall __mark_buffer_dirty(struct buffer_head *bh)
1105 {
1106 	if (!atomic_set_buffer_dirty(bh))
1107 		__mark_dirty(bh);
1108 }
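/*
 * Illustrative sketch (not in the original source): a caller that cannot
 * block yet, e.g. because it holds a spinlock, dirties the buffer with the
 * atomic variant and rebalances later (some_fs_lock is hypothetical):
 *
 *	spin_lock(&some_fs_lock);
 *	__mark_buffer_dirty(bh);
 *	spin_unlock(&some_fs_lock);
 *	balance_dirty();
 *
 * mark_buffer_dirty() below is the convenience form that does both steps
 * when blocking is already allowed.
 */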
1109 
1110 void fastcall mark_buffer_dirty(struct buffer_head *bh)
1111 {
1112 	if (!atomic_set_buffer_dirty(bh)) {
1113 		if (block_dump)
1114 			printk("%s: dirtied buffer\n", current->comm);
1115 		__mark_dirty(bh);
1116 		balance_dirty();
1117 	}
1118 }
1119 
1120 void set_buffer_flushtime(struct buffer_head *bh)
1121 {
1122 	bh->b_flushtime = jiffies + bdf_prm.b_un.age_buffer;
1123 }
1124 EXPORT_SYMBOL(set_buffer_flushtime);
1125 
1126 int get_buffer_flushtime(void)
1127 {
1128 	return bdf_prm.b_un.interval;
1129 }
1130 EXPORT_SYMBOL(get_buffer_flushtime);
1131 
1132 /*
1133  * A buffer may need to be moved from one buffer list to another
1134  * (e.g. in case it is not shared any more). Handle this.
1135  */
1136 static void __refile_buffer(struct buffer_head *bh)
1137 {
1138 	int dispose = BUF_CLEAN;
1139 	if (buffer_locked(bh))
1140 		dispose = BUF_LOCKED;
1141 	if (buffer_dirty(bh))
1142 		dispose = BUF_DIRTY;
1143 	if (dispose != bh->b_list) {
1144 		__remove_from_lru_list(bh);
1145 		bh->b_list = dispose;
1146 		if (dispose == BUF_CLEAN)
1147 			remove_inode_queue(bh);
1148 		__insert_into_lru_list(bh, dispose);
1149 	}
1150 }
1151 
1152 void refile_buffer(struct buffer_head *bh)
1153 {
1154 	spin_lock(&lru_list_lock);
1155 	__refile_buffer(bh);
1156 	spin_unlock(&lru_list_lock);
1157 }
1158 
1159 /*
1160  * Release a buffer head
1161  */
1162 void __brelse(struct buffer_head * buf)
1163 {
1164 	if (atomic_read(&buf->b_count)) {
1165 		put_bh(buf);
1166 		return;
1167 	}
1168 	printk(KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1169 }
1170 
1171 /*
1172  * bforget() is like brelse(), except it discards any
1173  * potentially dirty data.
1174  */
1175 void __bforget(struct buffer_head * buf)
1176 {
1177 	mark_buffer_clean(buf);
1178 	__brelse(buf);
1179 }
1180 
1181 /**
1182  *	bread() - reads a specified block and returns the bh
1183  *	@block: number of block
1184  *	@size: size (in bytes) to read
1185  *
1186  *	Reads a specified block, and returns the buffer head that
1187  *	contains it. It returns NULL if the block was unreadable.
1188  */
1189 struct buffer_head * bread(kdev_t dev, int block, int size)
1190 {
1191 	struct buffer_head * bh;
1192 
1193 	bh = getblk(dev, block, size);
1194 	if (buffer_uptodate(bh))
1195 		return bh;
1196 	set_bit(BH_Sync, &bh->b_state);
1197 	ll_rw_block(READ, 1, &bh);
1198 	wait_on_buffer(bh);
1199 	if (buffer_uptodate(bh))
1200 		return bh;
1201 	brelse(bh);
1202 	return NULL;
1203 }
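/*
 * Illustrative usage sketch (not part of the original code); the device,
 * block number and block size below are made up for the example:
 *
 *	struct buffer_head *bh = bread(dev, 42, 1024);
 *	if (!bh)
 *		return -EIO;
 *	... read the data through bh->b_data ...
 *	brelse(bh);
 */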
1204 
1205 /*
1206  * Note: the caller should wake up the buffer_wait list if needed.
1207  */
1208 static void __put_unused_buffer_head(struct buffer_head * bh)
1209 {
1210 	if (unlikely(buffer_attached(bh)))
1211 		BUG();
1212 	if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) {
1213 		kmem_cache_free(bh_cachep, bh);
1214 	} else {
1215 		bh->b_dev = B_FREE;
1216 		bh->b_blocknr = -1;
1217 		bh->b_this_page = NULL;
1218 
1219 		nr_unused_buffer_heads++;
1220 		bh->b_next_free = unused_list;
1221 		unused_list = bh;
1222 	}
1223 }
1224 
1225 void put_unused_buffer_head(struct buffer_head *bh)
1226 {
1227 	spin_lock(&unused_list_lock);
1228 	__put_unused_buffer_head(bh);
1229 	spin_unlock(&unused_list_lock);
1230 }
1231 EXPORT_SYMBOL(put_unused_buffer_head);
1232 
1233 /*
1234  * Reserve NR_RESERVED buffer heads for async IO requests to avoid
1235  * no-buffer-head deadlock.  Return NULL on failure; waiting for
1236  * buffer heads is now handled in create_buffers().
1237  */
1238 struct buffer_head * get_unused_buffer_head(int async)
1239 {
1240 	struct buffer_head * bh;
1241 
1242 	spin_lock(&unused_list_lock);
1243 	if (nr_unused_buffer_heads > NR_RESERVED) {
1244 		bh = unused_list;
1245 		unused_list = bh->b_next_free;
1246 		nr_unused_buffer_heads--;
1247 		spin_unlock(&unused_list_lock);
1248 		return bh;
1249 	}
1250 	spin_unlock(&unused_list_lock);
1251 
1252 	/* This is critical.  We can't call out to the FS
1253 	 * to get more buffer heads, because the FS may need
1254 	 * more buffer-heads itself.  Thus SLAB_NOFS.
1255 	 */
1256 	if((bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS)) != NULL) {
1257 		bh->b_blocknr = -1;
1258 		bh->b_this_page = NULL;
1259 		return bh;
1260 	}
1261 
1262 	/*
1263 	 * If we need an async buffer, use the reserved buffer heads.
1264 	 * Non-PF_MEMALLOC tasks can just loop in create_buffers().
1265 	 */
1266 	if (async && (current->flags & PF_MEMALLOC)) {
1267 		spin_lock(&unused_list_lock);
1268 		if (unused_list) {
1269 			bh = unused_list;
1270 			unused_list = bh->b_next_free;
1271 			nr_unused_buffer_heads--;
1272 			spin_unlock(&unused_list_lock);
1273 			return bh;
1274 		}
1275 		spin_unlock(&unused_list_lock);
1276 	}
1277 
1278 	return NULL;
1279 }
1280 EXPORT_SYMBOL(get_unused_buffer_head);
1281 
1282 void set_bh_page (struct buffer_head *bh, struct page *page, unsigned long offset)
1283 {
1284 	if (offset >= PAGE_SIZE)
1285 		BUG();
1286 
1287 	if (PageHighMem(page)) {
1288 		bh->b_data = (char *)offset;
1289 	} else {
1290 		bh->b_data = page_address(page) + offset;
1291 	}
1292 	bh->b_page = page;
1293 }
1294 EXPORT_SYMBOL(set_bh_page);
1295 
1296 /*
1297  * Create the appropriate buffers when given a page for data area and
1298  * the size of each buffer.. Use the bh->b_this_page linked list to
1299  * follow the buffers created.  Return NULL if unable to create more
1300  * buffers.
1301  * The async flag is used to differentiate async IO (paging, swapping)
1302  * from ordinary buffer allocations, and only async requests are allowed
1303  * to sleep waiting for buffer heads.
1304  */
1305 static struct buffer_head * create_buffers(struct page * page, unsigned long size, int async)
1306 {
1307 	struct buffer_head *bh, *head;
1308 	long offset;
1309 
1310 try_again:
1311 	head = NULL;
1312 	offset = PAGE_SIZE;
1313 	while ((offset -= size) >= 0) {
1314 		bh = get_unused_buffer_head(async);
1315 		if (!bh)
1316 			goto no_grow;
1317 
1318 		bh->b_dev = NODEV;
1319 		bh->b_this_page = head;
1320 		head = bh;
1321 
1322 		bh->b_state = 0;
1323 		bh->b_next_free = NULL;
1324 		bh->b_pprev = NULL;
1325 		atomic_set(&bh->b_count, 0);
1326 		bh->b_size = size;
1327 
1328 		set_bh_page(bh, page, offset);
1329 
1330 		bh->b_list = BUF_CLEAN;
1331 		bh->b_end_io = NULL;
1332 	}
1333 	return head;
1334 /*
1335  * In case anything failed, we just free everything we got.
1336  */
1337 no_grow:
1338 	if (head) {
1339 		spin_lock(&unused_list_lock);
1340 		do {
1341 			bh = head;
1342 			head = head->b_this_page;
1343 			__put_unused_buffer_head(bh);
1344 		} while (head);
1345 		spin_unlock(&unused_list_lock);
1346 
1347 		/* Wake up any waiters ... */
1348 		wake_up(&buffer_wait);
1349 	}
1350 
1351 	/*
1352 	 * Return failure for non-async IO requests.  Async IO requests
1353 	 * are not allowed to fail, so we have to wait until buffer heads
1354 	 * become available.  But we don't want tasks sleeping with
1355 	 * partially complete buffers, so all were released above.
1356 	 */
1357 	if (!async)
1358 		return NULL;
1359 
1360 	/* We're _really_ low on memory. Now we just
1361 	 * wait for old buffer heads to become free due to
1362 	 * finishing IO.  Since this is an async request and
1363 	 * the reserve list is empty, we're sure there are
1364 	 * async buffer heads in use.
1365 	 */
1366 	run_task_queue(&tq_disk);
1367 
1368 	free_more_memory();
1369 	goto try_again;
1370 }
1371 
1372 /*
1373  * Called when truncating a buffer on a page completely.
1374  */
1375 static void discard_buffer(struct buffer_head * bh)
1376 {
1377 	if (buffer_mapped(bh) || buffer_delay(bh)) {
1378 		mark_buffer_clean(bh);
1379 		lock_buffer(bh);
1380 		clear_bit(BH_Uptodate, &bh->b_state);
1381 		clear_bit(BH_Mapped, &bh->b_state);
1382 		clear_bit(BH_Req, &bh->b_state);
1383 		clear_bit(BH_New, &bh->b_state);
1384 		clear_bit(BH_Delay, &bh->b_state);
1385 		remove_from_queues(bh);
1386 		unlock_buffer(bh);
1387 	}
1388 }
1389 
1390 /**
1391  * try_to_release_page - release old fs-specific metadata on a page
1392  *
1393  */
1394 
1395 int try_to_release_page(struct page * page, int gfp_mask)
1396 {
1397 	if (!PageLocked(page))
1398 		BUG();
1399 
1400 	if (!page->mapping)
1401 		goto try_to_free;
1402 	if (!page->mapping->a_ops->releasepage)
1403 		goto try_to_free;
1404 	if (page->mapping->a_ops->releasepage(page, gfp_mask))
1405 		goto try_to_free;
1406 	/*
1407 	 * We couldn't release buffer metadata; don't even bother trying
1408 	 * to release buffers.
1409 	 */
1410 	return 0;
1411 try_to_free:
1412 	return try_to_free_buffers(page, gfp_mask);
1413 }
1414 
1415 /*
1416  * We don't have to release all buffers here, but
1417  * we have to be sure that no dirty buffer is left
1418  * and no IO is going on (no buffer is locked), because
1419  * we have truncated the file and are going to free the
1420  * blocks on-disk..
1421  */
1422 int discard_bh_page(struct page *page, unsigned long offset, int drop_pagecache)
1423 {
1424 	struct buffer_head *head, *bh, *next;
1425 	unsigned int curr_off = 0;
1426 
1427 	if (!PageLocked(page))
1428 		BUG();
1429 	if (!page->buffers)
1430 		return 1;
1431 
1432 	head = page->buffers;
1433 	bh = head;
1434 	do {
1435 		unsigned int next_off = curr_off + bh->b_size;
1436 		next = bh->b_this_page;
1437 
1438 		/*
1439 		 * is this block fully flushed?
1440 		 */
1441 		if (offset <= curr_off)
1442 			discard_buffer(bh);
1443 		curr_off = next_off;
1444 		bh = next;
1445 	} while (bh != head);
1446 
1447 	/*
1448 	 * subtle. We release buffer-heads only if this is
1449 	 * the 'final' flushpage. We have invalidated the get_block
1450 	 * cached value unconditionally, so real IO is not
1451 	 * possible anymore.
1452 	 *
1453 	 * If the free doesn't work out, the buffers can be
1454 	 * left around - they just turn into anonymous buffers
1455 	 * instead.
1456 	 */
1457 	if (!offset) {
1458 		if (!try_to_release_page(page, 0))
1459 			return 0;
1460 	}
1461 
1462 	return 1;
1463 }
1464 
1465 void create_empty_buffers(struct page *page, kdev_t dev, unsigned long blocksize)
1466 {
1467 	struct buffer_head *bh, *head, *tail;
1468 
1469 	/* FIXME: create_buffers should fail if there's not enough memory */
1470 	head = create_buffers(page, blocksize, 1);
1471 	if (page->buffers)
1472 		BUG();
1473 
1474 	bh = head;
1475 	do {
1476 		bh->b_dev = dev;
1477 		bh->b_blocknr = 0;
1478 		bh->b_end_io = NULL;
1479 		tail = bh;
1480 		bh = bh->b_this_page;
1481 	} while (bh);
1482 	tail->b_this_page = head;
1483 	page->buffers = head;
1484 	page_cache_get(page);
1485 }
1486 EXPORT_SYMBOL(create_empty_buffers);
1487 
1488 /*
1489  * We are taking a block for data and we don't want any output from any
1490  * buffer-cache aliases starting from the return of this function and
1491  * until the moment when something explicitly marks the buffer
1492  * dirty (hopefully that will not happen until we free that block ;-)
1493  * We don't even need to mark it not-uptodate - nobody can expect
1494  * anything from a newly allocated buffer anyway. We used to use
1495  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1496  * don't want to mark the alias unmapped, for example - it would confuse
1497  * anyone who might pick it with bread() afterwards...
1498  */
1499 
1500 static void unmap_underlying_metadata(struct buffer_head * bh)
1501 {
1502 	struct buffer_head *old_bh;
1503 
1504 	old_bh = get_hash_table(bh->b_dev, bh->b_blocknr, bh->b_size);
1505 	if (old_bh) {
1506 		mark_buffer_clean(old_bh);
1507 		wait_on_buffer(old_bh);
1508 		clear_bit(BH_Req, &old_bh->b_state);
1509 		__brelse(old_bh);
1510 	}
1511 }
1512 
1513 /*
1514  * NOTE! All mapped/uptodate combinations are valid:
1515  *
1516  *	Mapped	Uptodate	Meaning
1517  *
1518  *	No	No		"unknown" - must do get_block()
1519  *	No	Yes		"hole" - zero-filled
1520  *	Yes	No		"allocated" - allocated on disk, not read in
1521  *	Yes	Yes		"valid" - allocated and up-to-date in memory.
1522  *
1523  * "Dirty" is valid only with the last case (mapped+uptodate).
1524  */
1525 
1526 /*
1527  * block_write_full_page() is SMP threaded - the kernel lock is not held.
1528  */
1529 static int __block_write_full_page(struct inode *inode, struct page *page, get_block_t *get_block)
1530 {
1531 	int err, i;
1532 	unsigned long block;
1533 	struct buffer_head *bh, *head;
1534 	int need_unlock;
1535 
1536 	if (!PageLocked(page))
1537 		BUG();
1538 
1539 	if (!page->buffers)
1540 		create_empty_buffers(page, inode->i_dev, 1 << inode->i_blkbits);
1541 	head = page->buffers;
1542 
1543 	block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1544 
1545 	bh = head;
1546 	i = 0;
1547 
1548 	/* Stage 1: make sure we have all the buffers mapped! */
1549 	do {
1550 		/*
1551 		 * If the buffer isn't up-to-date, we can't be sure
1552 		 * that the buffer has been initialized with the proper
1553 		 * block number information etc..
1554 		 *
1555 		 * Leave it to the low-level FS to make all those
1556 		 * decisions (block #0 may actually be a valid block)
1557 		 */
1558 		if (!buffer_mapped(bh)) {
1559 			err = get_block(inode, block, bh, 1);
1560 			if (err)
1561 				goto out;
1562 			if (buffer_new(bh))
1563 				unmap_underlying_metadata(bh);
1564 		}
1565 		bh = bh->b_this_page;
1566 		block++;
1567 	} while (bh != head);
1568 
1569 	/* Stage 2: lock the buffers, mark them clean */
1570 	do {
1571 		lock_buffer(bh);
1572 		set_buffer_async_io(bh);
1573 		set_bit(BH_Uptodate, &bh->b_state);
1574 		clear_bit(BH_Dirty, &bh->b_state);
1575 		bh = bh->b_this_page;
1576 	} while (bh != head);
1577 
1578 	/* Stage 3: submit the IO */
1579 	do {
1580 		struct buffer_head *next = bh->b_this_page;
1581 		submit_bh(WRITE, bh);
1582 		bh = next;
1583 	} while (bh != head);
1584 
1585 	/* Done - end_buffer_io_async will unlock */
1586 	SetPageUptodate(page);
1587 
1588 	wakeup_page_waiters(page);
1589 
1590 	return 0;
1591 
1592 out:
1593 	/*
1594 	 * ENOSPC, or some other error.  We may already have added some
1595 	 * blocks to the file, so we need to write these out to avoid
1596 	 * exposing stale data.
1597 	 */
1598 	ClearPageUptodate(page);
1599 	bh = head;
1600 	need_unlock = 1;
1601 	/* Recovery: lock and submit the mapped buffers */
1602 	do {
1603 		if (buffer_mapped(bh)) {
1604 			lock_buffer(bh);
1605 			set_buffer_async_io(bh);
1606 			need_unlock = 0;
1607 		}
1608 		bh = bh->b_this_page;
1609 	} while (bh != head);
1610 	do {
1611 		struct buffer_head *next = bh->b_this_page;
1612 		if (buffer_mapped(bh)) {
1613 			set_bit(BH_Uptodate, &bh->b_state);
1614 			clear_bit(BH_Dirty, &bh->b_state);
1615 			submit_bh(WRITE, bh);
1616 		}
1617 		bh = next;
1618 	} while (bh != head);
1619 	if (need_unlock)
1620 		UnlockPage(page);
1621 	wakeup_page_waiters(page);
1622 	return err;
1623 }
1624 
1625 static int __block_prepare_write(struct inode *inode, struct page *page,
1626 		unsigned from, unsigned to, get_block_t *get_block)
1627 {
1628 	unsigned block_start, block_end;
1629 	unsigned long block;
1630 	int err = 0;
1631 	unsigned blocksize, bbits;
1632 	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1633 	char *kaddr = kmap(page);
1634 
1635 	blocksize = 1 << inode->i_blkbits;
1636 	if (!page->buffers)
1637 		create_empty_buffers(page, inode->i_dev, blocksize);
1638 	head = page->buffers;
1639 
1640 	bbits = inode->i_blkbits;
1641 	block = page->index << (PAGE_CACHE_SHIFT - bbits);
1642 
1643 	for(bh = head, block_start = 0; bh != head || !block_start;
1644 	    block++, block_start=block_end, bh = bh->b_this_page) {
1645 		if (!bh)
1646 			BUG();
1647 		block_end = block_start+blocksize;
1648 		if (block_end <= from)
1649 			continue;
1650 		if (block_start >= to)
1651 			break;
1652 		clear_bit(BH_New, &bh->b_state);
1653 		if (!buffer_mapped(bh)) {
1654 			err = get_block(inode, block, bh, 1);
1655 			if (err)
1656 				goto out;
1657 			if (buffer_new(bh)) {
1658 				unmap_underlying_metadata(bh);
1659 				if (Page_Uptodate(page)) {
1660 					set_bit(BH_Uptodate, &bh->b_state);
1661 					continue;
1662 				}
1663 				if (block_end > to)
1664 					memset(kaddr+to, 0, block_end-to);
1665 				if (block_start < from)
1666 					memset(kaddr+block_start, 0, from-block_start);
1667 				if (block_end > to || block_start < from)
1668 					flush_dcache_page(page);
1669 				continue;
1670 			}
1671 		}
1672 		if (Page_Uptodate(page)) {
1673 			set_bit(BH_Uptodate, &bh->b_state);
1674 			continue;
1675 		}
1676 		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
1677 		     (block_start < from || block_end > to)) {
1678 			ll_rw_block(READ, 1, &bh);
1679 			*wait_bh++=bh;
1680 		}
1681 	}
1682 	/*
1683 	 * If we issued read requests - let them complete.
1684 	 */
1685 	while(wait_bh > wait) {
1686 		wait_on_buffer(*--wait_bh);
1687 		if (!buffer_uptodate(*wait_bh))
1688 			return -EIO;
1689 	}
1690 	return 0;
1691 out:
1692 	/*
1693 	 * Zero out any newly allocated blocks to avoid exposing stale
1694 	 * data.  If BH_New is set, we know that the block was newly
1695 	 * allocated in the above loop.
1696 	 *
1697 	 * Details: the buffer can be new and uptodate because:
1698 	 * 1) hole in an uptodate page: get_block(create) allocates the block,
1699 	 *    so the buffer is new and additionally we also mark it uptodate
1700 	 * 2) The buffer is not mapped and uptodate due to a previous partial read.
1701 	 *
1702 	 * We can always ignore uptodate buffers here: if you mark a buffer
1703 	 * uptodate you must make sure it contains the right data first.
1704 	 *
1705 	 * We must stop the "undo/clear" fixup pass not at the caller "to"
1706 	 * but at the last block that we successfully reached in the main loop.
1707 	 */
1708 	bh = head;
1709 	to = block_start; /* stop at the last successfully handled block */
1710 	block_start = 0;
1711 	do {
1712 		block_end = block_start+blocksize;
1713 		if (block_end <= from)
1714 			goto next_bh;
1715 		if (block_start >= to)
1716 			break;
1717 		if (buffer_new(bh) && !buffer_uptodate(bh)) {
1718 			memset(kaddr+block_start, 0, bh->b_size);
1719 			flush_dcache_page(page);
1720 			set_bit(BH_Uptodate, &bh->b_state);
1721 			mark_buffer_dirty(bh);
1722 		}
1723 next_bh:
1724 		block_start = block_end;
1725 		bh = bh->b_this_page;
1726 	} while (bh != head);
1727 	return err;
1728 }
1729 
1730 static int __block_commit_write(struct inode *inode, struct page *page,
1731 		unsigned from, unsigned to)
1732 {
1733 	unsigned block_start, block_end;
1734 	int partial = 0, need_balance_dirty = 0;
1735 	unsigned blocksize;
1736 	struct buffer_head *bh, *head;
1737 
1738 	blocksize = 1 << inode->i_blkbits;
1739 
1740 	for(bh = head = page->buffers, block_start = 0;
1741 	    bh != head || !block_start;
1742 	    block_start=block_end, bh = bh->b_this_page) {
1743 		block_end = block_start + blocksize;
1744 		if (block_end <= from || block_start >= to) {
1745 			if (!buffer_uptodate(bh))
1746 				partial = 1;
1747 		} else {
1748 			set_bit(BH_Uptodate, &bh->b_state);
1749 			if (!atomic_set_buffer_dirty(bh)) {
1750 				__mark_dirty(bh);
1751 				buffer_insert_inode_data_queue(bh, inode);
1752 				need_balance_dirty = 1;
1753 			}
1754 		}
1755 	}
1756 
1757 	if (need_balance_dirty)
1758 		balance_dirty();
1759 	/*
1760 	 * If this is a partial write that happened to make all buffers
1761 	 * uptodate then we can optimize away a bogus readpage() for
1762 	 * the next read(). Here we 'discover' whether the page went
1763 	 * uptodate as a result of this (potentially partial) write.
1764 	 */
1765 	if (!partial)
1766 		SetPageUptodate(page);
1767 	return 0;
1768 }
1769 
1770 /*
1771  * Generic "read page" function for block devices that have the normal
1772  * get_block functionality. This is most of the block device filesystems.
1773  * Reads the page asynchronously --- the unlock_buffer() and
1774  * mark_buffer_uptodate() functions propagate buffer state into the
1775  * page struct once IO has completed.
1776  */
1777 int block_read_full_page(struct page *page, get_block_t *get_block)
1778 {
1779 	struct inode *inode = page->mapping->host;
1780 	unsigned long iblock, lblock;
1781 	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
1782 	unsigned int blocksize, blocks;
1783 	int nr, i;
1784 
1785 	if (!PageLocked(page))
1786 		PAGE_BUG(page);
1787 	blocksize = 1 << inode->i_blkbits;
1788 	if (!page->buffers)
1789 		create_empty_buffers(page, inode->i_dev, blocksize);
1790 	head = page->buffers;
1791 
1792 	blocks = PAGE_CACHE_SIZE >> inode->i_blkbits;
1793 	iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1794 	lblock = (inode->i_size+blocksize-1) >> inode->i_blkbits;
1795 	bh = head;
1796 	nr = 0;
1797 	i = 0;
1798 
1799 	do {
1800 		if (buffer_uptodate(bh))
1801 			continue;
1802 
1803 		if (!buffer_mapped(bh)) {
1804 			if (iblock < lblock) {
1805 				if (get_block(inode, iblock, bh, 0))
1806 					SetPageError(page);
1807 			}
1808 			if (!buffer_mapped(bh)) {
1809 				memset(kmap(page) + i*blocksize, 0, blocksize);
1810 				flush_dcache_page(page);
1811 				kunmap(page);
1812 				set_bit(BH_Uptodate, &bh->b_state);
1813 				continue;
1814 			}
1815 			/* get_block() might have updated the buffer synchronously */
1816 			if (buffer_uptodate(bh))
1817 				continue;
1818 		}
1819 
1820 		arr[nr] = bh;
1821 		nr++;
1822 	} while (i++, iblock++, (bh = bh->b_this_page) != head);
1823 
1824 	if (!nr) {
1825 		/*
1826 		 * All buffers are uptodate - we can set the page uptodate
1827 		 * as well. But not if get_block() returned an error.
1828 		 */
1829 		if (!PageError(page))
1830 			SetPageUptodate(page);
1831 		UnlockPage(page);
1832 		return 0;
1833 	}
1834 
1835 	/* Stage two: lock the buffers */
1836 	for (i = 0; i < nr; i++) {
1837 		struct buffer_head * bh = arr[i];
1838 		lock_buffer(bh);
1839 		set_buffer_async_io(bh);
1840 	}
1841 
1842 	/* Stage 3: start the IO */
1843 	for (i = 0; i < nr; i++) {
1844 		struct buffer_head * bh = arr[i];
1845 		if (buffer_uptodate(bh))
1846 			end_buffer_io_async(bh, 1);
1847 		else
1848 			submit_bh(READ, bh);
1849 	}
1850 
1851 	wakeup_page_waiters(page);
1852 
1853 	return 0;
1854 }
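
/*
 * A minimal sketch of how a filesystem might use block_read_full_page()
 * as its readpage method, assuming a hypothetical foofs_get_block()
 * that maps logical file blocks to disk blocks (the foofs_* names are
 * illustrative, not part of this file):
 *
 *	static int foofs_readpage(struct file *file, struct page *page)
 *	{
 *		return block_read_full_page(page, foofs_get_block);
 *	}
 */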
1855 
1856 /* utility function for filesystems that need to do work on expanding
1857  * truncates.  Uses prepare/commit_write to allow the filesystem to
1858  * deal with the hole.
1859  */
1860 int generic_cont_expand(struct inode *inode, loff_t size)
1861 {
1862 	struct address_space *mapping = inode->i_mapping;
1863 	struct page *page;
1864 	unsigned long index, offset, limit;
1865 	int err;
1866 
1867 	err = -EFBIG;
1868 	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
1869 	if (limit != RLIM_INFINITY && size > (loff_t)limit) {
1870 		send_sig(SIGXFSZ, current, 0);
1871 		goto out;
1872 	}
1873 	if (size > inode->i_sb->s_maxbytes)
1874 		goto out;
1875 
1876 	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
1877 
1878 	/* ugh.  in prepare/commit_write, if from==to==start of block, we
1879 	** skip the prepare.  make sure we never send an offset for the start
1880 	** of a block
1881 	*/
1882 	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
1883 		offset++;
1884 	}
1885 	index = size >> PAGE_CACHE_SHIFT;
1886 	err = -ENOMEM;
1887 	page = grab_cache_page(mapping, index);
1888 	if (!page)
1889 		goto out;
1890 	err = mapping->a_ops->prepare_write(NULL, page, offset, offset);
1891 	if (!err) {
1892 		err = mapping->a_ops->commit_write(NULL, page, offset, offset);
1893 	}
1894 	UnlockPage(page);
1895 	page_cache_release(page);
1896 	if (err > 0)
1897 		err = 0;
1898 out:
1899 	return err;
1900 }
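
/*
 * A minimal sketch of a caller, assuming a hypothetical foofs_setattr()
 * that handles a size-increasing ATTR_SIZE request (illustrative only):
 *
 *	if ((attr->ia_valid & ATTR_SIZE) &&
 *	    attr->ia_size > inode->i_size) {
 *		error = generic_cont_expand(inode, attr->ia_size);
 *		if (error)
 *			return error;
 *	}
 */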
1901 
1902 /*
1903  * For moronic filesystems that do not allow holes in files.
1904  * We may have to extend the file.
1905  */
1906 
1907 int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, unsigned long *bytes)
1908 {
1909 	struct address_space *mapping = page->mapping;
1910 	struct inode *inode = mapping->host;
1911 	struct page *new_page;
1912 	unsigned long pgpos;
1913 	long status;
1914 	unsigned zerofrom;
1915 	unsigned blocksize = 1 << inode->i_blkbits;
1916 	char *kaddr;
1917 
1918 	while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) {
1919 		status = -ENOMEM;
1920 		new_page = grab_cache_page(mapping, pgpos);
1921 		if (!new_page)
1922 			goto out;
1923 		/* we might sleep */
1924 		if (*bytes>>PAGE_CACHE_SHIFT != pgpos) {
1925 			UnlockPage(new_page);
1926 			page_cache_release(new_page);
1927 			continue;
1928 		}
1929 		zerofrom = *bytes & ~PAGE_CACHE_MASK;
1930 		if (zerofrom & (blocksize-1)) {
1931 			*bytes |= (blocksize-1);
1932 			(*bytes)++;
1933 		}
1934 		status = __block_prepare_write(inode, new_page, zerofrom,
1935 						PAGE_CACHE_SIZE, get_block);
1936 		if (status)
1937 			goto out_unmap;
1938 		kaddr = page_address(new_page);
1939 		memset(kaddr+zerofrom, 0, PAGE_CACHE_SIZE-zerofrom);
1940 		flush_dcache_page(new_page);
1941 		__block_commit_write(inode, new_page, zerofrom, PAGE_CACHE_SIZE);
1942 		kunmap(new_page);
1943 		UnlockPage(new_page);
1944 		page_cache_release(new_page);
1945 	}
1946 
1947 	if (page->index < pgpos) {
1948 		/* completely inside the area */
1949 		zerofrom = offset;
1950 	} else {
1951 		/* page covers the boundary, find the boundary offset */
1952 		zerofrom = *bytes & ~PAGE_CACHE_MASK;
1953 
1954 		/* if we are going to expand the file, the last block will be filled */
1955 		if (to > zerofrom && (zerofrom & (blocksize-1))) {
1956 			*bytes |= (blocksize-1);
1957 			(*bytes)++;
1958 		}
1959 
1960 		/* starting below the boundary? Nothing to zero out */
1961 		if (offset <= zerofrom)
1962 			zerofrom = offset;
1963 	}
1964 	status = __block_prepare_write(inode, page, zerofrom, to, get_block);
1965 	if (status)
1966 		goto out1;
1967 	kaddr = page_address(page);
1968 	if (zerofrom < offset) {
1969 		memset(kaddr+zerofrom, 0, offset-zerofrom);
1970 		flush_dcache_page(page);
1971 		__block_commit_write(inode, page, zerofrom, offset);
1972 	}
1973 	return 0;
1974 out1:
1975 	ClearPageUptodate(page);
1976 	kunmap(page);
1977 	return status;
1978 
1979 out_unmap:
1980 	ClearPageUptodate(new_page);
1981 	kunmap(new_page);
1982 	UnlockPage(new_page);
1983 	page_cache_release(new_page);
1984 out:
1985 	return status;
1986 }
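
/*
 * A minimal sketch of how a hole-less filesystem might wire this up,
 * assuming a hypothetical foofs_get_block(), a FOOFS_I() inode-info
 * accessor and a per-inode unsigned long that records how far the file
 * data has been initialised (all names are illustrative):
 *
 *	static int foofs_cont_prepare_write(struct file *file,
 *				struct page *page, unsigned from, unsigned to)
 *	{
 *		return cont_prepare_write(page, from, to, foofs_get_block,
 *			&FOOFS_I(page->mapping->host)->foofs_mmu_private);
 *	}
 */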
1987 
1988 int block_prepare_write(struct page *page, unsigned from, unsigned to,
1989 			get_block_t *get_block)
1990 {
1991 	struct inode *inode = page->mapping->host;
1992 	int err = __block_prepare_write(inode, page, from, to, get_block);
1993 	if (err) {
1994 		ClearPageUptodate(page);
1995 		kunmap(page);
1996 	}
1997 	return err;
1998 }
1999 
2000 int block_commit_write(struct page *page, unsigned from, unsigned to)
2001 {
2002 	struct inode *inode = page->mapping->host;
2003 	__block_commit_write(inode,page,from,to);
2004 	kunmap(page);
2005 	return 0;
2006 }
2007 
2008 int generic_commit_write(struct file *file, struct page *page,
2009 		unsigned from, unsigned to)
2010 {
2011 	struct inode *inode = page->mapping->host;
2012 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2013 	__block_commit_write(inode,page,from,to);
2014 	kunmap(page);
2015 	if (pos > inode->i_size) {
2016 		inode->i_size = pos;
2017 		mark_inode_dirty(inode);
2018 	}
2019 	return 0;
2020 }
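
/*
 * A sketch of the typical pairing, assuming the same hypothetical
 * foofs_get_block() as in the earlier examples; generic_commit_write()
 * can usually be used directly as the commit_write method, with only a
 * one-line prepare_write wrapper needed from the filesystem:
 *
 *	static int foofs_prepare_write(struct file *file, struct page *page,
 *				       unsigned from, unsigned to)
 *	{
 *		return block_prepare_write(page, from, to, foofs_get_block);
 *	}
 */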
2021 
2022 int block_truncate_page(struct address_space *mapping, loff_t from, get_block_t *get_block)
2023 {
2024 	unsigned long index = from >> PAGE_CACHE_SHIFT;
2025 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
2026 	unsigned blocksize, iblock, length, pos;
2027 	struct inode *inode = mapping->host;
2028 	struct page *page;
2029 	struct buffer_head *bh;
2030 	int err;
2031 
2032 	blocksize = 1 << inode->i_blkbits;
2033 	length = offset & (blocksize - 1);
2034 
2035 	/* Block boundary? Nothing to do */
2036 	if (!length)
2037 		return 0;
2038 
2039 	length = blocksize - length;
2040 	iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2041 
2042 	page = grab_cache_page(mapping, index);
2043 	err = -ENOMEM;
2044 	if (!page)
2045 		goto out;
2046 
2047 	if (!page->buffers)
2048 		create_empty_buffers(page, inode->i_dev, blocksize);
2049 
2050 	/* Find the buffer that contains "offset" */
2051 	bh = page->buffers;
2052 	pos = blocksize;
2053 	while (offset >= pos) {
2054 		bh = bh->b_this_page;
2055 		iblock++;
2056 		pos += blocksize;
2057 	}
2058 
2059 	err = 0;
2060 	if (!buffer_mapped(bh)) {
2061 		/* Hole? Nothing to do */
2062 		if (buffer_uptodate(bh))
2063 			goto unlock;
2064 		get_block(inode, iblock, bh, 0);
2065 		/* Still unmapped? Nothing to do */
2066 		if (!buffer_mapped(bh))
2067 			goto unlock;
2068 	}
2069 
2070 	/* Ok, it's mapped. Make sure it's up-to-date */
2071 	if (Page_Uptodate(page))
2072 		set_bit(BH_Uptodate, &bh->b_state);
2073 
2074 	if (!buffer_uptodate(bh) && !buffer_delay(bh)) {
2075 		err = -EIO;
2076 		ll_rw_block(READ, 1, &bh);
2077 		wait_on_buffer(bh);
2078 		/* Uhhuh. Read error. Complain and punt. */
2079 		if (!buffer_uptodate(bh))
2080 			goto unlock;
2081 	}
2082 
2083 	memset(kmap(page) + offset, 0, length);
2084 	flush_dcache_page(page);
2085 	kunmap(page);
2086 
2087 	if (!atomic_set_buffer_dirty(bh)) {
2088 		__mark_dirty(bh);
2089 		buffer_insert_inode_data_queue(bh, inode);
2090 		balance_dirty();
2091 	}
2092 
2093 	err = 0;
2094 
2095 unlock:
2096 	UnlockPage(page);
2097 	page_cache_release(page);
2098 out:
2099 	return err;
2100 }
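
/*
 * A minimal sketch of a truncate path, assuming hypothetical foofs_*
 * helpers (illustrative only); the partial last block is zeroed here
 * before the filesystem frees the blocks beyond the new end of file:
 *
 *	static void foofs_truncate(struct inode *inode)
 *	{
 *		block_truncate_page(inode->i_mapping, inode->i_size,
 *				    foofs_get_block);
 *		foofs_free_blocks_past_eof(inode);
 *	}
 */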
2101 
2102 int block_write_full_page(struct page *page, get_block_t *get_block)
2103 {
2104 	struct inode *inode = page->mapping->host;
2105 	unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT;
2106 	unsigned offset;
2107 	int err;
2108 
2109 	/* easy case */
2110 	if (page->index < end_index)
2111 		return __block_write_full_page(inode, page, get_block);
2112 
2113 	/* things got complicated... */
2114 	offset = inode->i_size & (PAGE_CACHE_SIZE-1);
2115 	/* OK, are we completely out? */
2116 	if (page->index >= end_index+1 || !offset) {
2117 		UnlockPage(page);
2118 		return -EIO;
2119 	}
2120 
2121 	/* Sigh... will have to work, then... */
2122 	err = __block_prepare_write(inode, page, 0, offset, get_block);
2123 	if (!err) {
2124 		memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
2125 		flush_dcache_page(page);
2126 		__block_commit_write(inode,page,0,offset);
2127 done:
2128 		kunmap(page);
2129 		UnlockPage(page);
2130 		return err;
2131 	}
2132 	ClearPageUptodate(page);
2133 	goto done;
2134 }
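
/*
 * A minimal sketch of the matching writepage method, assuming the same
 * hypothetical foofs_get_block() as in the earlier examples:
 *
 *	static int foofs_writepage(struct page *page)
 *	{
 *		return block_write_full_page(page, foofs_get_block);
 *	}
 */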
2135 
2136 /*
2137  * Commence writeout of all the buffers against a page.  The
2138  * page must be locked.   Returns zero on success or a negative
2139  * errno.
2140  */
2141 int writeout_one_page(struct page *page)
2142 {
2143 	struct buffer_head *bh, *head = page->buffers;
2144 
2145 	if (!PageLocked(page))
2146 		BUG();
2147 	bh = head;
2148 	do {
2149 		if (buffer_locked(bh) || !buffer_dirty(bh) || !buffer_uptodate(bh))
2150 			continue;
2151 
2152 		bh->b_flushtime = jiffies;
2153 		ll_rw_block(WRITE, 1, &bh);
2154 	} while ((bh = bh->b_this_page) != head);
2155 	return 0;
2156 }
2157 EXPORT_SYMBOL(writeout_one_page);
2158 
2159 /*
2160  * Wait for completion of I/O of all buffers against a page.  The page
2161  * must be locked.  Returns zero on success or a negative errno.
2162  */
2163 int waitfor_one_page(struct page *page)
2164 {
2165 	int error = 0;
2166 	struct buffer_head *bh, *head = page->buffers;
2167 
2168 	bh = head;
2169 	do {
2170 		wait_on_buffer(bh);
2171 		if (buffer_req(bh) && !buffer_uptodate(bh))
2172 			error = -EIO;
2173 	} while ((bh = bh->b_this_page) != head);
2174 	return error;
2175 }
2176 EXPORT_SYMBOL(waitfor_one_page);
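
/*
 * One possible sequence for flushing a single page synchronously with the
 * two helpers above; this is a sketch only - it assumes the page already
 * has buffers, and error handling is trimmed for brevity:
 *
 *	lock_page(page);
 *	error = writeout_one_page(page);
 *	if (!error)
 *		error = waitfor_one_page(page);
 *	UnlockPage(page);
 */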
2177 
2178 int generic_block_bmap(struct address_space *mapping, long block, get_block_t *get_block)
2179 {
2180 	struct buffer_head tmp;
2181 	struct inode *inode = mapping->host;
2182 	tmp.b_state = 0;
2183 	tmp.b_blocknr = 0;
2184 	get_block(inode, block, &tmp, 0);
2185 	return tmp.b_blocknr;
2186 }
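
/*
 * A minimal sketch of a bmap method, plus the address_space_operations
 * table that ties together the illustrative foofs_* wrappers from the
 * earlier examples (all foofs_* names are hypothetical):
 *
 *	static int foofs_bmap(struct address_space *mapping, long block)
 *	{
 *		return generic_block_bmap(mapping, block, foofs_get_block);
 *	}
 *
 *	static struct address_space_operations foofs_aops = {
 *		readpage:	foofs_readpage,
 *		writepage:	foofs_writepage,
 *		sync_page:	block_sync_page,
 *		prepare_write:	foofs_prepare_write,
 *		commit_write:	generic_commit_write,
 *		bmap:		foofs_bmap,
 *	};
 */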
2187 
2188 int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block)
2189 {
2190 	int i, nr_blocks, retval;
2191 	unsigned long * blocks = iobuf->blocks;
2192 	int length;
2193 	int beyond_eof = 0;
2194 
2195 	length = iobuf->length;
2196 	nr_blocks = length / blocksize;
2197 	/* build the blocklist */
2198 	for (i = 0; i < nr_blocks; i++, blocknr++) {
2199 		struct buffer_head bh;
2200 
2201 		bh.b_state = 0;
2202 		bh.b_dev = inode->i_dev;
2203 		bh.b_size = blocksize;
2204 		bh.b_page = NULL;
2205 
2206 		if (((loff_t) blocknr) * blocksize >= inode->i_size)
2207 			beyond_eof = 1;
2208 
2209 		/* Only allow get_block to create new blocks if we are safely
2210 		   beyond EOF.  O_DIRECT is unsafe inside sparse files. */
2211 		retval = get_block(inode, blocknr, &bh,
2212 				   ((rw != READ) && beyond_eof));
2213 
2214 		if (retval) {
2215 			if (!i)
2216 				/* report error to userspace */
2217 				goto out;
2218 			else
2219 				/* do short I/O until 'i' */
2220 				break;
2221 		}
2222 
2223 		if (rw == READ) {
2224 			if (buffer_new(&bh))
2225 				BUG();
2226 			if (!buffer_mapped(&bh)) {
2227 				/* there was a hole in the filesystem */
2228 				blocks[i] = -1UL;
2229 				continue;
2230 			}
2231 		} else {
2232 			if (buffer_new(&bh))
2233 				unmap_underlying_metadata(&bh);
2234 			if (!buffer_mapped(&bh))
2235 				/* upper layers need to pass the error on or
2236 				 * fall back to buffered IO. */
2237 				return -ENOTBLK;
2238 		}
2239 		blocks[i] = bh.b_blocknr;
2240 	}
2241 
2242 	/* patch length to handle short I/O */
2243 	iobuf->length = i * blocksize;
2244 	if (!beyond_eof)
2245 		up(&inode->i_sem);
2246 	retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize);
2247 	if (!beyond_eof)
2248 		down(&inode->i_sem);
2249 	/* restore orig length */
2250 	iobuf->length = length;
2251  out:
2252 
2253 	return retval;
2254 }
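
/*
 * A minimal sketch of a direct_IO method, assuming the hypothetical
 * foofs_get_block() used in the earlier examples:
 *
 *	static int foofs_direct_IO(int rw, struct inode *inode,
 *				   struct kiobuf *iobuf,
 *				   unsigned long blocknr, int blocksize)
 *	{
 *		return generic_direct_IO(rw, inode, iobuf, blocknr,
 *					 blocksize, foofs_get_block);
 *	}
 */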
2255 
2256 /*
2257  * IO completion routine for a buffer_head being used for kiobuf IO: we
2258  * can't dispatch the kiobuf callback until io_count reaches 0.
2259  */
2260 
2261 static void end_buffer_io_kiobuf(struct buffer_head *bh, int uptodate)
2262 {
2263 	struct kiobuf *kiobuf;
2264 
2265 	mark_buffer_uptodate(bh, uptodate);
2266 
2267 	kiobuf = bh->b_private;
2268 	end_kio_request(kiobuf, uptodate);
2269 	unlock_buffer(bh);
2270 }
2271 
2272 /*
2273  * For brw_kiovec: submit a set of buffer_head temporary IOs and wait
2274  * for them to complete.  Clean up the buffer_heads afterwards.
2275  */
2276 
2277 static int wait_kio(int rw, int nr, struct buffer_head *bh[], int size)
2278 {
2279 	int iosize, err;
2280 	int i;
2281 	struct buffer_head *tmp;
2282 
2283 	iosize = 0;
2284 	err = 0;
2285 
2286 	for (i = nr; --i >= 0; ) {
2287 		iosize += size;
2288 		tmp = bh[i];
2289 		wait_on_buffer(tmp);
2290 
2291 		if (!buffer_uptodate(tmp)) {
2292 			/* We are traversing bh'es in reverse order so
2293                            clearing iosize on error calculates the
2294                            amount of IO before the first error. */
2295 			iosize = 0;
2296 			err = -EIO;
2297 		}
2298 	}
2299 
2300 	if (iosize)
2301 		return iosize;
2302 	return err;
2303 }
2304 
2305 /*
2306  * Start I/O on a physical range of kernel memory, defined by a vector
2307  * of kiobuf structs (much like a user-space iovec list).
2308  *
2309  * The kiobuf must already be locked for IO.  IO is submitted
2310  * asynchronously: you need to check page->locked and page->uptodate.
2311  *
2312  * It is up to the caller to make sure that there are enough blocks
2313  * passed in to completely map the iobufs to disk.
2314  */
2315 
2316 int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
2317 	       kdev_t dev, unsigned long b[], int size)
2318 {
2319 	int		err;
2320 	int		length;
2321 	int		transferred;
2322 	int		i;
2323 	int		bufind;
2324 	int		pageind;
2325 	int		bhind;
2326 	int		offset;
2327 	unsigned long	blocknr;
2328 	struct kiobuf *	iobuf = NULL;
2329 	struct page *	map;
2330 	struct buffer_head *tmp, **bhs = NULL;
2331 
2332 	if (!nr)
2333 		return 0;
2334 
2335 	/*
2336 	 * First, do some alignment and validity checks
2337 	 */
2338 	for (i = 0; i < nr; i++) {
2339 		iobuf = iovec[i];
2340 		if ((iobuf->offset & (size-1)) ||
2341 		    (iobuf->length & (size-1)))
2342 			return -EINVAL;
2343 		if (!iobuf->nr_pages)
2344 			panic("brw_kiovec: iobuf not initialised");
2345 	}
2346 
2347 	/*
2348 	 * OK to walk down the iovec doing page IO on each page we find.
2349 	 */
2350 	bufind = bhind = transferred = err = 0;
2351 	for (i = 0; i < nr; i++) {
2352 		iobuf = iovec[i];
2353 		offset = iobuf->offset;
2354 		length = iobuf->length;
2355 		iobuf->errno = 0;
2356 		if (!bhs)
2357 			bhs = iobuf->bh;
2358 
2359 		for (pageind = 0; pageind < iobuf->nr_pages; pageind++) {
2360 			map  = iobuf->maplist[pageind];
2361 			if (!map) {
2362 				err = -EFAULT;
2363 				goto finished;
2364 			}
2365 
2366 			while (length > 0) {
2367 				blocknr = b[bufind++];
2368 				if (blocknr == -1UL) {
2369 					if (rw == READ) {
2370 						/* there was a hole in the filesystem */
2371 						memset(kmap(map) + offset, 0, size);
2372 						flush_dcache_page(map);
2373 						kunmap(map);
2374 
2375 						transferred += size;
2376 						goto skip_block;
2377 					} else
2378 						BUG();
2379 				}
2380 				tmp = bhs[bhind++];
2381 
2382 				tmp->b_size = size;
2383 				set_bh_page(tmp, map, offset);
2384 				tmp->b_this_page = tmp;
2385 
2386 				init_buffer(tmp, end_buffer_io_kiobuf, iobuf);
2387 				tmp->b_dev = dev;
2388 				tmp->b_blocknr = blocknr;
2389 				tmp->b_state = (1 << BH_Mapped) | (1 << BH_Lock) | (1 << BH_Req);
2390 
2391 				if (rw == WRITE) {
2392 					set_bit(BH_Uptodate, &tmp->b_state);
2393 					clear_bit(BH_Dirty, &tmp->b_state);
2394 				} else
2395 					set_bit(BH_Uptodate, &tmp->b_state);
2396 
2397 				atomic_inc(&iobuf->io_count);
2398 				submit_bh(rw, tmp);
2399 				/*
2400 				 * Wait for IO if we have got too much
2401 				 */
2402 				if (bhind >= KIO_MAX_SECTORS) {
2403 					kiobuf_wait_for_io(iobuf); /* wake-one */
2404 					err = wait_kio(rw, bhind, bhs, size);
2405 					if (err >= 0)
2406 						transferred += err;
2407 					else
2408 						goto finished;
2409 					bhind = 0;
2410 				}
2411 
2412 			skip_block:
2413 				length -= size;
2414 				offset += size;
2415 
2416 				if (offset >= PAGE_SIZE) {
2417 					offset = 0;
2418 					break;
2419 				}
2420 			} /* End of block loop */
2421 		} /* End of page loop */
2422 	} /* End of iovec loop */
2423 
2424 	/* Is there any IO still left to submit? */
2425 	if (bhind) {
2426 		kiobuf_wait_for_io(iobuf); /* wake-one */
2427 		err = wait_kio(rw, bhind, bhs, size);
2428 		if (err >= 0)
2429 			transferred += err;
2430 		else
2431 			goto finished;
2432 	}
2433 
2434  finished:
2435 	if (transferred)
2436 		return transferred;
2437 	return err;
2438 }
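
/*
 * A rough sketch of the usual calling sequence (raw-device style); the
 * kiobuf has to be allocated and mapped first and blocks[] filled in by
 * the caller.  Error handling is omitted, and "sector_size" plus the
 * block-number computation are assumptions of the sketch:
 *
 *	alloc_kiovec(1, &iobuf);
 *	map_user_kiobuf(rw, iobuf, (unsigned long) buf, len);
 *	... fill iobuf->blocks[] with device block numbers ...
 *	transferred = brw_kiovec(rw, 1, &iobuf, dev,
 *				 iobuf->blocks, sector_size);
 *	unmap_kiobuf(iobuf);
 *	free_kiovec(1, &iobuf);
 */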
2439 
2440 /*
2441  * Start I/O on a page.
2442  * This function expects the page to be locked and may return
2443  * before I/O is complete. You then have to check page->locked
2444  * and page->uptodate.
2445  *
2446  * brw_page() is SMP-safe, although it's being called with the
2447  * kernel lock held - but the code is ready.
2448  *
2449  * FIXME: we need a swapper_inode->get_block function to remove
2450  *        some of the bmap kludges and interface ugliness here.
2451  */
2452 int brw_page(int rw, struct page *page, kdev_t dev, int b[], int size)
2453 {
2454 	struct buffer_head *head, *bh;
2455 
2456 	if (!PageLocked(page))
2457 		panic("brw_page: page not locked for I/O");
2458 
2459 	if (!page->buffers)
2460 		create_empty_buffers(page, dev, size);
2461 	head = bh = page->buffers;
2462 
2463 	/* Stage 1: lock all the buffers */
2464 	do {
2465 		lock_buffer(bh);
2466 		bh->b_blocknr = *(b++);
2467 		set_bit(BH_Mapped, &bh->b_state);
2468 		set_buffer_async_io(bh);
2469 		bh = bh->b_this_page;
2470 	} while (bh != head);
2471 
2472 	/* Stage 2: start the IO */
2473 	do {
2474 		struct buffer_head *next = bh->b_this_page;
2475 		submit_bh(rw, bh);
2476 		bh = next;
2477 	} while (bh != head);
2478 	wakeup_page_waiters(page);
2479 	return 0;
2480 }
2481 
2482 int block_symlink(struct inode *inode, const char *symname, int len)
2483 {
2484 	struct address_space *mapping = inode->i_mapping;
2485 	struct page *page = grab_cache_page(mapping, 0);
2486 	int err = -ENOMEM;
2487 	char *kaddr;
2488 
2489 	if (!page)
2490 		goto fail;
2491 	err = mapping->a_ops->prepare_write(NULL, page, 0, len-1);
2492 	if (err)
2493 		goto fail_map;
2494 	kaddr = page_address(page);
2495 	memcpy(kaddr, symname, len-1);
2496 	mapping->a_ops->commit_write(NULL, page, 0, len-1);
2497 	/*
2498 	 * Notice that we are _not_ going to block here - the end of the page
2499 	 * is unmapped, so this will only try to map the rest of the page, see
2500 	 * that it is unmapped (typically it will not even look into the inode -
2501 	 * ->i_size will be enough for everything) and zero it out.
2502 	 * OTOH it's obviously correct and should make the page up-to-date.
2503 	 */
2504 	err = mapping->a_ops->readpage(NULL, page);
2505 	wait_on_page(page);
2506 	page_cache_release(page);
2507 	if (err < 0)
2508 		goto fail;
2509 	mark_inode_dirty(inode);
2510 	return 0;
2511 fail_map:
2512 	UnlockPage(page);
2513 	page_cache_release(page);
2514 fail:
2515 	return err;
2516 }
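
/*
 * A minimal sketch of a symlink method using this helper, assuming a
 * hypothetical foofs_new_inode() that returns a fully set-up symlink
 * inode (illustrative only); note that the trailing NUL is included in
 * the length passed to block_symlink():
 *
 *	static int foofs_symlink(struct inode *dir, struct dentry *dentry,
 *				 const char *symname)
 *	{
 *		struct inode *inode = foofs_new_inode(dir);
 *		int err = block_symlink(inode, symname, strlen(symname) + 1);
 *		...
 *	}
 */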
2517 
2518 static inline void link_dev_buffers(struct page * page, struct buffer_head *head)
2519 {
2520 	struct buffer_head *bh, *tail;
2521 
2522 	bh = head;
2523 	do {
2524 		tail = bh;
2525 		bh = bh->b_this_page;
2526 	} while (bh);
2527 	tail->b_this_page = head;
2528 	page->buffers = head;
2529 	page_cache_get(page);
2530 }
2531 
2532 /*
2533  * Create the page-cache page that contains the requested block
2534  */
2535 static struct page * grow_dev_page(struct block_device *bdev, unsigned long index, int size)
2536 {
2537 	struct page * page;
2538 	struct buffer_head *bh;
2539 
2540 	page = find_or_create_page(bdev->bd_inode->i_mapping, index, GFP_NOFS);
2541 	if (!page)
2542 		return NULL;
2543 
2544 	if (!PageLocked(page))
2545 		BUG();
2546 
2547 	bh = page->buffers;
2548 	if (bh) {
2549 		if (bh->b_size == size)
2550 			return page;
2551 		if (!try_to_free_buffers(page, GFP_NOFS))
2552 			goto failed;
2553 	}
2554 
2555 	bh = create_buffers(page, size, 0);
2556 	if (!bh)
2557 		goto failed;
2558 	link_dev_buffers(page, bh);
2559 	return page;
2560 
2561 failed:
2562 	UnlockPage(page);
2563 	page_cache_release(page);
2564 	return NULL;
2565 }
2566 
2567 static void hash_page_buffers(struct page *page, kdev_t dev, int block, int size)
2568 {
2569 	struct buffer_head *head = page->buffers;
2570 	struct buffer_head *bh = head;
2571 	unsigned int uptodate;
2572 
2573 	uptodate = 1 << BH_Mapped;
2574 	if (Page_Uptodate(page))
2575 		uptodate |= 1 << BH_Uptodate;
2576 
2577 	write_lock(&hash_table_lock);
2578 	do {
2579 		if (!(bh->b_state & (1 << BH_Mapped))) {
2580 			init_buffer(bh, NULL, NULL);
2581 			bh->b_dev = dev;
2582 			bh->b_blocknr = block;
2583 			bh->b_state = uptodate;
2584 		}
2585 
2586 		/* Insert the buffer into the hash lists if necessary */
2587 		if (!bh->b_pprev)
2588 			__insert_into_hash_list(bh);
2589 
2590 		block++;
2591 		bh = bh->b_this_page;
2592 	} while (bh != head);
2593 	write_unlock(&hash_table_lock);
2594 }
2595 
2596 /*
2597  * Try to increase the number of buffers available: the size argument
2598  * is used to determine what kind of buffers we want.
2599  */
2600 static int grow_buffers(kdev_t dev, unsigned long block, int size)
2601 {
2602 	struct page * page;
2603 	struct block_device *bdev;
2604 	unsigned long index;
2605 	int sizebits;
2606 
2607 	/* Size must be multiple of hard sectorsize */
2608 	if (size & (get_hardsect_size(dev)-1))
2609 		BUG();
2610 	/* Size must be between 512 bytes and PAGE_SIZE */
2611 	if (size < 512 || size > PAGE_SIZE)
2612 		BUG();
2613 
2614 	sizebits = -1;
2615 	do {
2616 		sizebits++;
2617 	} while ((size << sizebits) < PAGE_SIZE);
2618 
2619 	index = block >> sizebits;
2620 	block = index << sizebits;
2621 
2622 	bdev = bdget(kdev_t_to_nr(dev));
2623 	if (!bdev) {
2624 		printk("No block device for %s\n", kdevname(dev));
2625 		BUG();
2626 	}
2627 
2628 	/* Create a page with the proper size buffers.. */
2629 	page = grow_dev_page(bdev, index, size);
2630 
2631 	/* This is "wrong" - talk to Al Viro */
2632 	atomic_dec(&bdev->bd_count);
2633 	if (!page)
2634 		return 0;
2635 
2636 	/* Hash in the buffers on the hash list */
2637 	hash_page_buffers(page, dev, block, size);
2638 	UnlockPage(page);
2639 	page_cache_release(page);
2640 
2641 	/* We hashed up this page, so increment buffermem */
2642 	atomic_inc(&buffermem_pages);
2643 	return 1;
2644 }
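
/*
 * Worked example of the index arithmetic above: with PAGE_SIZE == 4096
 * and size == 1024, the loop yields sizebits == 2 (1024 << 2 == 4096).
 * A request for block 10 then gives index = 10 >> 2 = 2 and
 * block = 2 << 2 = 8, i.e. the page at index 2 holds the four 1k buffers
 * for blocks 8..11, and "block" is rounded down to the first of them
 * before the buffers are hashed in.
 */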
2645 
2646 /*
2647  * The first time the VM inspects a page which has locked buffers, it
2648  * will just mark it as needing to be waited upon during the scan of the page LRU.
2649  * BH_Wait_IO is used for this.
2650  *
2651  * The second time the VM visits the page, if it still has locked
2652  * buffers, it is time to start writing them out.  (BH_Wait_IO was set).
2653  *
2654  * The third time the VM visits the page, if the I/O hasn't completed
2655  * then it's time to wait upon writeout.  BH_Lock and BH_Launder are
2656  * used for this.
2657  *
2658  * There is also the case of buffers which were locked by someone else
2659  * - write(2) callers, bdflush, etc.  There can be a huge number of these
2660  * and we don't want to just skip them all and fail the page allocation.
2661  * We want to be able to wait on these buffers as well.
2662  *
2663  * The BH_Launder bit is set in submit_bh() to indicate that I/O is
2664  * underway against the buffer, doesn't matter who started it - we know
2665  * that the buffer will eventually come unlocked, and so it's safe to
2666  * wait on it.
2667  *
2668  * The caller holds the page lock and the caller will free this page
2669  * into current->local_page, so by waiting on the page's buffers the
2670  * caller is guaranteed to obtain this page.
2671  *
2672  * sync_page_buffers() will sort-of return true if all the buffers
2673  * against this page are freeable, so try_to_free_buffers() should
2674  * try to free the page's buffers a second time.  This is a bit
2675  * broken for blocksize < PAGE_CACHE_SIZE, but not very importantly.
2676  */
2677 static int sync_page_buffers(struct buffer_head *head)
2678 {
2679 	struct buffer_head * bh = head;
2680 	int tryagain = 1;
2681 
2682 	do {
2683 		if (!buffer_dirty(bh) && !buffer_locked(bh))
2684 			continue;
2685 
2686 		/* Don't start IO first time around.. */
2687 		if (!test_and_set_bit(BH_Wait_IO, &bh->b_state)) {
2688 			tryagain = 0;
2689 			continue;
2690 		}
2691 
2692 		/* Second time through we start actively writing out.. */
2693 		if (test_and_set_bit(BH_Lock, &bh->b_state)) {
2694 			if (unlikely(!buffer_launder(bh))) {
2695 				tryagain = 0;
2696 				continue;
2697 			}
2698 			wait_on_buffer(bh);
2699 			tryagain = 1;
2700 			continue;
2701 		}
2702 
2703 		if (!atomic_set_buffer_clean(bh)) {
2704 			unlock_buffer(bh);
2705 			continue;
2706 		}
2707 
2708 		__mark_buffer_clean(bh);
2709 		get_bh(bh);
2710 		bh->b_end_io = end_buffer_io_sync;
2711 		submit_bh(WRITE, bh);
2712 		tryagain = 0;
2713 	} while ((bh = bh->b_this_page) != head);
2714 
2715 	return tryagain;
2716 }
2717 
2718 /*
2719  * Can the buffer be thrown out?
2720  */
2721 #define BUFFER_BUSY_BITS	((1<<BH_Dirty) | (1<<BH_Lock))
2722 #define buffer_busy(bh)		(atomic_read(&(bh)->b_count) | ((bh)->b_state & BUFFER_BUSY_BITS))
2723 
2724 /*
2725  * try_to_free_buffers() checks if all the buffers on this particular page
2726  * are unused, and frees the page if so.
2727  *
2728  * Wake up bdflush() if this fails - if we're running low on memory due
2729  * to dirty buffers, we need to flush them out as quickly as possible.
2730  *
2731  * NOTE: There are quite a number of ways that threads of control can
2732  *       obtain a reference to a buffer head within a page.  So we must
2733  *	 lock out all of these paths to cleanly toss the page.
2734  */
2735 int fastcall try_to_free_buffers(struct page * page, unsigned int gfp_mask)
2736 {
2737 	struct buffer_head * tmp, * bh = page->buffers;
2738 
2739 cleaned_buffers_try_again:
2740 	spin_lock(&lru_list_lock);
2741 	write_lock(&hash_table_lock);
2742 	tmp = bh;
2743 	do {
2744 		if (buffer_busy(tmp))
2745 			goto busy_buffer_page;
2746 		tmp = tmp->b_this_page;
2747 	} while (tmp != bh);
2748 
2749 	spin_lock(&unused_list_lock);
2750 	tmp = bh;
2751 
2752 	/* if this buffer was hashed, this page counts as buffermem */
2753 	if (bh->b_pprev)
2754 		atomic_dec(&buffermem_pages);
2755 	do {
2756 		struct buffer_head * p = tmp;
2757 		tmp = tmp->b_this_page;
2758 
2759 		if (p->b_dev == B_FREE) BUG();
2760 
2761 		remove_inode_queue(p);
2762 		__remove_from_queues(p);
2763 		__put_unused_buffer_head(p);
2764 	} while (tmp != bh);
2765 	spin_unlock(&unused_list_lock);
2766 
2767 	/* Wake up anyone waiting for buffer heads */
2768 	wake_up(&buffer_wait);
2769 
2770 	/* And free the page */
2771 	page->buffers = NULL;
2772 	page_cache_release(page);
2773 	write_unlock(&hash_table_lock);
2774 	spin_unlock(&lru_list_lock);
2775 	return 1;
2776 
2777 busy_buffer_page:
2778 	/* Uhhuh, start writeback so that we don't end up with all dirty pages */
2779 	write_unlock(&hash_table_lock);
2780 	spin_unlock(&lru_list_lock);
2781 	gfp_mask = pf_gfp_mask(gfp_mask);
2782 	if (gfp_mask & __GFP_IO) {
2783 		if ((gfp_mask & __GFP_HIGHIO) || !PageHighMem(page)) {
2784 			if (sync_page_buffers(bh)) {
2785 				/* no IO or waiting next time */
2786 				gfp_mask = 0;
2787 				goto cleaned_buffers_try_again;
2788 			}
2789 		}
2790 	}
2791 	if (balance_dirty_state() >= 0)
2792 		wakeup_bdflush();
2793 	return 0;
2794 }
2795 EXPORT_SYMBOL(try_to_free_buffers);
2796 
2797 /* ================== Debugging =================== */
2798 
2799 void show_buffers(void)
2800 {
2801 #ifdef CONFIG_SMP
2802 	struct buffer_head * bh;
2803 	int delalloc = 0, found = 0, locked = 0, dirty = 0, used = 0, lastused = 0;
2804 	int nlist;
2805 	static char *buf_types[NR_LIST] = { "CLEAN", "LOCKED", "DIRTY", };
2806 #endif
2807 
2808 	printk("Buffer memory:   %6dkB\n",
2809 		atomic_read(&buffermem_pages) << (PAGE_SHIFT-10));
2810 
2811 	printk("Cache memory:   %6ldkB\n",
2812 		(page_cache_size - atomic_read(&buffermem_pages)) << (PAGE_SHIFT-10));
2813 
2814 #ifdef CONFIG_SMP /* trylock does nothing on UP and so we could deadlock */
2815 	if (!spin_trylock(&lru_list_lock))
2816 		return;
2817 	for(nlist = 0; nlist < NR_LIST; nlist++) {
2818 		delalloc = found = locked = dirty = used = lastused = 0;
2819 		bh = lru_list[nlist];
2820 		if(!bh) continue;
2821 
2822 		do {
2823 			found++;
2824 			if (buffer_locked(bh))
2825 				locked++;
2826 			if (buffer_dirty(bh))
2827 				dirty++;
2828 			if (buffer_delay(bh))
2829 				delalloc++;
2830 			if (atomic_read(&bh->b_count))
2831 				used++, lastused = found;
2832 			bh = bh->b_next_free;
2833 		} while (bh != lru_list[nlist]);
2834 		{
2835 			int tmp = nr_buffers_type[nlist];
2836 			if (found != tmp)
2837 				printk("%9s: BUG -> found %d, reported %d\n",
2838 				       buf_types[nlist], found, tmp);
2839 		}
2840 		printk("%9s: %d buffers, %lu kbyte, %d used (last=%d), "
2841 		       "%d locked, %d dirty, %d delay\n",
2842 		       buf_types[nlist], found, size_buffers_type[nlist]>>(10-9),
2843 		       used, lastused, locked, dirty, delalloc);
2844 	}
2845 	spin_unlock(&lru_list_lock);
2846 #endif
2847 }
2848 
2849 /* ===================== Init ======================= */
2850 
2851 /*
2852  * allocate the hash table and init the free list
2853  * Use gfp() for the hash table to decrease TLB misses, use
2854  * SLAB cache for buffer heads.
2855  */
2856 void __init buffer_init(unsigned long mempages)
2857 {
2858 	int order, i;
2859 	unsigned int nr_hash;
2860 
2861 	/* The buffer cache hash table is less important these days,
2862 	 * trim it a bit.
2863 	 */
2864 	mempages >>= 14;
2865 
2866 	mempages *= sizeof(struct buffer_head *);
2867 
2868 	for (order = 0; (1 << order) < mempages; order++)
2869 		;
2870 
2871 	/* try to allocate something until we get it or we're asking
2872 	   for something that is really too small */
2873 
2874 	do {
2875 		unsigned long tmp;
2876 
2877 		nr_hash = (PAGE_SIZE << order) / sizeof(struct buffer_head *);
2878 		bh_hash_mask = (nr_hash - 1);
2879 
2880 		tmp = nr_hash;
2881 		bh_hash_shift = 0;
2882 		while((tmp >>= 1UL) != 0UL)
2883 			bh_hash_shift++;
2884 
2885 		hash_table = (struct buffer_head **)
2886 		    __get_free_pages(GFP_ATOMIC, order);
2887 	} while (hash_table == NULL && --order > 0);
2888 	printk(KERN_INFO "Buffer cache hash table entries: %d (order: %d, %ld bytes)\n",
2889 	       nr_hash, order, (PAGE_SIZE << order));
2890 
2891 	if (!hash_table)
2892 		panic("Failed to allocate buffer hash table\n");
2893 
2894 	/* Setup hash chains. */
2895 	for(i = 0; i < nr_hash; i++)
2896 		hash_table[i] = NULL;
2897 
2898 	/* Setup lru lists. */
2899 	for(i = 0; i < NR_LIST; i++)
2900 		lru_list[i] = NULL;
2901 
2902 }
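
/*
 * Worked example of the sizing above: on a 32-bit machine with 4k pages
 * and 128MB of memory, mempages starts at 32768; after ">>= 14" it is 2,
 * and after multiplying by sizeof(struct buffer_head *) it is 8, so the
 * loop settles on order 3.  That order-3 allocation is 32k, giving
 * nr_hash = 32768 / 4 = 8192 hash chains, bh_hash_mask = 8191 and
 * bh_hash_shift = 13.
 */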
2903 
2904 
2905 /* ====================== bdflush support =================== */
2906 
2907 /* This is a simple kernel daemon, whose job it is to provide a dynamic
2908  * response to dirty buffers.  Once this process is activated, we write back
2909  * a limited number of buffers to the disks and then go back to sleep again.
2910  */
2911 
2912 DECLARE_WAIT_QUEUE_HEAD(bdflush_wait);
2913 
2914 void wakeup_bdflush(void)
2915 {
2916 	wake_up_interruptible(&bdflush_wait);
2917 }
2918 
2919 void wakeup_kupdate(void)
2920 {
2921 	if (waitqueue_active(&kupdate_wait))
2922 		wake_up(&kupdate_wait);
2923 }
2924 
2925 /*
2926  * Here we attempt to write back old buffers.  We also try to flush inodes
2927  * and supers, since this function is essentially "update", and
2928  * otherwise there would be no way of ensuring that these quantities ever
2929  * get written back.  Ideally, we would have a timestamp on the inodes
2930  * and superblocks so that we could write back only the old ones.
2931  */
2932 
2933 static int sync_old_buffers(void)
2934 {
2935 	lock_kernel();
2936 	sync_unlocked_inodes();
2937 	sync_supers(0, 0);
2938 	unlock_kernel();
2939 
2940 	for (;;) {
2941 		struct buffer_head *bh;
2942 
2943 		spin_lock(&lru_list_lock);
2944 		bh = lru_list[BUF_DIRTY];
2945 		if (!bh)
2946 			break;
2947 		if (time_before(jiffies, bh->b_flushtime) && !laptop_mode)
2948 			break;
2949 		if (write_some_buffers(NODEV))
2950 			continue;
2951 		return 0;
2952 	}
2953 	spin_unlock(&lru_list_lock);
2954 	return 0;
2955 }
2956 
2957 int block_sync_page(struct page *page)
2958 {
2959 	run_task_queue(&tq_disk);
2960 	return 0;
2961 }
2962 
2963 /* This is the interface to bdflush.  As we get more sophisticated, we can
2964  * pass tuning parameters to this "process", to adjust how it behaves.
2965  * We would want to verify each parameter, however, to make sure that it
2966  * is reasonable. */
2967 
2968 asmlinkage long sys_bdflush(int func, long data)
2969 {
2970 	if (!capable(CAP_SYS_ADMIN))
2971 		return -EPERM;
2972 
2973 	if (func == 1) {
2974 		/* do_exit directly and let kupdate do its work alone. */
2975 		do_exit(0);
2976 #if 0 /* left here as it's the only example of lazy-mm-stuff used from
2977 	 a syscall that doesn't care about the current mm context. */
2978 		int error;
2979 		struct mm_struct *user_mm;
2980 
2981 		/*
2982 		 * bdflush will spend all of its time in kernel-space,
2983 		 * without touching user-space, so we can switch it into
2984 		 * 'lazy TLB mode' to reduce the cost of context-switches
2985 		 * to and from bdflush.
2986 		 */
2987 		user_mm = start_lazy_tlb();
2988 		error = sync_old_buffers();
2989 		end_lazy_tlb(user_mm);
2990 		return error;
2991 #endif
2992 	}
2993 
2994 	/* Basically func 2 means read param 1, func 3 means write param 1, etc */
2995 	if (func >= 2) {
2996 		int i = (func-2) >> 1;
2997 		if (i >= 0 && i < N_PARAM) {
2998 			if ((func & 1) == 0)
2999 				return put_user(bdf_prm.data[i], (int*)data);
3000 
3001 			if (data >= bdflush_min[i] && data <= bdflush_max[i]) {
3002 				bdf_prm.data[i] = data;
3003 				return 0;
3004 			}
3005 		}
3006 		return -EINVAL;
3007 	}
3008 
3009 	/* Func 0 used to launch the actual bdflush and then never
3010 	 * return (unless it was explicitly killed). We return zero here to
3011 	 * remain semi-compatible with present update(8) programs.
3012 	 */
3013 	return 0;
3014 }
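
/*
 * A sketch of the parameter encoding as seen from userspace (via the
 * bdflush(2) syscall): parameter i (counting from 1) is read with
 * func = 2*i and a pointer passed in "data", and written with
 * func = 2*i + 1 and the new value in "data".  So bdflush(2, (long) &val)
 * reads the first tunable and bdflush(3, 40) sets it to 40, subject to
 * the bdflush_min/bdflush_max bounds checked above.
 */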
3015 
3016 /*
3017  * This is the actual bdflush daemon itself. It used to be started from
3018  * the syscall above, but now we launch it ourselves internally with
3019  * kernel_thread(...)  directly after the first thread in init/main.c
3020  */
3021 int bdflush(void *startup)
3022 {
3023 	struct task_struct *tsk = current;
3024 
3025 	/*
3026 	 *	We have a bare-bones task_struct, and really should fill
3027 	 *	in a few more things so "top" and /proc/2/{exe,root,cwd}
3028 	 *	display semi-sane things. Not really crucial though...
3029 	 */
3030 
3031 	tsk->session = 1;
3032 	tsk->pgrp = 1;
3033 	strcpy(tsk->comm, "bdflush");
3034 
3035 	/* avoid getting signals */
3036 	spin_lock_irq(&tsk->sigmask_lock);
3037 	flush_signals(tsk);
3038 	sigfillset(&tsk->blocked);
3039 	recalc_sigpending(tsk);
3040 	spin_unlock_irq(&tsk->sigmask_lock);
3041 
3042 	complete((struct completion *)startup);
3043 
3044 	/*
3045 	 * FIXME: The ndirty logic here is wrong.  It's supposed to
3046 	 * send bdflush back to sleep after writing ndirty buffers.
3047 	 * In fact the test is wrong, so bdflush will simply
3048 	 * sleep whenever bdflush_stop() returns true.
3049 	 *
3050 	 * FIXME: If it proves useful to implement ndirty properly,
3051 	 * then perhaps the value of ndirty should be scaled by the
3052 	 * amount of memory in the machine.
3053 	 */
3054 	for (;;) {
3055 		int ndirty = bdf_prm.b_un.ndirty;
3056 
3057 		CHECK_EMERGENCY_SYNC
3058 
3059 		while (ndirty > 0) {
3060 			spin_lock(&lru_list_lock);
3061 			if (!write_some_buffers(NODEV))
3062 				break;
3063 			ndirty -= NRSYNC;
3064 		}
3065 		if (ndirty > 0 || bdflush_stop())
3066 			interruptible_sleep_on(&bdflush_wait);
3067 	}
3068 }
3069 
3070 /*
3071  * This is the kernel update daemon. It used to live in userspace,
3072  * but since it needs to run safely we don't want it killed by mistake.
3073  * You don't need to change your userspace configuration, since
3074  * the userspace `update` will do_exit(0) at the first sys_bdflush().
3075  */
3076 int kupdate(void *startup)
3077 {
3078 	struct task_struct * tsk = current;
3079 	int interval;
3080 
3081 	tsk->session = 1;
3082 	tsk->pgrp = 1;
3083 	strcpy(tsk->comm, "kupdated");
3084 
3085 	/* sigstop and sigcont will stop and wakeup kupdate */
3086 	spin_lock_irq(&tsk->sigmask_lock);
3087 	sigfillset(&tsk->blocked);
3088 	siginitsetinv(&current->blocked, sigmask(SIGCONT) | sigmask(SIGSTOP));
3089 	recalc_sigpending(tsk);
3090 	spin_unlock_irq(&tsk->sigmask_lock);
3091 
3092 	complete((struct completion *)startup);
3093 
3094 	for (;;) {
3095 		DECLARE_WAITQUEUE(wait, tsk);
3096 
3097 		add_wait_queue(&kupdate_wait, &wait);
3098 
3099 		/* update interval */
3100 		interval = bdf_prm.b_un.interval;
3101 		if (interval) {
3102 			tsk->state = TASK_INTERRUPTIBLE;
3103 			schedule_timeout(interval);
3104 		} else {
3105 			tsk->state = TASK_STOPPED;
3106 			schedule(); /* wait for SIGCONT */
3107 		}
3108 		remove_wait_queue(&kupdate_wait, &wait);
3109 		/* check for sigstop */
3110 		if (signal_pending(tsk)) {
3111 			int sig, stopped = 0;
3112 			struct siginfo info;
3113 
3114 			spin_lock_irq(&tsk->sigmask_lock);
3115 			sig = dequeue_signal(&current->blocked, &info);
3116 			if (sig == SIGSTOP)
3117 				stopped = 1;
3118 			spin_unlock_irq(&tsk->sigmask_lock);
3119 			if (stopped) {
3120 				tsk->state = TASK_STOPPED;
3121 				schedule(); /* wait for SIGCONT */
3122 			}
3123 		}
3124 #ifdef DEBUG
3125 		printk(KERN_DEBUG "kupdate() activated...\n");
3126 #endif
3127 		sync_old_buffers();
3128 		if (laptop_mode)
3129 			fsync_dev(NODEV);
3130 		run_task_queue(&tq_disk);
3131 	}
3132 }
3133 
3134 static int __init bdflush_init(void)
3135 {
3136 	static struct completion startup __initdata = COMPLETION_INITIALIZER(startup);
3137 
3138 	kernel_thread(bdflush, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
3139 	wait_for_completion(&startup);
3140 	kernel_thread(kupdate, &startup, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
3141 	wait_for_completion(&startup);
3142 	return 0;
3143 }
3144 
3145 module_init(bdflush_init)
3146 
3147