1 /*
2  *  linux/fs/buffer.c
3  *
4  *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
5  */
6 
7 /*
8  * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9  *
10  * Removed a lot of unnecessary code and simplified things now that
11  * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12  *
13  * Speed up hash, lru, and free list operations.  Use gfp() for allocating
14  * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
15  *
 * Added 32k buffer block sizes - these are required by older ARM systems. - RMK
17  *
18  * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19  */
20 
21 #include <linux/kernel.h>
22 #include <linux/syscalls.h>
23 #include <linux/fs.h>
24 #include <linux/mm.h>
25 #include <linux/percpu.h>
26 #include <linux/slab.h>
27 #include <linux/capability.h>
28 #include <linux/blkdev.h>
29 #include <linux/file.h>
30 #include <linux/quotaops.h>
31 #include <linux/highmem.h>
32 #include <linux/module.h>
33 #include <linux/writeback.h>
34 #include <linux/hash.h>
35 #include <linux/suspend.h>
36 #include <linux/buffer_head.h>
37 #include <linux/task_io_accounting_ops.h>
38 #include <linux/bio.h>
39 #include <linux/notifier.h>
40 #include <linux/cpu.h>
41 #include <linux/bitops.h>
42 #include <linux/mpage.h>
43 #include <linux/bit_spinlock.h>
44 
/* Forward declaration: defined further down, called by sync_mapping_buffers(). */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);

/* Map a b_assoc_buffers list node back to its enclosing struct buffer_head. */
#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
48 
49 inline void
init_buffer(struct buffer_head * bh,bh_end_io_t * handler,void * private)50 init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
51 {
52 	bh->b_end_io = handler;
53 	bh->b_private = private;
54 }
55 EXPORT_SYMBOL(init_buffer);
56 
/*
 * Wait action handed to wait_on_bit()/wait_on_bit_lock() by
 * __wait_on_buffer() and __lock_buffer(): sleep via io_schedule() and
 * always report success.
 * @word: unused
 */
static int sleep_on_buffer(void *word)
{
	io_schedule();

	return 0;
}
62 
__lock_buffer(struct buffer_head * bh)63 void __lock_buffer(struct buffer_head *bh)
64 {
65 	wait_on_bit_lock(&bh->b_state, BH_Lock, sleep_on_buffer,
66 							TASK_UNINTERRUPTIBLE);
67 }
68 EXPORT_SYMBOL(__lock_buffer);
69 
unlock_buffer(struct buffer_head * bh)70 void unlock_buffer(struct buffer_head *bh)
71 {
72 	clear_bit_unlock(BH_Lock, &bh->b_state);
73 	smp_mb__after_clear_bit();
74 	wake_up_bit(&bh->b_state, BH_Lock);
75 }
76 EXPORT_SYMBOL(unlock_buffer);
77 
78 /*
79  * Block until a buffer comes unlocked.  This doesn't stop it
80  * from becoming locked again - you have to lock it yourself
81  * if you want to preserve its state.
82  */
__wait_on_buffer(struct buffer_head * bh)83 void __wait_on_buffer(struct buffer_head * bh)
84 {
85 	wait_on_bit(&bh->b_state, BH_Lock, sleep_on_buffer, TASK_UNINTERRUPTIBLE);
86 }
87 EXPORT_SYMBOL(__wait_on_buffer);
88 
/*
 * Clear PagePrivate on @page, reset its private word to 0 and release one
 * page-cache reference on it, in that order.
 */
static void __clear_page_buffers(struct page *page)
{
	ClearPagePrivate(page);
	set_page_private(page, 0);

	page_cache_release(page);
}
96 
97 
quiet_error(struct buffer_head * bh)98 static int quiet_error(struct buffer_head *bh)
99 {
100 	if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
101 		return 0;
102 	return 1;
103 }
104 
105 
buffer_io_error(struct buffer_head * bh)106 static void buffer_io_error(struct buffer_head *bh)
107 {
108 	char b[BDEVNAME_SIZE];
109 	printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
110 			bdevname(bh->b_bdev, b),
111 			(unsigned long long)bh->b_blocknr);
112 }
113 
/*
 * End-of-IO helper for reads which does not touch @bh after unlocking it:
 * record the outcome in the uptodate bit, then unlock.
 *
 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only uses the bh's address for
 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 * itself.
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
	if (!uptodate) {
		/* This happens, due to failed READA attempts. */
		clear_buffer_uptodate(bh);
	} else {
		set_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}
132 
/*
 * Default synchronous end-of-IO handler for reads: record the result in
 * the uptodate bit, unlock the buffer and drop one reference on it.
 * This is what ll_rw_block uses too.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	/* Updates the uptodate bit and unlocks; must come before the put. */
	__end_buffer_read_notouch(bh, uptodate);

	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_read_sync);
143 
end_buffer_write_sync(struct buffer_head * bh,int uptodate)144 void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
145 {
146 	char b[BDEVNAME_SIZE];
147 
148 	if (uptodate) {
149 		set_buffer_uptodate(bh);
150 	} else {
151 		if (!quiet_error(bh)) {
152 			buffer_io_error(bh);
153 			printk(KERN_WARNING "lost page write due to "
154 					"I/O error on %s\n",
155 				       bdevname(bh->b_bdev, b));
156 		}
157 		set_buffer_write_io_error(bh);
158 		clear_buffer_uptodate(bh);
159 	}
160 	unlock_buffer(bh);
161 	put_bh(bh);
162 }
163 EXPORT_SYMBOL(end_buffer_write_sync);
164 
165 /*
166  * Various filesystems appear to want __find_get_block to be non-blocking.
167  * But it's the page lock which protects the buffers.  To get around this,
168  * we get exclusion from try_to_free_buffers with the blockdev mapping's
169  * private_lock.
170  *
171  * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
172  * may be quite high.  This code could TryLock the page, and if that
173  * succeeds, there is no need to take private_lock. (But if
174  * private_lock is contended then so is mapping->tree_lock).
175  */
176 static struct buffer_head *
__find_get_block_slow(struct block_device * bdev,sector_t block)177 __find_get_block_slow(struct block_device *bdev, sector_t block)
178 {
179 	struct inode *bd_inode = bdev->bd_inode;
180 	struct address_space *bd_mapping = bd_inode->i_mapping;
181 	struct buffer_head *ret = NULL;
182 	pgoff_t index;
183 	struct buffer_head *bh;
184 	struct buffer_head *head;
185 	struct page *page;
186 	int all_mapped = 1;
187 
188 	index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
189 	page = find_get_page(bd_mapping, index);
190 	if (!page)
191 		goto out;
192 
193 	spin_lock(&bd_mapping->private_lock);
194 	if (!page_has_buffers(page))
195 		goto out_unlock;
196 	head = page_buffers(page);
197 	bh = head;
198 	do {
199 		if (!buffer_mapped(bh))
200 			all_mapped = 0;
201 		else if (bh->b_blocknr == block) {
202 			ret = bh;
203 			get_bh(bh);
204 			goto out_unlock;
205 		}
206 		bh = bh->b_this_page;
207 	} while (bh != head);
208 
209 	/* we might be here because some of the buffers on this page are
210 	 * not mapped.  This is due to various races between
211 	 * file io on the block device and getblk.  It gets dealt with
212 	 * elsewhere, don't buffer_error if we had some unmapped buffers
213 	 */
214 	if (all_mapped) {
215 		printk("__find_get_block_slow() failed. "
216 			"block=%llu, b_blocknr=%llu\n",
217 			(unsigned long long)block,
218 			(unsigned long long)bh->b_blocknr);
219 		printk("b_state=0x%08lx, b_size=%zu\n",
220 			bh->b_state, bh->b_size);
221 		printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
222 	}
223 out_unlock:
224 	spin_unlock(&bd_mapping->private_lock);
225 	page_cache_release(page);
226 out:
227 	return ret;
228 }
229 
230 /* If invalidate_buffers() will trash dirty buffers, it means some kind
231    of fs corruption is going on. Trashing dirty data always imply losing
232    information that was supposed to be just stored on the physical layer
233    by the user.
234 
   Thus invalidate_buffers in general usage is not allowed to trash
236    dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
237    be preserved.  These buffers are simply skipped.
238 
239    We also skip buffers which are still in use.  For example this can
240    happen if a userspace program is reading the block device.
241 
242    NOTE: In the case where the user removed a removable-media-disk even if
   there's still dirty data not synced on disk (due to a bug in the device driver
   or due to an error of the user), by not destroying the dirty buffers we could
245    generate corruption also on the next media inserted, thus a parameter is
246    necessary to handle this case in the most safe way possible (trying
247    to not corrupt also the new disk inserted with the data belonging to
248    the old now corrupted disk). Also for the ramdisk the natural thing
249    to do in order to release the ramdisk memory is to destroy dirty buffers.
250 
251    These are two special cases. Normal usage imply the device driver
252    to issue a sync on the device (without waiting I/O completion) and
253    then an invalidate_buffers call that doesn't trash dirty buffers.
254 
255    For handling cache coherency with the blkdev pagecache the 'update' case
   has been introduced. It is needed to re-read from disk any pinned
257    buffer. NOTE: re-reading from disk is destructive so we can do it only
258    when we assume nobody is changing the buffercache under our I/O and when
259    we think the disk contains more recent information than the buffercache.
260    The update == 1 pass marks the buffers we need to update, the update == 2
261    pass does the actual I/O. */
invalidate_bdev(struct block_device * bdev)262 void invalidate_bdev(struct block_device *bdev)
263 {
264 	struct address_space *mapping = bdev->bd_inode->i_mapping;
265 
266 	if (mapping->nrpages == 0)
267 		return;
268 
269 	invalidate_bh_lrus();
270 	lru_add_drain_all();	/* make sure all lru add caches are flushed */
271 	invalidate_mapping_pages(mapping, 0, -1);
272 }
273 EXPORT_SYMBOL(invalidate_bdev);
274 
275 /*
276  * Kick the writeback threads then try to free up some ZONE_NORMAL memory.
277  */
free_more_memory(void)278 static void free_more_memory(void)
279 {
280 	struct zone *zone;
281 	int nid;
282 
283 	wakeup_flusher_threads(1024);
284 	yield();
285 
286 	for_each_online_node(nid) {
287 		(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
288 						gfp_zone(GFP_NOFS), NULL,
289 						&zone);
290 		if (zone)
291 			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
292 						GFP_NOFS, NULL);
293 	}
294 }
295 
296 /*
297  * I/O completion handler for block_read_full_page() - pages
298  * which come unlocked at the end of I/O.
299  */
end_buffer_async_read(struct buffer_head * bh,int uptodate)300 static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
301 {
302 	unsigned long flags;
303 	struct buffer_head *first;
304 	struct buffer_head *tmp;
305 	struct page *page;
306 	int page_uptodate = 1;
307 
308 	BUG_ON(!buffer_async_read(bh));
309 
310 	page = bh->b_page;
311 	if (uptodate) {
312 		set_buffer_uptodate(bh);
313 	} else {
314 		clear_buffer_uptodate(bh);
315 		if (!quiet_error(bh))
316 			buffer_io_error(bh);
317 		SetPageError(page);
318 	}
319 
320 	/*
321 	 * Be _very_ careful from here on. Bad things can happen if
322 	 * two buffer heads end IO at almost the same time and both
323 	 * decide that the page is now completely done.
324 	 */
325 	first = page_buffers(page);
326 	local_irq_save(flags);
327 	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
328 	clear_buffer_async_read(bh);
329 	unlock_buffer(bh);
330 	tmp = bh;
331 	do {
332 		if (!buffer_uptodate(tmp))
333 			page_uptodate = 0;
334 		if (buffer_async_read(tmp)) {
335 			BUG_ON(!buffer_locked(tmp));
336 			goto still_busy;
337 		}
338 		tmp = tmp->b_this_page;
339 	} while (tmp != bh);
340 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
341 	local_irq_restore(flags);
342 
343 	/*
344 	 * If none of the buffers had errors and they are all
345 	 * uptodate then we can set the page uptodate.
346 	 */
347 	if (page_uptodate && !PageError(page))
348 		SetPageUptodate(page);
349 	unlock_page(page);
350 	return;
351 
352 still_busy:
353 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
354 	local_irq_restore(flags);
355 	return;
356 }
357 
358 /*
359  * Completion handler for block_write_full_page() - pages which are unlocked
360  * during I/O, and which have PageWriteback cleared upon I/O completion.
361  */
end_buffer_async_write(struct buffer_head * bh,int uptodate)362 void end_buffer_async_write(struct buffer_head *bh, int uptodate)
363 {
364 	char b[BDEVNAME_SIZE];
365 	unsigned long flags;
366 	struct buffer_head *first;
367 	struct buffer_head *tmp;
368 	struct page *page;
369 
370 	BUG_ON(!buffer_async_write(bh));
371 
372 	page = bh->b_page;
373 	if (uptodate) {
374 		set_buffer_uptodate(bh);
375 	} else {
376 		if (!quiet_error(bh)) {
377 			buffer_io_error(bh);
378 			printk(KERN_WARNING "lost page write due to "
379 					"I/O error on %s\n",
380 			       bdevname(bh->b_bdev, b));
381 		}
382 		set_bit(AS_EIO, &page->mapping->flags);
383 		set_buffer_write_io_error(bh);
384 		clear_buffer_uptodate(bh);
385 		SetPageError(page);
386 	}
387 
388 	first = page_buffers(page);
389 	local_irq_save(flags);
390 	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
391 
392 	clear_buffer_async_write(bh);
393 	unlock_buffer(bh);
394 	tmp = bh->b_this_page;
395 	while (tmp != bh) {
396 		if (buffer_async_write(tmp)) {
397 			BUG_ON(!buffer_locked(tmp));
398 			goto still_busy;
399 		}
400 		tmp = tmp->b_this_page;
401 	}
402 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
403 	local_irq_restore(flags);
404 	end_page_writeback(page);
405 	return;
406 
407 still_busy:
408 	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
409 	local_irq_restore(flags);
410 	return;
411 }
412 EXPORT_SYMBOL(end_buffer_async_write);
413 
414 /*
415  * If a page's buffers are under async readin (end_buffer_async_read
416  * completion) then there is a possibility that another thread of
417  * control could lock one of the buffers after it has completed
418  * but while some of the other buffers have not completed.  This
419  * locked buffer would confuse end_buffer_async_read() into not unlocking
420  * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
421  * that this buffer is not under async I/O.
422  *
423  * The page comes unlocked when it has no locked buffer_async buffers
424  * left.
425  *
 * PageLocked prevents anyone starting new async I/O reads against any of
427  * the buffers.
428  *
429  * PageWriteback is used to prevent simultaneous writeout of the same
430  * page.
431  *
432  * PageLocked prevents anyone from starting writeback of a page which is
433  * under read I/O (PageWriteback is only ever set against a locked page).
434  */
mark_buffer_async_read(struct buffer_head * bh)435 static void mark_buffer_async_read(struct buffer_head *bh)
436 {
437 	bh->b_end_io = end_buffer_async_read;
438 	set_buffer_async_read(bh);
439 }
440 
/*
 * Install @handler as @bh's completion routine and flag the buffer as
 * being under async write.
 */
static void mark_buffer_async_write_endio(struct buffer_head *bh,
					  bh_end_io_t *handler)
{
	bh->b_end_io = handler;

	set_buffer_async_write(bh);
}
447 
mark_buffer_async_write(struct buffer_head * bh)448 void mark_buffer_async_write(struct buffer_head *bh)
449 {
450 	mark_buffer_async_write_endio(bh, end_buffer_async_write);
451 }
452 EXPORT_SYMBOL(mark_buffer_async_write);
453 
454 
455 /*
456  * fs/buffer.c contains helper functions for buffer-backed address space's
457  * fsync functions.  A common requirement for buffer-based filesystems is
458  * that certain data from the backing blockdev needs to be written out for
459  * a successful fsync().  For example, ext2 indirect blocks need to be
460  * written back and waited upon before fsync() returns.
461  *
462  * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
463  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
464  * management of a list of dependent buffers at ->i_mapping->private_list.
465  *
466  * Locking is a little subtle: try_to_free_buffers() will remove buffers
467  * from their controlling inode's queue when they are being freed.  But
468  * try_to_free_buffers() will be operating against the *blockdev* mapping
469  * at the time, not against the S_ISREG file which depends on those buffers.
470  * So the locking for private_list is via the private_lock in the address_space
471  * which backs the buffers.  Which is different from the address_space
472  * against which the buffers are listed.  So for a particular address_space,
473  * mapping->private_lock does *not* protect mapping->private_list!  In fact,
474  * mapping->private_list will always be protected by the backing blockdev's
475  * ->private_lock.
476  *
477  * Which introduces a requirement: all buffers on an address_space's
478  * ->private_list must be from the same address_space: the blockdev's.
479  *
480  * address_spaces which do not place buffers at ->private_list via these
481  * utility functions are free to use private_lock and private_list for
482  * whatever they want.  The only requirement is that list_empty(private_list)
483  * be true at clear_inode() time.
484  *
485  * FIXME: clear_inode should not call invalidate_inode_buffers().  The
486  * filesystems should do that.  invalidate_inode_buffers() should just go
487  * BUG_ON(!list_empty).
488  *
489  * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
490  * take an address_space, not an inode.  And it should be called
491  * mark_buffer_dirty_fsync() to clearly define why those buffers are being
492  * queued up.
493  *
494  * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
495  * list if it is already on a list.  Because if the buffer is on a list,
496  * it *must* already be on the right one.  If not, the filesystem is being
497  * silly.  This will save a ton of locking.  But first we have to ensure
498  * that buffers are taken *off* the old inode's list when they are freed
499  * (presumably in truncate).  That requires careful auditing of all
500  * filesystems (do it inside bforget()).  It could also be done by bringing
501  * b_inode back.
502  */
503 
504 /*
505  * The buffer's backing address_space's private_lock must be held
506  */
__remove_assoc_queue(struct buffer_head * bh)507 static void __remove_assoc_queue(struct buffer_head *bh)
508 {
509 	list_del_init(&bh->b_assoc_buffers);
510 	WARN_ON(!bh->b_assoc_map);
511 	if (buffer_write_io_error(bh))
512 		set_bit(AS_EIO, &bh->b_assoc_map->flags);
513 	bh->b_assoc_map = NULL;
514 }
515 
inode_has_buffers(struct inode * inode)516 int inode_has_buffers(struct inode *inode)
517 {
518 	return !list_empty(&inode->i_data.private_list);
519 }
520 
521 /*
522  * osync is designed to support O_SYNC io.  It waits synchronously for
523  * all already-submitted IO to complete, but does not queue any new
524  * writes to the disk.
525  *
526  * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
527  * you dirty the buffers, and then use osync_inode_buffers to wait for
528  * completion.  Any other dirty buffers which are not yet queued for
529  * write will not be flushed to disk by the osync.
530  */
osync_buffers_list(spinlock_t * lock,struct list_head * list)531 static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
532 {
533 	struct buffer_head *bh;
534 	struct list_head *p;
535 	int err = 0;
536 
537 	spin_lock(lock);
538 repeat:
539 	list_for_each_prev(p, list) {
540 		bh = BH_ENTRY(p);
541 		if (buffer_locked(bh)) {
542 			get_bh(bh);
543 			spin_unlock(lock);
544 			wait_on_buffer(bh);
545 			if (!buffer_uptodate(bh))
546 				err = -EIO;
547 			brelse(bh);
548 			spin_lock(lock);
549 			goto repeat;
550 		}
551 	}
552 	spin_unlock(lock);
553 	return err;
554 }
555 
do_thaw_one(struct super_block * sb,void * unused)556 static void do_thaw_one(struct super_block *sb, void *unused)
557 {
558 	char b[BDEVNAME_SIZE];
559 	while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
560 		printk(KERN_WARNING "Emergency Thaw on %s\n",
561 		       bdevname(sb->s_bdev, b));
562 }
563 
do_thaw_all(struct work_struct * work)564 static void do_thaw_all(struct work_struct *work)
565 {
566 	iterate_supers(do_thaw_one, NULL);
567 	kfree(work);
568 	printk(KERN_WARNING "Emergency Thaw complete\n");
569 }
570 
571 /**
572  * emergency_thaw_all -- forcibly thaw every frozen filesystem
573  *
574  * Used for emergency unfreeze of all filesystems via SysRq
575  */
emergency_thaw_all(void)576 void emergency_thaw_all(void)
577 {
578 	struct work_struct *work;
579 
580 	work = kmalloc(sizeof(*work), GFP_ATOMIC);
581 	if (work) {
582 		INIT_WORK(work, do_thaw_all);
583 		schedule_work(work);
584 	}
585 }
586 
587 /**
588  * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
589  * @mapping: the mapping which wants those buffers written
590  *
591  * Starts I/O against the buffers at mapping->private_list, and waits upon
592  * that I/O.
593  *
594  * Basically, this is a convenience function for fsync().
595  * @mapping is a file or directory which needs those buffers to be written for
596  * a successful fsync().
597  */
sync_mapping_buffers(struct address_space * mapping)598 int sync_mapping_buffers(struct address_space *mapping)
599 {
600 	struct address_space *buffer_mapping = mapping->assoc_mapping;
601 
602 	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
603 		return 0;
604 
605 	return fsync_buffers_list(&buffer_mapping->private_lock,
606 					&mapping->private_list);
607 }
608 EXPORT_SYMBOL(sync_mapping_buffers);
609 
610 /*
611  * Called when we've recently written block `bblock', and it is known that
612  * `bblock' was for a buffer_boundary() buffer.  This means that the block at
613  * `bblock + 1' is probably a dirty indirect block.  Hunt it down and, if it's
614  * dirty, schedule it for IO.  So that indirects merge nicely with their data.
615  */
write_boundary_block(struct block_device * bdev,sector_t bblock,unsigned blocksize)616 void write_boundary_block(struct block_device *bdev,
617 			sector_t bblock, unsigned blocksize)
618 {
619 	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
620 	if (bh) {
621 		if (buffer_dirty(bh))
622 			ll_rw_block(WRITE, 1, &bh);
623 		put_bh(bh);
624 	}
625 }
626 
mark_buffer_dirty_inode(struct buffer_head * bh,struct inode * inode)627 void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
628 {
629 	struct address_space *mapping = inode->i_mapping;
630 	struct address_space *buffer_mapping = bh->b_page->mapping;
631 
632 	mark_buffer_dirty(bh);
633 	if (!mapping->assoc_mapping) {
634 		mapping->assoc_mapping = buffer_mapping;
635 	} else {
636 		BUG_ON(mapping->assoc_mapping != buffer_mapping);
637 	}
638 	if (!bh->b_assoc_map) {
639 		spin_lock(&buffer_mapping->private_lock);
640 		list_move_tail(&bh->b_assoc_buffers,
641 				&mapping->private_list);
642 		bh->b_assoc_map = mapping;
643 		spin_unlock(&buffer_mapping->private_lock);
644 	}
645 }
646 EXPORT_SYMBOL(mark_buffer_dirty_inode);
647 
648 /*
649  * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
650  * dirty.
651  *
652  * If warn is true, then emit a warning if the page is not uptodate and has
653  * not been truncated.
654  */
__set_page_dirty(struct page * page,struct address_space * mapping,int warn)655 static void __set_page_dirty(struct page *page,
656 		struct address_space *mapping, int warn)
657 {
658 	spin_lock_irq(&mapping->tree_lock);
659 	if (page->mapping) {	/* Race with truncate? */
660 		WARN_ON_ONCE(warn && !PageUptodate(page));
661 		account_page_dirtied(page, mapping);
662 		radix_tree_tag_set(&mapping->page_tree,
663 				page_index(page), PAGECACHE_TAG_DIRTY);
664 	}
665 	spin_unlock_irq(&mapping->tree_lock);
666 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
667 }
668 
669 /*
670  * Add a page to the dirty page list.
671  *
672  * It is a sad fact of life that this function is called from several places
673  * deeply under spinlocking.  It may not sleep.
674  *
675  * If the page has buffers, the uptodate buffers are set dirty, to preserve
 * dirty-state coherency between the page and the buffers.  If the page does
677  * not have buffers then when they are later attached they will all be set
678  * dirty.
679  *
680  * The buffers are dirtied before the page is dirtied.  There's a small race
681  * window in which a writepage caller may see the page cleanness but not the
682  * buffer dirtiness.  That's fine.  If this code were to set the page dirty
683  * before the buffers, a concurrent writepage caller could clear the page dirty
684  * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
685  * page on the dirty page list.
686  *
687  * We use private_lock to lock against try_to_free_buffers while using the
688  * page's buffer list.  Also use this to protect against clean buffers being
689  * added to the page after it was set dirty.
690  *
691  * FIXME: may need to call ->reservepage here as well.  That's rather up to the
692  * address_space though.
693  */
__set_page_dirty_buffers(struct page * page)694 int __set_page_dirty_buffers(struct page *page)
695 {
696 	int newly_dirty;
697 	struct address_space *mapping = page_mapping(page);
698 
699 	if (unlikely(!mapping))
700 		return !TestSetPageDirty(page);
701 
702 	spin_lock(&mapping->private_lock);
703 	if (page_has_buffers(page)) {
704 		struct buffer_head *head = page_buffers(page);
705 		struct buffer_head *bh = head;
706 
707 		do {
708 			set_buffer_dirty(bh);
709 			bh = bh->b_this_page;
710 		} while (bh != head);
711 	}
712 	newly_dirty = !TestSetPageDirty(page);
713 	spin_unlock(&mapping->private_lock);
714 
715 	if (newly_dirty)
716 		__set_page_dirty(page, mapping, 1);
717 	return newly_dirty;
718 }
719 EXPORT_SYMBOL(__set_page_dirty_buffers);
720 
721 /*
722  * Write out and wait upon a list of buffers.
723  *
724  * We have conflicting pressures: we want to make sure that all
725  * initially dirty buffers get waited on, but that any subsequently
726  * dirtied buffers don't.  After all, we don't want fsync to last
727  * forever if somebody is actively writing to the file.
728  *
729  * Do this in two main stages: first we copy dirty buffers to a
730  * temporary inode list, queueing the writes as we go.  Then we clean
731  * up, waiting for those writes to complete.
732  *
733  * During this second stage, any subsequent updates to the file may end
734  * up refiling the buffer on the original inode's dirty list again, so
735  * there is a chance we will end up with a buffer queued for write but
736  * not yet completed on that list.  So, as a final cleanup we go through
737  * the osync code to catch these locked, dirty buffers without requeuing
738  * any newly dirty buffers for write.
739  */
/*
 * @lock: spinlock guarding @list; dropped around every blocking call
 * @list: buffers to write and wait on, linked through b_assoc_buffers
 *
 * Returns -EIO if a waited-on buffer ended up not uptodate, otherwise the
 * result of the final osync_buffers_list() pass.
 */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
	struct address_space *mapping;
	struct buffer_head *bh;
	struct blk_plug plug;
	struct list_head queued;
	int wait_err = 0;
	int osync_err;

	INIT_LIST_HEAD(&queued);
	blk_start_plug(&plug);

	/*
	 * Stage one: drain @list.  Buffers that are dirty or locked move to
	 * the private "queued" list; the dirty ones are submitted for write.
	 */
	spin_lock(lock);
	while (!list_empty(list)) {
		bh = BH_ENTRY(list->next);
		mapping = bh->b_assoc_map;
		__remove_assoc_queue(bh);
		/* Avoid race with mark_buffer_dirty_inode() which does
		 * a lockless check and we rely on seeing the dirty bit */
		smp_mb();
		if (!buffer_dirty(bh) && !buffer_locked(bh))
			continue;

		list_add(&bh->b_assoc_buffers, &queued);
		bh->b_assoc_map = mapping;
		if (!buffer_dirty(bh))
			continue;

		/* Hold a reference across the unlocked submission. */
		get_bh(bh);
		spin_unlock(lock);
		/*
		 * Ensure any pending I/O completes so that
		 * write_dirty_buffer() actually writes the
		 * current contents - it is a noop if I/O is
		 * still in flight on potentially older
		 * contents.
		 */
		write_dirty_buffer(bh, WRITE_SYNC);

		/* Submission is done; give back the reference taken above. */
		brelse(bh);
		spin_lock(lock);
	}

	spin_unlock(lock);
	blk_finish_plug(&plug);
	spin_lock(lock);

	/*
	 * Stage two: wait for everything that was queued, oldest entry
	 * first.  Buffers that were redirtied meanwhile go back onto their
	 * mapping's private_list.
	 */
	while (!list_empty(&queued)) {
		bh = BH_ENTRY(queued.prev);
		get_bh(bh);
		mapping = bh->b_assoc_map;
		__remove_assoc_queue(bh);
		/* Avoid race with mark_buffer_dirty_inode() which does
		 * a lockless check and we rely on seeing the dirty bit */
		smp_mb();
		if (buffer_dirty(bh)) {
			list_add(&bh->b_assoc_buffers,
				 &mapping->private_list);
			bh->b_assoc_map = mapping;
		}
		spin_unlock(lock);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh))
			wait_err = -EIO;
		brelse(bh);
		spin_lock(lock);
	}
	spin_unlock(lock);

	/* Final pass: wait on buffers that were requeued while locked. */
	osync_err = osync_buffers_list(lock, list);

	return wait_err ? wait_err : osync_err;
}
818 
819 /*
820  * Invalidate any and all dirty buffers on a given inode.  We are
821  * probably unmounting the fs, but that doesn't mean we have already
822  * done a sync().  Just drop the buffers from the inode list.
823  *
824  * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
825  * assumes that all the buffers are against the blockdev.  Not true
826  * for reiserfs.
827  */
invalidate_inode_buffers(struct inode * inode)828 void invalidate_inode_buffers(struct inode *inode)
829 {
830 	if (inode_has_buffers(inode)) {
831 		struct address_space *mapping = &inode->i_data;
832 		struct list_head *list = &mapping->private_list;
833 		struct address_space *buffer_mapping = mapping->assoc_mapping;
834 
835 		spin_lock(&buffer_mapping->private_lock);
836 		while (!list_empty(list))
837 			__remove_assoc_queue(BH_ENTRY(list->next));
838 		spin_unlock(&buffer_mapping->private_lock);
839 	}
840 }
841 EXPORT_SYMBOL(invalidate_inode_buffers);
842 
843 /*
844  * Remove any clean buffers from the inode's buffer list.  This is called
845  * when we're trying to free the inode itself.  Those buffers can pin it.
846  *
847  * Returns true if all buffers were removed.
848  */
remove_inode_buffers(struct inode * inode)849 int remove_inode_buffers(struct inode *inode)
850 {
851 	int ret = 1;
852 
853 	if (inode_has_buffers(inode)) {
854 		struct address_space *mapping = &inode->i_data;
855 		struct list_head *list = &mapping->private_list;
856 		struct address_space *buffer_mapping = mapping->assoc_mapping;
857 
858 		spin_lock(&buffer_mapping->private_lock);
859 		while (!list_empty(list)) {
860 			struct buffer_head *bh = BH_ENTRY(list->next);
861 			if (buffer_dirty(bh)) {
862 				ret = 0;
863 				break;
864 			}
865 			__remove_assoc_queue(bh);
866 		}
867 		spin_unlock(&buffer_mapping->private_lock);
868 	}
869 	return ret;
870 }
871 
872 /*
873  * Create the appropriate buffers when given a page for data area and
874  * the size of each buffer.. Use the bh->b_this_page linked list to
875  * follow the buffers created.  Return NULL if unable to create more
876  * buffers.
877  *
878  * The retry flag is used to differentiate async IO (paging, swapping)
879  * which may not fail from ordinary buffer allocations.
880  */
alloc_page_buffers(struct page * page,unsigned long size,int retry)881 struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
882 		int retry)
883 {
884 	struct buffer_head *bh, *head;
885 	long offset;
886 
887 try_again:
888 	head = NULL;
889 	offset = PAGE_SIZE;
890 	while ((offset -= size) >= 0) {
891 		bh = alloc_buffer_head(GFP_NOFS);
892 		if (!bh)
893 			goto no_grow;
894 
895 		bh->b_bdev = NULL;
896 		bh->b_this_page = head;
897 		bh->b_blocknr = -1;
898 		head = bh;
899 
900 		bh->b_state = 0;
901 		atomic_set(&bh->b_count, 0);
902 		bh->b_size = size;
903 
904 		/* Link the buffer to its page */
905 		set_bh_page(bh, page, offset);
906 
907 		init_buffer(bh, NULL, NULL);
908 	}
909 	return head;
910 /*
911  * In case anything failed, we just free everything we got.
912  */
913 no_grow:
914 	if (head) {
915 		do {
916 			bh = head;
917 			head = head->b_this_page;
918 			free_buffer_head(bh);
919 		} while (head);
920 	}
921 
922 	/*
923 	 * Return failure for non-async IO requests.  Async IO requests
924 	 * are not allowed to fail, so we have to wait until buffer heads
925 	 * become available.  But we don't want tasks sleeping with
926 	 * partially complete buffers, so all were released above.
927 	 */
928 	if (!retry)
929 		return NULL;
930 
931 	/* We're _really_ low on memory. Now we just
932 	 * wait for old buffer heads to become free due to
933 	 * finishing IO.  Since this is an async request and
934 	 * the reserve list is empty, we're sure there are
935 	 * async buffer heads in use.
936 	 */
937 	free_more_memory();
938 	goto try_again;
939 }
940 EXPORT_SYMBOL_GPL(alloc_page_buffers);
941 
942 static inline void
link_dev_buffers(struct page * page,struct buffer_head * head)943 link_dev_buffers(struct page *page, struct buffer_head *head)
944 {
945 	struct buffer_head *bh, *tail;
946 
947 	bh = head;
948 	do {
949 		tail = bh;
950 		bh = bh->b_this_page;
951 	} while (bh);
952 	tail->b_this_page = head;
953 	attach_page_buffers(page, head);
954 }
955 
956 /*
957  * Initialise the state of a blockdev page's buffers.
958  */
959 static void
init_page_buffers(struct page * page,struct block_device * bdev,sector_t block,int size)960 init_page_buffers(struct page *page, struct block_device *bdev,
961 			sector_t block, int size)
962 {
963 	struct buffer_head *head = page_buffers(page);
964 	struct buffer_head *bh = head;
965 	int uptodate = PageUptodate(page);
966 
967 	do {
968 		if (!buffer_mapped(bh)) {
969 			init_buffer(bh, NULL, NULL);
970 			bh->b_bdev = bdev;
971 			bh->b_blocknr = block;
972 			if (uptodate)
973 				set_buffer_uptodate(bh);
974 			set_buffer_mapped(bh);
975 		}
976 		block++;
977 		bh = bh->b_this_page;
978 	} while (bh != head);
979 }
980 
981 /*
982  * Create the page-cache page that contains the requested block.
983  *
984  * This is user purely for blockdev mappings.
985  */
986 static struct page *
grow_dev_page(struct block_device * bdev,sector_t block,pgoff_t index,int size)987 grow_dev_page(struct block_device *bdev, sector_t block,
988 		pgoff_t index, int size)
989 {
990 	struct inode *inode = bdev->bd_inode;
991 	struct page *page;
992 	struct buffer_head *bh;
993 
994 	page = find_or_create_page(inode->i_mapping, index,
995 		(mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
996 	if (!page)
997 		return NULL;
998 
999 	BUG_ON(!PageLocked(page));
1000 
1001 	if (page_has_buffers(page)) {
1002 		bh = page_buffers(page);
1003 		if (bh->b_size == size) {
1004 			init_page_buffers(page, bdev, block, size);
1005 			return page;
1006 		}
1007 		if (!try_to_free_buffers(page))
1008 			goto failed;
1009 	}
1010 
1011 	/*
1012 	 * Allocate some buffers for this page
1013 	 */
1014 	bh = alloc_page_buffers(page, size, 0);
1015 	if (!bh)
1016 		goto failed;
1017 
1018 	/*
1019 	 * Link the page to the buffers and initialise them.  Take the
1020 	 * lock to be atomic wrt __find_get_block(), which does not
1021 	 * run under the page lock.
1022 	 */
1023 	spin_lock(&inode->i_mapping->private_lock);
1024 	link_dev_buffers(page, bh);
1025 	init_page_buffers(page, bdev, block, size);
1026 	spin_unlock(&inode->i_mapping->private_lock);
1027 	return page;
1028 
1029 failed:
1030 	BUG();
1031 	unlock_page(page);
1032 	page_cache_release(page);
1033 	return NULL;
1034 }
1035 
1036 /*
1037  * Create buffers for the specified block device block's page.  If
1038  * that page was dirty, the buffers are set dirty also.
1039  */
1040 static int
grow_buffers(struct block_device * bdev,sector_t block,int size)1041 grow_buffers(struct block_device *bdev, sector_t block, int size)
1042 {
1043 	struct page *page;
1044 	pgoff_t index;
1045 	int sizebits;
1046 
1047 	sizebits = -1;
1048 	do {
1049 		sizebits++;
1050 	} while ((size << sizebits) < PAGE_SIZE);
1051 
1052 	index = block >> sizebits;
1053 
1054 	/*
1055 	 * Check for a block which wants to lie outside our maximum possible
1056 	 * pagecache index.  (this comparison is done using sector_t types).
1057 	 */
1058 	if (unlikely(index != block >> sizebits)) {
1059 		char b[BDEVNAME_SIZE];
1060 
1061 		printk(KERN_ERR "%s: requested out-of-range block %llu for "
1062 			"device %s\n",
1063 			__func__, (unsigned long long)block,
1064 			bdevname(bdev, b));
1065 		return -EIO;
1066 	}
1067 	block = index << sizebits;
1068 	/* Create a page with the proper size buffers.. */
1069 	page = grow_dev_page(bdev, block, index, size);
1070 	if (!page)
1071 		return 0;
1072 	unlock_page(page);
1073 	page_cache_release(page);
1074 	return 1;
1075 }
1076 
1077 static struct buffer_head *
__getblk_slow(struct block_device * bdev,sector_t block,int size)1078 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1079 {
1080 	/* Size must be multiple of hard sectorsize */
1081 	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1082 			(size < 512 || size > PAGE_SIZE))) {
1083 		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1084 					size);
1085 		printk(KERN_ERR "logical block size: %d\n",
1086 					bdev_logical_block_size(bdev));
1087 
1088 		dump_stack();
1089 		return NULL;
1090 	}
1091 
1092 	for (;;) {
1093 		struct buffer_head * bh;
1094 		int ret;
1095 
1096 		bh = __find_get_block(bdev, block, size);
1097 		if (bh)
1098 			return bh;
1099 
1100 		ret = grow_buffers(bdev, block, size);
1101 		if (ret < 0)
1102 			return NULL;
1103 		if (ret == 0)
1104 			free_more_memory();
1105 	}
1106 }
1107 
1108 /*
1109  * The relationship between dirty buffers and dirty pages:
1110  *
1111  * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1112  * the page is tagged dirty in its radix tree.
1113  *
1114  * At all times, the dirtiness of the buffers represents the dirtiness of
1115  * subsections of the page.  If the page has buffers, the page dirty bit is
1116  * merely a hint about the true dirty state.
1117  *
1118  * When a page is set dirty in its entirety, all its buffers are marked dirty
1119  * (if the page has buffers).
1120  *
1121  * When a buffer is marked dirty, its page is dirtied, but the page's other
1122  * buffers are not.
1123  *
1124  * Also.  When blockdev buffers are explicitly read with bread(), they
1125  * individually become uptodate.  But their backing page remains not
1126  * uptodate - even if all of its buffers are uptodate.  A subsequent
1127  * block_read_full_page() against that page will discover all the uptodate
1128  * buffers, will set the page uptodate and will perform no I/O.
1129  */
1130 
/**
 * mark_buffer_dirty - mark a buffer_head as needing writeout
 * @bh: the buffer_head to mark dirty
 *
 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
 * backing page dirty, then tag the page as dirty in its address_space's radix
 * tree and then attach the address_space's inode to its superblock's dirty
 * inode list.
 *
 * mark_buffer_dirty() is atomic.  It takes bh->b_page->mapping->private_lock,
 * mapping->tree_lock and mapping->host->i_lock.
 */
void mark_buffer_dirty(struct buffer_head *bh)
{
	/* Dirtying a buffer that is not uptodate is suspicious: warn once. */
	WARN_ON_ONCE(!buffer_uptodate(bh));

	/*
	 * Very *carefully* optimize the it-is-already-dirty case.
	 *
	 * Don't let the final "is it dirty" escape to before we
	 * perhaps modified the buffer.
	 */
	if (buffer_dirty(bh)) {
		/* Order the re-check below after any earlier stores. */
		smp_mb();
		if (buffer_dirty(bh))
			return;
	}

	/*
	 * Only the caller that actually flips the buffer's dirty bit goes on
	 * to dirty the page, and only the caller that flips the page's dirty
	 * bit calls __set_page_dirty().  A page without a mapping is left at
	 * having its dirty flag set.
	 */
	if (!test_set_buffer_dirty(bh)) {
		struct page *page = bh->b_page;
		if (!TestSetPageDirty(page)) {
			struct address_space *mapping = page_mapping(page);
			if (mapping)
				__set_page_dirty(page, mapping, 0);
		}
	}
}
EXPORT_SYMBOL(mark_buffer_dirty);
1169 
1170 /*
1171  * Decrement a buffer_head's reference count.  If all buffers against a page
1172  * have zero reference count, are clean and unlocked, and if the page is clean
1173  * and unlocked then try_to_free_buffers() may strip the buffers from the page
1174  * in preparation for freeing it (sometimes, rarely, buffers are removed from
1175  * a page but it ends up not being freed, and buffers may later be reattached).
1176  */
__brelse(struct buffer_head * buf)1177 void __brelse(struct buffer_head * buf)
1178 {
1179 	if (atomic_read(&buf->b_count)) {
1180 		put_bh(buf);
1181 		return;
1182 	}
1183 	WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1184 }
1185 EXPORT_SYMBOL(__brelse);
1186 
1187 /*
1188  * bforget() is like brelse(), except it discards any
1189  * potentially dirty data.
1190  */
__bforget(struct buffer_head * bh)1191 void __bforget(struct buffer_head *bh)
1192 {
1193 	clear_buffer_dirty(bh);
1194 	if (bh->b_assoc_map) {
1195 		struct address_space *buffer_mapping = bh->b_page->mapping;
1196 
1197 		spin_lock(&buffer_mapping->private_lock);
1198 		list_del_init(&bh->b_assoc_buffers);
1199 		bh->b_assoc_map = NULL;
1200 		spin_unlock(&buffer_mapping->private_lock);
1201 	}
1202 	__brelse(bh);
1203 }
1204 EXPORT_SYMBOL(__bforget);
1205 
__bread_slow(struct buffer_head * bh)1206 static struct buffer_head *__bread_slow(struct buffer_head *bh)
1207 {
1208 	lock_buffer(bh);
1209 	if (buffer_uptodate(bh)) {
1210 		unlock_buffer(bh);
1211 		return bh;
1212 	} else {
1213 		get_bh(bh);
1214 		bh->b_end_io = end_buffer_read_sync;
1215 		submit_bh(READ, bh);
1216 		wait_on_buffer(bh);
1217 		if (buffer_uptodate(bh))
1218 			return bh;
1219 	}
1220 	brelse(bh);
1221 	return NULL;
1222 }
1223 
/*
 * Per-cpu buffer LRU implementation.  To reduce the cost of __find_get_block().
 * The bhs[] array is sorted - newest buffer is at bhs[0].  Buffers have their
 * refcount elevated by one when they're in an LRU.  A buffer can only appear
 * once in a particular CPU's LRU.  A single buffer can be present in multiple
 * CPU's LRUs at the same time.
 *
 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
 * sb_find_get_block().
 *
 * The LRUs themselves only need locking against invalidate_bh_lrus.  We use
 * a local interrupt disable for that.
 */

/* Number of buffer_head slots in each CPU's LRU. */
#define BH_LRU_SIZE	8

/* One CPU's LRU; a NULL entry is an empty slot. */
struct bh_lru {
	struct buffer_head *bhs[BH_LRU_SIZE];
};

/* Every CPU starts out with an empty LRU. */
static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};

/*
 * Guard the local CPU's LRU: on SMP builds interrupts are disabled,
 * on UP builds only preemption is disabled.
 */
#ifdef CONFIG_SMP
#define bh_lru_lock()	local_irq_disable()
#define bh_lru_unlock()	local_irq_enable()
#else
#define bh_lru_lock()	preempt_disable()
#define bh_lru_unlock()	preempt_enable()
#endif
1253 
/*
 * Sanity check for the LRU paths: it is a bug to get here with interrupts
 * disabled.  Compiles to nothing when irqs_disabled is not available as a
 * macro.
 */
static inline void check_irqs_on(void)
{
#if defined(irqs_disabled)
	if (unlikely(irqs_disabled()))
		BUG();
#endif
}
1260 
/*
 * The LRU management algorithm is dopey-but-simple.  Sorry.
 *
 * Put @bh at the front of this CPU's LRU, unless it is already there.
 * The LRU takes its own reference on @bh.  If @bh was already present in
 * another slot, that older reference is dropped.  If the LRU was full, the
 * entry that no longer fits is evicted and released after the LRU lock has
 * been dropped.
 */
static void bh_lru_install(struct buffer_head *bh)
{
	struct buffer_head *evictee = NULL;

	check_irqs_on();
	bh_lru_lock();
	if (__this_cpu_read(bh_lrus.bhs[0]) != bh) {
		/* Build the new ordering in a scratch array, then copy it in. */
		struct buffer_head *bhs[BH_LRU_SIZE];
		int in;
		int out = 0;

		/* Reference held by the LRU for the new front entry. */
		get_bh(bh);
		bhs[out++] = bh;
		for (in = 0; in < BH_LRU_SIZE; in++) {
			struct buffer_head *bh2 =
				__this_cpu_read(bh_lrus.bhs[in]);

			if (bh2 == bh) {
				/* Old copy of @bh: drop the reference it held. */
				__brelse(bh2);
			} else {
				if (out >= BH_LRU_SIZE) {
					/* At most one entry can fall off the end. */
					BUG_ON(evictee != NULL);
					evictee = bh2;
				} else {
					bhs[out++] = bh2;
				}
			}
		}
		/* Pad the tail with empty slots. */
		while (out < BH_LRU_SIZE)
			bhs[out++] = NULL;
		memcpy(__this_cpu_ptr(&bh_lrus.bhs), bhs, sizeof(bhs));
	}
	bh_lru_unlock();

	/* Release the evicted entry's LRU reference outside the lock. */
	if (evictee)
		__brelse(evictee);
}
1301 
1302 /*
1303  * Look up the bh in this cpu's LRU.  If it's there, move it to the head.
1304  */
1305 static struct buffer_head *
lookup_bh_lru(struct block_device * bdev,sector_t block,unsigned size)1306 lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1307 {
1308 	struct buffer_head *ret = NULL;
1309 	unsigned int i;
1310 
1311 	check_irqs_on();
1312 	bh_lru_lock();
1313 	for (i = 0; i < BH_LRU_SIZE; i++) {
1314 		struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);
1315 
1316 		if (bh && bh->b_bdev == bdev &&
1317 				bh->b_blocknr == block && bh->b_size == size) {
1318 			if (i) {
1319 				while (i) {
1320 					__this_cpu_write(bh_lrus.bhs[i],
1321 						__this_cpu_read(bh_lrus.bhs[i - 1]));
1322 					i--;
1323 				}
1324 				__this_cpu_write(bh_lrus.bhs[0], bh);
1325 			}
1326 			get_bh(bh);
1327 			ret = bh;
1328 			break;
1329 		}
1330 	}
1331 	bh_lru_unlock();
1332 	return ret;
1333 }
1334 
1335 /*
1336  * Perform a pagecache lookup for the matching buffer.  If it's there, refresh
1337  * it in the LRU and mark it as accessed.  If it is not present then return
1338  * NULL
1339  */
1340 struct buffer_head *
__find_get_block(struct block_device * bdev,sector_t block,unsigned size)1341 __find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1342 {
1343 	struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1344 
1345 	if (bh == NULL) {
1346 		bh = __find_get_block_slow(bdev, block);
1347 		if (bh)
1348 			bh_lru_install(bh);
1349 	}
1350 	if (bh)
1351 		touch_buffer(bh);
1352 	return bh;
1353 }
1354 EXPORT_SYMBOL(__find_get_block);
1355 
1356 /*
1357  * __getblk will locate (and, if necessary, create) the buffer_head
1358  * which corresponds to the passed block_device, block and size. The
1359  * returned buffer has its reference count incremented.
1360  *
1361  * __getblk() cannot fail - it just keeps trying.  If you pass it an
1362  * illegal block number, __getblk() will happily return a buffer_head
1363  * which represents the non-existent block.  Very weird.
1364  *
1365  * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1366  * attempt is failing.  FIXME, perhaps?
1367  */
1368 struct buffer_head *
__getblk(struct block_device * bdev,sector_t block,unsigned size)1369 __getblk(struct block_device *bdev, sector_t block, unsigned size)
1370 {
1371 	struct buffer_head *bh = __find_get_block(bdev, block, size);
1372 
1373 	might_sleep();
1374 	if (bh == NULL)
1375 		bh = __getblk_slow(bdev, block, size);
1376 	return bh;
1377 }
1378 EXPORT_SYMBOL(__getblk);
1379 
1380 /*
1381  * Do async read-ahead on a buffer..
1382  */
__breadahead(struct block_device * bdev,sector_t block,unsigned size)1383 void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1384 {
1385 	struct buffer_head *bh = __getblk(bdev, block, size);
1386 	if (likely(bh)) {
1387 		ll_rw_block(READA, 1, &bh);
1388 		brelse(bh);
1389 	}
1390 }
1391 EXPORT_SYMBOL(__breadahead);
1392 
1393 /**
1394  *  __bread() - reads a specified block and returns the bh
1395  *  @bdev: the block_device to read from
1396  *  @block: number of block
1397  *  @size: size (in bytes) to read
1398  *
1399  *  Reads a specified block, and returns buffer head that contains it.
1400  *  It returns NULL if the block was unreadable.
1401  */
1402 struct buffer_head *
__bread(struct block_device * bdev,sector_t block,unsigned size)1403 __bread(struct block_device *bdev, sector_t block, unsigned size)
1404 {
1405 	struct buffer_head *bh = __getblk(bdev, block, size);
1406 
1407 	if (likely(bh) && !buffer_uptodate(bh))
1408 		bh = __bread_slow(bh);
1409 	return bh;
1410 }
1411 EXPORT_SYMBOL(__bread);
1412 
1413 /*
1414  * invalidate_bh_lrus() is called rarely - but not only at unmount.
1415  * This doesn't race because it runs in each cpu either in irq
1416  * or with preempt disabled.
1417  */
invalidate_bh_lru(void * arg)1418 static void invalidate_bh_lru(void *arg)
1419 {
1420 	struct bh_lru *b = &get_cpu_var(bh_lrus);
1421 	int i;
1422 
1423 	for (i = 0; i < BH_LRU_SIZE; i++) {
1424 		brelse(b->bhs[i]);
1425 		b->bhs[i] = NULL;
1426 	}
1427 	put_cpu_var(bh_lrus);
1428 }
1429 
/*
 * Empty the buffer LRU of every CPU by running invalidate_bh_lru() on
 * each of them.
 * NOTE(review): the final argument (1) presumably asks on_each_cpu() to
 * wait until all CPUs have finished - confirm against its definition.
 */
void invalidate_bh_lrus(void)
{
	on_each_cpu(invalidate_bh_lru, NULL, 1);
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1435 
set_bh_page(struct buffer_head * bh,struct page * page,unsigned long offset)1436 void set_bh_page(struct buffer_head *bh,
1437 		struct page *page, unsigned long offset)
1438 {
1439 	bh->b_page = page;
1440 	BUG_ON(offset >= PAGE_SIZE);
1441 	if (PageHighMem(page))
1442 		/*
1443 		 * This catches illegal uses and preserves the offset:
1444 		 */
1445 		bh->b_data = (char *)(0 + offset);
1446 	else
1447 		bh->b_data = page_address(page) + offset;
1448 }
1449 EXPORT_SYMBOL(set_bh_page);
1450 
/*
 * Called when truncating a buffer on a page completely.
 *
 * With the buffer locked, drop its dirty state and its association with
 * a block device, and clear the mapped, req, new, delay and unwritten
 * state bits.
 */
static void discard_buffer(struct buffer_head * bh)
{
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	bh->b_bdev = NULL;
	clear_buffer_mapped(bh);
	clear_buffer_req(bh);
	clear_buffer_new(bh);
	clear_buffer_delay(bh);
	clear_buffer_unwritten(bh);
	unlock_buffer(bh);
}
1466 
1467 /**
1468  * block_invalidatepage - invalidate part of all of a buffer-backed page
1469  *
1470  * @page: the page which is affected
1471  * @offset: the index of the truncation point
1472  *
1473  * block_invalidatepage() is called when all or part of the page has become
1474  * invalidatedby a truncate operation.
1475  *
1476  * block_invalidatepage() does not have to release all buffers, but it must
1477  * ensure that no dirty buffer is left outside @offset and that no I/O
1478  * is underway against any of the blocks which are outside the truncation
1479  * point.  Because the caller is about to free (and possibly reuse) those
1480  * blocks on-disk.
1481  */
block_invalidatepage(struct page * page,unsigned long offset)1482 void block_invalidatepage(struct page *page, unsigned long offset)
1483 {
1484 	struct buffer_head *head, *bh, *next;
1485 	unsigned int curr_off = 0;
1486 
1487 	BUG_ON(!PageLocked(page));
1488 	if (!page_has_buffers(page))
1489 		goto out;
1490 
1491 	head = page_buffers(page);
1492 	bh = head;
1493 	do {
1494 		unsigned int next_off = curr_off + bh->b_size;
1495 		next = bh->b_this_page;
1496 
1497 		/*
1498 		 * is this block fully invalidated?
1499 		 */
1500 		if (offset <= curr_off)
1501 			discard_buffer(bh);
1502 		curr_off = next_off;
1503 		bh = next;
1504 	} while (bh != head);
1505 
1506 	/*
1507 	 * We release buffers only if the entire page is being invalidated.
1508 	 * The get_block cached value has been unconditionally invalidated,
1509 	 * so real IO is not possible anymore.
1510 	 */
1511 	if (offset == 0)
1512 		try_to_release_page(page, 0);
1513 out:
1514 	return;
1515 }
1516 EXPORT_SYMBOL(block_invalidatepage);
1517 
1518 /*
1519  * We attach and possibly dirty the buffers atomically wrt
1520  * __set_page_dirty_buffers() via private_lock.  try_to_free_buffers
1521  * is already excluded via the page lock.
1522  */
create_empty_buffers(struct page * page,unsigned long blocksize,unsigned long b_state)1523 void create_empty_buffers(struct page *page,
1524 			unsigned long blocksize, unsigned long b_state)
1525 {
1526 	struct buffer_head *bh, *head, *tail;
1527 
1528 	head = alloc_page_buffers(page, blocksize, 1);
1529 	bh = head;
1530 	do {
1531 		bh->b_state |= b_state;
1532 		tail = bh;
1533 		bh = bh->b_this_page;
1534 	} while (bh);
1535 	tail->b_this_page = head;
1536 
1537 	spin_lock(&page->mapping->private_lock);
1538 	if (PageUptodate(page) || PageDirty(page)) {
1539 		bh = head;
1540 		do {
1541 			if (PageDirty(page))
1542 				set_buffer_dirty(bh);
1543 			if (PageUptodate(page))
1544 				set_buffer_uptodate(bh);
1545 			bh = bh->b_this_page;
1546 		} while (bh != head);
1547 	}
1548 	attach_page_buffers(page, head);
1549 	spin_unlock(&page->mapping->private_lock);
1550 }
1551 EXPORT_SYMBOL(create_empty_buffers);
1552 
1553 /*
1554  * We are taking a block for data and we don't want any output from any
1555  * buffer-cache aliases starting from return from that function and
1556  * until the moment when something will explicitly mark the buffer
1557  * dirty (hopefully that will not happen until we will free that block ;-)
1558  * We don't even need to mark it not-uptodate - nobody can expect
1559  * anything from a newly allocated buffer anyway. We used to used
1560  * unmap_buffer() for such invalidation, but that was wrong. We definitely
1561  * don't want to mark the alias unmapped, for example - it would confuse
1562  * anyone who might pick it with bread() afterwards...
1563  *
1564  * Also..  Note that bforget() doesn't lock the buffer.  So there can
1565  * be writeout I/O going on against recently-freed buffers.  We don't
1566  * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1567  * only if we really need to.  That happens here.
1568  */
unmap_underlying_metadata(struct block_device * bdev,sector_t block)1569 void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1570 {
1571 	struct buffer_head *old_bh;
1572 
1573 	might_sleep();
1574 
1575 	old_bh = __find_get_block_slow(bdev, block);
1576 	if (old_bh) {
1577 		clear_buffer_dirty(old_bh);
1578 		wait_on_buffer(old_bh);
1579 		clear_buffer_req(old_bh);
1580 		__brelse(old_bh);
1581 	}
1582 }
1583 EXPORT_SYMBOL(unmap_underlying_metadata);
1584 
1585 /*
1586  * NOTE! All mapped/uptodate combinations are valid:
1587  *
1588  *	Mapped	Uptodate	Meaning
1589  *
1590  *	No	No		"unknown" - must do get_block()
1591  *	No	Yes		"hole" - zero-filled
1592  *	Yes	No		"allocated" - allocated on disk, not read in
1593  *	Yes	Yes		"valid" - allocated and up-to-date in memory.
1594  *
1595  * "Dirty" is valid only with the last case (mapped+uptodate).
1596  */
1597 
/*
 * While block_write_full_page is writing back the dirty buffers under
 * the page lock, whoever dirtied the buffers may decide to clean them
 * again at any time.  We handle that by only looking at the buffer
 * state inside lock_buffer().
 *
 * If block_write_full_page() is called for regular writeback
 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
 * locked buffer.   This only can happen if someone has written the buffer
 * directly, with submit_bh().  At the address_space level PageWriteback
 * prevents this contention from occurring.
 *
 * If block_write_full_page() is called with wbc->sync_mode ==
 * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this
 * causes the writes to be flagged as synchronous writes.
 *
 * @inode: inode the page belongs to; supplies i_size and the block size
 * @page: locked page to write; it is unlocked before returning
 * @get_block: callback used to map dirty buffers that are unmapped or delayed
 * @wbc: writeback control; sync_mode selects blocking and the write op
 * @handler: end_io handler installed on every buffer submitted
 *
 * Returns 0 on success or the error returned by @get_block.  On error
 * the buffers that are mapped and dirty are still submitted, and the
 * error is recorded on the page and its mapping.
 */
static int __block_write_full_page(struct inode *inode, struct page *page,
			get_block_t *get_block, struct writeback_control *wbc,
			bh_end_io_t *handler)
{
	int err;
	sector_t block;
	sector_t last_block;
	struct buffer_head *bh, *head;
	const unsigned blocksize = 1 << inode->i_blkbits;
	int nr_underway = 0;
	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
			WRITE_SYNC : WRITE);

	BUG_ON(!PageLocked(page));

	/* Index of the last block that lies inside i_size. */
	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;

	if (!page_has_buffers(page)) {
		create_empty_buffers(page, blocksize,
					(1 << BH_Dirty)|(1 << BH_Uptodate));
	}

	/*
	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
	 * here, and the (potentially unmapped) buffers may become dirty at
	 * any time.  If a buffer becomes dirty here after we've inspected it
	 * then we just miss that fact, and the page stays dirty.
	 *
	 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
	 * handle that here by just cleaning them.
	 */

	/* File block number of the first buffer on this page. */
	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
	head = page_buffers(page);
	bh = head;

	/*
	 * Get all the dirty buffers mapped to disk addresses and
	 * handle any aliases from the underlying blockdev's mapping.
	 */
	do {
		if (block > last_block) {
			/*
			 * mapped buffers outside i_size will occur, because
			 * this page can be outside i_size when there is a
			 * truncate in progress.
			 */
			/*
			 * The buffer was zeroed by block_write_full_page()
			 */
			clear_buffer_dirty(bh);
			set_buffer_uptodate(bh);
		} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
			   buffer_dirty(bh)) {
			WARN_ON(bh->b_size != blocksize);
			err = get_block(inode, block, bh, 1);
			if (err)
				goto recover;
			clear_buffer_delay(bh);
			if (buffer_new(bh)) {
				/* blockdev mappings never come here */
				clear_buffer_new(bh);
				unmap_underlying_metadata(bh->b_bdev,
							bh->b_blocknr);
			}
		}
		bh = bh->b_this_page;
		block++;
	} while (bh != head);

	/*
	 * Second pass (bh is back at head): lock each mapped buffer and
	 * mark the dirty ones for async write.  Note that "continue" in a
	 * do-while jumps to the loop condition, which advances bh.
	 */
	do {
		if (!buffer_mapped(bh))
			continue;
		/*
		 * If it's a fully non-blocking write attempt and we cannot
		 * lock the buffer then redirty the page.  Note that this can
		 * potentially cause a busy-wait loop from writeback threads
		 * and kswapd activity, but those code paths have their own
		 * higher-level throttling.
		 */
		if (wbc->sync_mode != WB_SYNC_NONE) {
			lock_buffer(bh);
		} else if (!trylock_buffer(bh)) {
			redirty_page_for_writepage(wbc, page);
			continue;
		}
		if (test_clear_buffer_dirty(bh)) {
			mark_buffer_async_write_endio(bh, handler);
		} else {
			/* Clean after all: nothing to write for this buffer. */
			unlock_buffer(bh);
		}
	} while ((bh = bh->b_this_page) != head);

	/*
	 * The page and its buffers are protected by PageWriteback(), so we can
	 * drop the bh refcounts early.
	 */
	BUG_ON(PageWriteback(page));
	set_page_writeback(page);

	/* Third pass: submit everything that was marked for async write. */
	do {
		/* Sample the successor before the buffer is submitted. */
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			submit_bh(write_op, bh);
			nr_underway++;
		}
		bh = next;
	} while (bh != head);
	unlock_page(page);

	err = 0;
done:
	if (nr_underway == 0) {
		/*
		 * The page was marked dirty, but the buffers were
		 * clean.  Someone wrote them back by hand with
		 * ll_rw_block/submit_bh.  A rare case.
		 */
		end_page_writeback(page);

		/*
		 * The page and buffer_heads can be released at any time from
		 * here on.
		 */
	}
	return err;

recover:
	/*
	 * ENOSPC, or some other error.  We may already have added some
	 * blocks to the file, so we need to write these out to avoid
	 * exposing stale data.
	 * The page is currently locked and not marked for writeback
	 */
	bh = head;
	/* Recovery: lock and submit the mapped buffers */
	do {
		if (buffer_mapped(bh) && buffer_dirty(bh) &&
		    !buffer_delay(bh)) {
			lock_buffer(bh);
			mark_buffer_async_write_endio(bh, handler);
		} else {
			/*
			 * The buffer may have been set dirty during
			 * attachment to a dirty page.
			 */
			clear_buffer_dirty(bh);
		}
	} while ((bh = bh->b_this_page) != head);
	/* Record the failure on both the page and its mapping. */
	SetPageError(page);
	BUG_ON(PageWriteback(page));
	mapping_set_error(page->mapping, err);
	set_page_writeback(page);
	do {
		struct buffer_head *next = bh->b_this_page;
		if (buffer_async_write(bh)) {
			clear_buffer_dirty(bh);
			submit_bh(write_op, bh);
			nr_underway++;
		}
		bh = next;
	} while (bh != head);
	unlock_page(page);
	/* err still holds the get_block() error; "done" returns it. */
	goto done;
}
1779 
1780 /*
1781  * If a page has any new buffers, zero them out here, and mark them uptodate
1782  * and dirty so they'll be written out (in order to prevent uninitialised
1783  * block data from leaking). And clear the new bit.
1784  */
page_zero_new_buffers(struct page * page,unsigned from,unsigned to)1785 void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1786 {
1787 	unsigned int block_start, block_end;
1788 	struct buffer_head *head, *bh;
1789 
1790 	BUG_ON(!PageLocked(page));
1791 	if (!page_has_buffers(page))
1792 		return;
1793 
1794 	bh = head = page_buffers(page);
1795 	block_start = 0;
1796 	do {
1797 		block_end = block_start + bh->b_size;
1798 
1799 		if (buffer_new(bh)) {
1800 			if (block_end > from && block_start < to) {
1801 				if (!PageUptodate(page)) {
1802 					unsigned start, size;
1803 
1804 					start = max(from, block_start);
1805 					size = min(to, block_end) - start;
1806 
1807 					zero_user(page, start, size);
1808 					set_buffer_uptodate(bh);
1809 				}
1810 
1811 				clear_buffer_new(bh);
1812 				mark_buffer_dirty(bh);
1813 			}
1814 		}
1815 
1816 		block_start = block_end;
1817 		bh = bh->b_this_page;
1818 	} while (bh != head);
1819 }
1820 EXPORT_SYMBOL(page_zero_new_buffers);
1821 
/*
 * Prepare the buffers of a locked page for a write of @len bytes.
 *
 * @page: locked page that will be written to
 * @pos: file position of the write; only its offset within the page is used
 * @len: length of the write; the range must not run past the end of the page
 * @get_block: callback used (with create == 1) to map unmapped buffers
 *
 * Buffers are created if the page has none.  Each buffer overlapping the
 * write range is mapped if necessary; a newly allocated block has the
 * part outside the write range zeroed, and a partially overwritten buffer
 * that is not uptodate is read in.
 *
 * Returns 0 on success, the error from @get_block, or -EIO if a read
 * issued here did not leave its buffer uptodate.  On error, new buffers
 * in the range are zeroed through page_zero_new_buffers() and the page's
 * uptodate flag is cleared.
 */
int __block_write_begin(struct page *page, loff_t pos, unsigned len,
		get_block_t *get_block)
{
	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
	unsigned to = from + len;
	struct inode *inode = page->mapping->host;
	unsigned block_start, block_end;
	sector_t block;
	int err = 0;
	unsigned blocksize, bbits;
	/*
	 * Buffers with a read in flight.  Only a block straddling @from or
	 * one straddling @to can need a read, hence two slots.
	 */
	struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

	BUG_ON(!PageLocked(page));
	BUG_ON(from > PAGE_CACHE_SIZE);
	BUG_ON(to > PAGE_CACHE_SIZE);
	BUG_ON(from > to);

	blocksize = 1 << inode->i_blkbits;
	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);
	head = page_buffers(page);

	bbits = inode->i_blkbits;
	/* File block number of the first buffer on this page. */
	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);

	/*
	 * One trip round the buffer ring: "!block_start" lets the first
	 * iteration through while bh still equals head.
	 */
	for(bh = head, block_start = 0; bh != head || !block_start;
	    block++, block_start=block_end, bh = bh->b_this_page) {
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			/* Outside the write: only sync the uptodate state. */
			if (PageUptodate(page)) {
				if (!buffer_uptodate(bh))
					set_buffer_uptodate(bh);
			}
			continue;
		}
		if (buffer_new(bh))
			clear_buffer_new(bh);
		if (!buffer_mapped(bh)) {
			WARN_ON(bh->b_size != blocksize);
			err = get_block(inode, block, bh, 1);
			if (err)
				break;
			if (buffer_new(bh)) {
				unmap_underlying_metadata(bh->b_bdev,
							bh->b_blocknr);
				if (PageUptodate(page)) {
					clear_buffer_new(bh);
					set_buffer_uptodate(bh);
					mark_buffer_dirty(bh);
					continue;
				}
				/*
				 * Freshly allocated block: zero the parts
				 * of it that the write will not cover.
				 */
				if (block_end > to || block_start < from)
					zero_user_segments(page,
						to, block_end,
						block_start, from);
				continue;
			}
		}
		if (PageUptodate(page)) {
			if (!buffer_uptodate(bh))
				set_buffer_uptodate(bh);
			continue;
		}
		/* Partially overwritten and not uptodate: read it in first. */
		if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
		    !buffer_unwritten(bh) &&
		     (block_start < from || block_end > to)) {
			ll_rw_block(READ, 1, &bh);
			*wait_bh++=bh;
		}
	}
	/*
	 * If we issued read requests - let them complete.
	 */
	while(wait_bh > wait) {
		wait_on_buffer(*--wait_bh);
		if (!buffer_uptodate(*wait_bh))
			err = -EIO;
	}
	if (unlikely(err)) {
		page_zero_new_buffers(page, from, to);
		ClearPageUptodate(page);
	}
	return err;
}
EXPORT_SYMBOL(__block_write_begin);
1907 
__block_commit_write(struct inode * inode,struct page * page,unsigned from,unsigned to)1908 static int __block_commit_write(struct inode *inode, struct page *page,
1909 		unsigned from, unsigned to)
1910 {
1911 	unsigned block_start, block_end;
1912 	int partial = 0;
1913 	unsigned blocksize;
1914 	struct buffer_head *bh, *head;
1915 
1916 	blocksize = 1 << inode->i_blkbits;
1917 
1918 	for(bh = head = page_buffers(page), block_start = 0;
1919 	    bh != head || !block_start;
1920 	    block_start=block_end, bh = bh->b_this_page) {
1921 		block_end = block_start + blocksize;
1922 		if (block_end <= from || block_start >= to) {
1923 			if (!buffer_uptodate(bh))
1924 				partial = 1;
1925 		} else {
1926 			set_buffer_uptodate(bh);
1927 			mark_buffer_dirty(bh);
1928 		}
1929 		clear_buffer_new(bh);
1930 	}
1931 
1932 	/*
1933 	 * If this is a partial write which happened to make all buffers
1934 	 * uptodate then we can optimize away a bogus readpage() for
1935 	 * the next read(). Here we 'discover' whether the page went
1936 	 * uptodate as a result of this (potentially partial) write.
1937 	 */
1938 	if (!partial)
1939 		SetPageUptodate(page);
1940 	return 0;
1941 }
1942 
1943 /*
1944  * block_write_begin takes care of the basic task of block allocation and
1945  * bringing partial write blocks uptodate first.
1946  *
1947  * The filesystem needs to handle block truncation upon failure.
1948  */
block_write_begin(struct address_space * mapping,loff_t pos,unsigned len,unsigned flags,struct page ** pagep,get_block_t * get_block)1949 int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
1950 		unsigned flags, struct page **pagep, get_block_t *get_block)
1951 {
1952 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1953 	struct page *page;
1954 	int status;
1955 
1956 	page = grab_cache_page_write_begin(mapping, index, flags);
1957 	if (!page)
1958 		return -ENOMEM;
1959 
1960 	status = __block_write_begin(page, pos, len, get_block);
1961 	if (unlikely(status)) {
1962 		unlock_page(page);
1963 		page_cache_release(page);
1964 		page = NULL;
1965 	}
1966 
1967 	*pagep = page;
1968 	return status;
1969 }
1970 EXPORT_SYMBOL(block_write_begin);
1971 
/*
 * block_write_end - finish a buffered write into a page with buffers.
 *
 * @len is the length that was prepared, @copied the number of bytes that
 * were actually copied into the page.  Returns the number of bytes that
 * were committed, which is forced to 0 for a short copy into a page that
 * is not uptodate.  The page is neither unlocked nor released here.
 */
int block_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *host = mapping->host;
	const unsigned start = pos & (PAGE_CACHE_SIZE - 1);

	if (unlikely(copied < len)) {
		/*
		 * Buffers that were completely written are uptodate now, so
		 * a later readpage cannot clobber them.  A buffer that was
		 * only partly filled by a short copy is not uptodate though,
		 * and a readpage could overwrite what we just stored.
		 *
		 * Keep it simple: a short write into a non-uptodate page is
		 * treated as a zero-length write and the caller has to redo
		 * the whole thing.
		 */
		if (!PageUptodate(page))
			copied = 0;

		page_zero_new_buffers(page, start + copied, start + len);
	}
	flush_dcache_page(page);

	/* The committed range may be short, or even empty. */
	__block_commit_write(host, page, start, start + copied);

	return copied;
}
2006 EXPORT_SYMBOL(block_write_end);
2007 
/*
 * generic_write_end - commit a buffered write, update i_size and drop the
 * page.
 *
 * Calls block_write_end(), extends i_size if the write went past it, then
 * unlocks and releases the page.  Returns the number of bytes committed.
 */
int generic_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;
	int size_grew;

	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);

	/*
	 * i_size can be read directly (no i_size_read()): i_mutex is held,
	 * so it cannot change under us.
	 *
	 * The update has to happen before the page lock is dropped, otherwise
	 * page writeout could come in and zero the data beyond the old i_size.
	 */
	size_grew = pos + copied > inode->i_size;
	if (size_grew)
		i_size_write(inode, pos + copied);

	unlock_page(page);
	page_cache_release(page);

	/*
	 * Dirtying the inode is deliberately done after unlocking the page:
	 * it keeps the page lock hold time short, and it avoids imposing a
	 * page lock -> transaction start ordering on journaling filesystems.
	 */
	if (size_grew)
		mark_inode_dirty(inode);

	return copied;
}
2043 EXPORT_SYMBOL(generic_write_end);
2044 
2045 /*
2046  * block_is_partially_uptodate checks whether buffers within a page are
2047  * uptodate or not.
2048  *
2049  * Returns true if all buffers which correspond to a file portion
2050  * we want to read are uptodate.
2051  */
block_is_partially_uptodate(struct page * page,read_descriptor_t * desc,unsigned long from)2052 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2053 					unsigned long from)
2054 {
2055 	struct inode *inode = page->mapping->host;
2056 	unsigned block_start, block_end, blocksize;
2057 	unsigned to;
2058 	struct buffer_head *bh, *head;
2059 	int ret = 1;
2060 
2061 	if (!page_has_buffers(page))
2062 		return 0;
2063 
2064 	blocksize = 1 << inode->i_blkbits;
2065 	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2066 	to = from + to;
2067 	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2068 		return 0;
2069 
2070 	head = page_buffers(page);
2071 	bh = head;
2072 	block_start = 0;
2073 	do {
2074 		block_end = block_start + blocksize;
2075 		if (block_end > from && block_start < to) {
2076 			if (!buffer_uptodate(bh)) {
2077 				ret = 0;
2078 				break;
2079 			}
2080 			if (block_end >= to)
2081 				break;
2082 		}
2083 		block_start = block_end;
2084 		bh = bh->b_this_page;
2085 	} while (bh != head);
2086 
2087 	return ret;
2088 }
2089 EXPORT_SYMBOL(block_is_partially_uptodate);
2090 
/*
 * Generic "read page" function for block devices that have the normal
 * get_block functionality. This is most of the block device filesystems.
 * Reads the page asynchronously --- the unlock_buffer() and
 * set/clear_buffer_uptodate() functions propagate buffer state into the
 * page struct once IO has completed.
 *
 * The page must be locked on entry.  The function always returns 0; a
 * get_block() failure is recorded on the page with SetPageError() instead.
 * If no buffer needs IO the page is unlocked here; otherwise unlocking is
 * left to the read completion path (presumably end_buffer_async_read() -
 * not visible in this function).
 */
int block_read_full_page(struct page *page, get_block_t *get_block)
{
	struct inode *inode = page->mapping->host;
	sector_t iblock, lblock;
	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
	unsigned int blocksize;
	int nr, i;
	int fully_mapped = 1;

	BUG_ON(!PageLocked(page));
	blocksize = 1 << inode->i_blkbits;
	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);
	head = page_buffers(page);

	/* iblock: first file block of this page; lblock: first block past EOF */
	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
	bh = head;
	nr = 0;
	i = 0;

	/*
	 * Stage one: collect in arr[] the buffers that need to be read.
	 * Note that "continue" in a do/while jumps to the loop condition, so
	 * i, iblock and bh are still advanced for skipped buffers.
	 */
	do {
		if (buffer_uptodate(bh))
			continue;

		if (!buffer_mapped(bh)) {
			int err = 0;

			fully_mapped = 0;
			/* Only ask for a mapping for blocks inside i_size */
			if (iblock < lblock) {
				WARN_ON(bh->b_size != blocksize);
				err = get_block(inode, iblock, bh, 0);
				if (err)
					SetPageError(page);
			}
			/*
			 * Still unmapped (hole, beyond EOF, or mapping
			 * failure): hand back zeroes, but only claim the
			 * buffer is uptodate if get_block() did not fail.
			 */
			if (!buffer_mapped(bh)) {
				zero_user(page, i * blocksize, blocksize);
				if (!err)
					set_buffer_uptodate(bh);
				continue;
			}
			/*
			 * get_block() might have updated the buffer
			 * synchronously
			 */
			if (buffer_uptodate(bh))
				continue;
		}
		arr[nr++] = bh;
	} while (i++, iblock++, (bh = bh->b_this_page) != head);

	if (fully_mapped)
		SetPageMappedToDisk(page);

	if (!nr) {
		/*
		 * All buffers are uptodate - we can set the page uptodate
		 * as well. But not if get_block() returned an error.
		 */
		if (!PageError(page))
			SetPageUptodate(page);
		unlock_page(page);
		return 0;
	}

	/* Stage two: lock the buffers */
	for (i = 0; i < nr; i++) {
		bh = arr[i];
		lock_buffer(bh);
		mark_buffer_async_read(bh);
	}

	/*
	 * Stage 3: start the IO.  Check for uptodateness
	 * inside the buffer lock in case another process reading
	 * the underlying blockdev brought it uptodate (the sct fix).
	 */
	for (i = 0; i < nr; i++) {
		bh = arr[i];
		if (buffer_uptodate(bh))
			end_buffer_async_read(bh, 1);
		else
			submit_bh(READ, bh);
	}
	return 0;
}
2184 EXPORT_SYMBOL(block_read_full_page);
2185 
2186 /* utility function for filesystems that need to do work on expanding
2187  * truncates.  Uses filesystem pagecache writes to allow the filesystem to
2188  * deal with the hole.
2189  */
generic_cont_expand_simple(struct inode * inode,loff_t size)2190 int generic_cont_expand_simple(struct inode *inode, loff_t size)
2191 {
2192 	struct address_space *mapping = inode->i_mapping;
2193 	struct page *page;
2194 	void *fsdata;
2195 	int err;
2196 
2197 	err = inode_newsize_ok(inode, size);
2198 	if (err)
2199 		goto out;
2200 
2201 	err = pagecache_write_begin(NULL, mapping, size, 0,
2202 				AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2203 				&page, &fsdata);
2204 	if (err)
2205 		goto out;
2206 
2207 	err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2208 	BUG_ON(err > 0);
2209 
2210 out:
2211 	return err;
2212 }
2213 EXPORT_SYMBOL(generic_cont_expand_simple);
2214 
/*
 * Zero-fill the file between the position stored in *@bytes and @pos by
 * pushing zeroed data through the pagecache write path, one page at a time.
 *
 * *@bytes is re-read on every pass.  When zeroing starts in the middle of a
 * block, *@bytes is rounded up here to the next block boundary before the
 * write is issued.
 * NOTE(review): apart from that rounding this function never advances
 * *@bytes, so loop progress presumably relies on the filesystem updating it
 * from its write/get_block path - confirm against the callers.
 *
 * Returns 0 on success or the negative error from pagecache_write_begin() /
 * pagecache_write_end().
 */
static int cont_expand_zero(struct file *file, struct address_space *mapping,
			    loff_t pos, loff_t *bytes)
{
	struct inode *inode = mapping->host;
	unsigned blocksize = 1 << inode->i_blkbits;
	struct page *page;
	void *fsdata;
	pgoff_t index, curidx;
	loff_t curpos;
	unsigned zerofrom, offset, len;
	int err = 0;

	/* Page index and in-page offset of the position being written to */
	index = pos >> PAGE_CACHE_SHIFT;
	offset = pos & ~PAGE_CACHE_MASK;

	/* Whole pages that lie entirely before the page containing pos */
	while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
		zerofrom = curpos & ~PAGE_CACHE_MASK;
		/* Not block aligned: round *bytes up to the next block */
		if (zerofrom & (blocksize-1)) {
			*bytes |= (blocksize-1);
			(*bytes)++;
		}
		/* Zero from curpos to the end of this page */
		len = PAGE_CACHE_SIZE - zerofrom;

		err = pagecache_write_begin(file, mapping, curpos, len,
						AOP_FLAG_UNINTERRUPTIBLE,
						&page, &fsdata);
		if (err)
			goto out;
		zero_user(page, zerofrom, len);
		err = pagecache_write_end(file, mapping, curpos, len, len,
						page, fsdata);
		if (err < 0)
			goto out;
		/* A short write of the zeroes is not expected here */
		BUG_ON(err != len);
		err = 0;

		balance_dirty_pages_ratelimited(mapping);
	}

	/* page covers the boundary, find the boundary offset */
	if (index == curidx) {
		zerofrom = curpos & ~PAGE_CACHE_MASK;
		/* if we will expand the thing last block will be filled */
		if (offset <= zerofrom) {
			goto out;
		}
		if (zerofrom & (blocksize-1)) {
			*bytes |= (blocksize-1);
			(*bytes)++;
		}
		/* Zero only the gap between curpos and pos */
		len = offset - zerofrom;

		err = pagecache_write_begin(file, mapping, curpos, len,
						AOP_FLAG_UNINTERRUPTIBLE,
						&page, &fsdata);
		if (err)
			goto out;
		zero_user(page, zerofrom, len);
		err = pagecache_write_end(file, mapping, curpos, len, len,
						page, fsdata);
		if (err < 0)
			goto out;
		BUG_ON(err != len);
		err = 0;
	}
out:
	return err;
}
2283 
2284 /*
2285  * For moronic filesystems that do not allow holes in file.
2286  * We may have to extend the file.
2287  */
cont_write_begin(struct file * file,struct address_space * mapping,loff_t pos,unsigned len,unsigned flags,struct page ** pagep,void ** fsdata,get_block_t * get_block,loff_t * bytes)2288 int cont_write_begin(struct file *file, struct address_space *mapping,
2289 			loff_t pos, unsigned len, unsigned flags,
2290 			struct page **pagep, void **fsdata,
2291 			get_block_t *get_block, loff_t *bytes)
2292 {
2293 	struct inode *inode = mapping->host;
2294 	unsigned blocksize = 1 << inode->i_blkbits;
2295 	unsigned zerofrom;
2296 	int err;
2297 
2298 	err = cont_expand_zero(file, mapping, pos, bytes);
2299 	if (err)
2300 		return err;
2301 
2302 	zerofrom = *bytes & ~PAGE_CACHE_MASK;
2303 	if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2304 		*bytes |= (blocksize-1);
2305 		(*bytes)++;
2306 	}
2307 
2308 	return block_write_begin(mapping, pos, len, flags, pagep, get_block);
2309 }
2310 EXPORT_SYMBOL(cont_write_begin);
2311 
block_commit_write(struct page * page,unsigned from,unsigned to)2312 int block_commit_write(struct page *page, unsigned from, unsigned to)
2313 {
2314 	struct inode *inode = page->mapping->host;
2315 	__block_commit_write(inode,page,from,to);
2316 	return 0;
2317 }
2318 EXPORT_SYMBOL(block_commit_write);
2319 
2320 /*
2321  * block_page_mkwrite() is not allowed to change the file size as it gets
2322  * called from a page fault handler when a page is first dirtied. Hence we must
2323  * be careful to check for EOF conditions here. We set the page up correctly
2324  * for a written page which means we get ENOSPC checking when writing into
2325  * holes and correct delalloc and unwritten extent mapping on filesystems that
2326  * support these features.
2327  *
2328  * We are not allowed to take the i_mutex here so we have to play games to
2329  * protect against truncate races as the page could now be beyond EOF.  Because
2330  * truncate writes the inode size before removing pages, once we have the
2331  * page lock we can determine safely if the page is beyond EOF. If it is not
2332  * beyond EOF, then the page is guaranteed safe against truncation until we
2333  * unlock the page.
2334  */
2335 int
block_page_mkwrite(struct vm_area_struct * vma,struct vm_fault * vmf,get_block_t get_block)2336 block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
2337 		   get_block_t get_block)
2338 {
2339 	struct page *page = vmf->page;
2340 	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2341 	unsigned long end;
2342 	loff_t size;
2343 	int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
2344 
2345 	lock_page(page);
2346 	size = i_size_read(inode);
2347 	if ((page->mapping != inode->i_mapping) ||
2348 	    (page_offset(page) > size)) {
2349 		/* page got truncated out from underneath us */
2350 		unlock_page(page);
2351 		goto out;
2352 	}
2353 
2354 	/* page is wholly or partially inside EOF */
2355 	if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2356 		end = size & ~PAGE_CACHE_MASK;
2357 	else
2358 		end = PAGE_CACHE_SIZE;
2359 
2360 	ret = __block_write_begin(page, 0, end, get_block);
2361 	if (!ret)
2362 		ret = block_commit_write(page, 0, end);
2363 
2364 	if (unlikely(ret)) {
2365 		unlock_page(page);
2366 		if (ret == -ENOMEM)
2367 			ret = VM_FAULT_OOM;
2368 		else /* -ENOSPC, -EIO, etc */
2369 			ret = VM_FAULT_SIGBUS;
2370 	} else
2371 		ret = VM_FAULT_LOCKED;
2372 
2373 out:
2374 	return ret;
2375 }
2376 EXPORT_SYMBOL(block_page_mkwrite);
2377 
/*
 * End-of-IO handler for the prereads issued by nobh_write_begin().
 *
 * Those prereads are special: the buffer_heads are freed immediately, while
 * still under the page lock.  The handler therefore must not touch the bh
 * after unlocking it, which is why it only forwards to
 * __end_buffer_read_notouch().
 */
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
}
2387 
2388 /*
2389  * Attach the singly-linked list of buffers created by nobh_write_begin, to
2390  * the page (converting it to circular linked list and taking care of page
2391  * dirty races).
2392  */
attach_nobh_buffers(struct page * page,struct buffer_head * head)2393 static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2394 {
2395 	struct buffer_head *bh;
2396 
2397 	BUG_ON(!PageLocked(page));
2398 
2399 	spin_lock(&page->mapping->private_lock);
2400 	bh = head;
2401 	do {
2402 		if (PageDirty(page))
2403 			set_buffer_dirty(bh);
2404 		if (!bh->b_this_page)
2405 			bh->b_this_page = head;
2406 		bh = bh->b_this_page;
2407 	} while (bh != head);
2408 	attach_page_buffers(page, head);
2409 	spin_unlock(&page->mapping->private_lock);
2410 }
2411 
/*
 * write_begin for filesystems that try to avoid keeping buffer_heads
 * attached to their pages.
 *
 * On entry, the page is fully not uptodate.
 * On exit the page is fully uptodate in the areas outside (from,to)
 * The filesystem needs to handle block truncation upon failure.
 *
 * On success returns 0 with the locked page in *@pagep; *@fsdata is either
 * NULL or the head of a NULL terminated list of temporary buffer_heads that
 * nobh_write_end() is expected to release.  On failure the page is unlocked
 * and released, *@pagep is set to NULL and a negative errno is returned.
 */
int nobh_write_begin(struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata,
			get_block_t *get_block)
{
	struct inode *inode = mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocksize = 1 << blkbits;
	struct buffer_head *head, *bh;
	struct page *page;
	pgoff_t index;
	unsigned from, to;
	unsigned block_in_page;
	unsigned block_start, block_end;
	sector_t block_in_file;
	int nr_reads = 0;
	int ret = 0;
	int is_mapped_to_disk = 1;

	/* [from, to) is the byte range of the write within the page */
	index = pos >> PAGE_CACHE_SHIFT;
	from = pos & (PAGE_CACHE_SIZE - 1);
	to = from + len;

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;
	*pagep = page;
	*fsdata = NULL;

	/* Page already has buffers attached: use the regular buffer path */
	if (page_has_buffers(page)) {
		ret = __block_write_begin(page, pos, len, get_block);
		if (unlikely(ret))
			goto out_release;
		return ret;
	}

	/* Page is already flagged as mapped to disk: nothing to prepare */
	if (PageMappedToDisk(page))
		return 0;

	/*
	 * Allocate buffers so that we can keep track of state, and potentially
	 * attach them to the page if an error occurs. In the common case of
	 * no error, they will just be freed again without ever being attached
	 * to the page (which is all OK, because we're under the page lock).
	 *
	 * Be careful: the buffer linked list is a NULL terminated one, rather
	 * than the circular one we're used to.
	 */
	head = alloc_page_buffers(page, blocksize, 0);
	if (!head) {
		ret = -ENOMEM;
		goto out_release;
	}

	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);

	/*
	 * We loop across all blocks in the page, whether or not they are
	 * part of the affected region.  This is so we can discover if the
	 * page is fully mapped-to-disk.
	 */
	for (block_start = 0, block_in_page = 0, bh = head;
		  block_start < PAGE_CACHE_SIZE;
		  block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
		int create;

		block_end = block_start + blocksize;
		bh->b_state = 0;
		/* Only blocks starting before the end of the write are created */
		create = 1;
		if (block_start >= to)
			create = 0;
		ret = get_block(inode, block_in_file + block_in_page,
					bh, create);
		if (ret)
			goto failed;
		if (!buffer_mapped(bh))
			is_mapped_to_disk = 0;
		if (buffer_new(bh))
			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
		if (PageUptodate(page)) {
			set_buffer_uptodate(bh);
			continue;
		}
		if (buffer_new(bh) || !buffer_mapped(bh)) {
			/*
			 * NOTE(review): unlike the corresponding call in
			 * __block_write_begin(), this one is not limited to
			 * blocks that overlap [from, to).  For a block lying
			 * wholly outside that range the two segments
			 * (block_start..from and to..block_end) appear to
			 * reach into neighbouring blocks - confirm against
			 * the zero_user_segments() definition.
			 */
			zero_user_segments(page, block_start, from,
							to, block_end);
			continue;
		}
		if (buffer_uptodate(bh))
			continue;	/* reiserfs does this */
		/* Block is only partly overwritten: read the old contents */
		if (block_start < from || block_end > to) {
			lock_buffer(bh);
			bh->b_end_io = end_buffer_read_nobh;
			submit_bh(READ, bh);
			nr_reads++;
		}
	}

	if (nr_reads) {
		/*
		 * The page is locked, so these buffers are protected from
		 * any VM or truncate activity.  Hence we don't need to care
		 * for the buffer_head refcounts.
		 */
		for (bh = head; bh; bh = bh->b_this_page) {
			wait_on_buffer(bh);
			if (!buffer_uptodate(bh))
				ret = -EIO;
		}
		if (ret)
			goto failed;
	}

	if (is_mapped_to_disk)
		SetPageMappedToDisk(page);

	*fsdata = head; /* to be released by nobh_write_end */

	return 0;

failed:
	BUG_ON(!ret);
	/*
	 * Error recovery is a bit difficult. We need to zero out blocks that
	 * were newly allocated, and dirty them to ensure they get written out.
	 * Buffers need to be attached to the page at this point, otherwise
	 * the handling of potential IO errors during writeout would be hard
	 * (could try doing synchronous writeout, but what if that fails too?)
	 */
	attach_nobh_buffers(page, head);
	page_zero_new_buffers(page, from, to);

out_release:
	unlock_page(page);
	page_cache_release(page);
	*pagep = NULL;

	return ret;
}
2556 EXPORT_SYMBOL(nobh_write_begin);
2557 
/*
 * write_end counterpart of nobh_write_begin().
 *
 * @fsdata is either NULL or the NULL terminated list of temporary
 * buffer_heads handed out by nobh_write_begin(); it must not be set when the
 * page already has buffers.  On a short copy those temporary buffers are
 * attached to the page.  Whenever the page has buffers the work is handed to
 * generic_write_end(); otherwise the page is marked uptodate and dirty,
 * i_size is extended if needed, the page is unlocked and released, and the
 * temporary buffer_heads are freed.
 *
 * Returns the number of bytes committed.
 */
int nobh_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = page->mapping->host;
	struct buffer_head *head = fsdata;
	struct buffer_head *bh, *next;

	BUG_ON(fsdata != NULL && page_has_buffers(page));

	if (unlikely(copied < len) && head)
		attach_nobh_buffers(page, head);

	if (page_has_buffers(page))
		return generic_write_end(file, mapping, pos, len,
					copied, page, fsdata);

	SetPageUptodate(page);
	set_page_dirty(page);
	if (pos + copied > inode->i_size) {
		i_size_write(inode, pos + copied);
		mark_inode_dirty(inode);
	}

	unlock_page(page);
	page_cache_release(page);

	/* The temporary buffers were never attached: just free them. */
	for (bh = head; bh; bh = next) {
		next = bh->b_this_page;
		free_buffer_head(bh);
	}

	return copied;
}
2591 EXPORT_SYMBOL(nobh_write_end);
2592 
/*
 * nobh_writepage() - based on block_full_write_page() except
 * that it tries to operate without attaching bufferheads to
 * the page.
 *
 * The page is first offered to mpage_writepage(); if that returns -EAGAIN
 * the write falls back to __block_write_full_page().  A page lying fully
 * beyond i_size is simply unlocked and 0 is returned.
 */
int nobh_writepage(struct page *page, get_block_t *get_block,
			struct writeback_control *wbc)
{
	struct inode * const inode = page->mapping->host;
	loff_t i_size = i_size_read(inode);
	/* Index of the page that contains i_size */
	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
	unsigned offset;
	int ret;

	/* Is the page fully inside i_size? */
	if (page->index < end_index)
		goto out;

	/* Is the page fully outside i_size? (truncate in progress) */
	offset = i_size & (PAGE_CACHE_SIZE-1);
	if (page->index >= end_index+1 || !offset) {
		/*
		 * The page may have dirty, unmapped buffers.  For example,
		 * they may have been added in ext3_writepage().  Make them
		 * freeable here, so the page does not leak.
		 */
#if 0
		/* Not really sure about this  - do we need this ? */
		if (page->mapping->a_ops->invalidatepage)
			page->mapping->a_ops->invalidatepage(page, offset);
#endif
		unlock_page(page);
		return 0; /* don't care */
	}

	/*
	 * The page straddles i_size.  It must be zeroed out on each and every
	 * writepage invocation because it may be mmapped.  "A file is mapped
	 * in multiples of the page size.  For a file that is not a multiple of
	 * the  page size, the remaining memory is zeroed when mapped, and
	 * writes to that region are not written out to the file."
	 */
	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
out:
	ret = mpage_writepage(page, get_block, wbc);
	/* -EAGAIN: retry the write through the buffer_head based path */
	if (ret == -EAGAIN)
		ret = __block_write_full_page(inode, page, get_block, wbc,
					      end_buffer_async_write);
	return ret;
}
2643 EXPORT_SYMBOL(nobh_writepage);
2644 
/*
 * Zero the part of the block containing @from that lies at or after @from,
 * without attaching buffer_heads to the page.
 *
 * If the page has buffers (on entry, or after it had to be read in) the work
 * is handed to block_truncate_page() instead.  Returns 0 on success,
 * including the cases where @from is block aligned or the block is a hole,
 * and a negative errno otherwise.
 */
int nobh_truncate_page(struct address_space *mapping,
			loff_t from, get_block_t *get_block)
{
	pgoff_t index = from >> PAGE_CACHE_SHIFT;
	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	unsigned blocksize;
	sector_t iblock;
	unsigned length, pos;
	struct inode *inode = mapping->host;
	struct page *page;
	struct buffer_head map_bh;
	int err;

	blocksize = 1 << inode->i_blkbits;
	length = offset & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;

	/* Number of bytes from "offset" to the end of its block */
	length = blocksize - length;
	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);

	page = grab_cache_page(mapping, index);
	err = -ENOMEM;
	if (!page)
		goto out;

	if (page_has_buffers(page)) {
		/* Also entered by goto once readpage has attached buffers */
has_buffers:
		unlock_page(page);
		page_cache_release(page);
		return block_truncate_page(mapping, from, get_block);
	}

	/* Find the buffer that contains "offset" */
	pos = blocksize;
	while (offset >= pos) {
		iblock++;
		pos += blocksize;
	}

	/* On-stack buffer_head, used only to query the block mapping */
	map_bh.b_size = blocksize;
	map_bh.b_state = 0;
	err = get_block(inode, iblock, &map_bh, 0);
	if (err)
		goto unlock;
	/* unmapped? It's a hole - nothing to do */
	if (!buffer_mapped(&map_bh))
		goto unlock;

	/* Ok, it's mapped. Make sure it's up-to-date */
	if (!PageUptodate(page)) {
		err = mapping->a_ops->readpage(NULL, page);
		if (err) {
			/*
			 * Only the reference is dropped here, not the lock.
			 * NOTE(review): presumably ->readpage() has already
			 * unlocked the page on this path - confirm.
			 */
			page_cache_release(page);
			goto out;
		}
		/*
		 * Take the page lock again before inspecting the page.
		 * NOTE(review): presumably this waits for the read to
		 * complete - confirm.
		 */
		lock_page(page);
		if (!PageUptodate(page)) {
			err = -EIO;
			goto unlock;
		}
		if (page_has_buffers(page))
			goto has_buffers;
	}
	zero_user(page, offset, length);
	set_page_dirty(page);
	err = 0;

unlock:
	unlock_page(page);
	page_cache_release(page);
out:
	return err;
}
2721 EXPORT_SYMBOL(nobh_truncate_page);
2722 
/*
 * Zero the part of the block containing @from that lies at or after @from,
 * using the buffer_heads attached to the page (they are created if missing).
 *
 * If the buffer is not uptodate it is read in first, so that the data in
 * front of @from stays intact.  Returns 0 on success - which includes @from
 * being block aligned and the block being a hole - -ENOMEM if the page could
 * not be obtained, -EIO on a read failure, or the error from get_block().
 */
int block_truncate_page(struct address_space *mapping,
			loff_t from, get_block_t *get_block)
{
	struct inode *inode = mapping->host;
	const pgoff_t index = from >> PAGE_CACHE_SHIFT;
	const unsigned offset = from & (PAGE_CACHE_SIZE-1);
	const unsigned blocksize = 1 << inode->i_blkbits;
	unsigned length, pos;
	sector_t iblock;
	struct page *page;
	struct buffer_head *bh;
	int err;

	/* Already on a block boundary: there is no tail to clear. */
	if (!(offset & (blocksize - 1)))
		return 0;

	length = blocksize - (offset & (blocksize - 1));
	iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);

	page = grab_cache_page(mapping, index);
	if (!page)
		return -ENOMEM;

	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);

	/* Walk to the buffer (and file block) that contains "offset". */
	bh = page_buffers(page);
	for (pos = blocksize; offset >= pos; pos += blocksize) {
		bh = bh->b_this_page;
		iblock++;
	}

	err = 0;
	if (!buffer_mapped(bh)) {
		WARN_ON(bh->b_size != blocksize);
		err = get_block(inode, iblock, bh, 0);
		if (err)
			goto unlock;
		/* Still unmapped, so it is a hole and there is nothing to zero. */
		if (!buffer_mapped(bh))
			goto unlock;
	}

	/* The block is mapped; make sure its contents are uptodate. */
	if (PageUptodate(page))
		set_buffer_uptodate(bh);

	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
		err = -EIO;
		ll_rw_block(READ, 1, &bh);
		wait_on_buffer(bh);
		/* Read error: give up and report -EIO. */
		if (!buffer_uptodate(bh))
			goto unlock;
	}

	zero_user(page, offset, length);
	mark_buffer_dirty(bh);
	err = 0;

unlock:
	unlock_page(page);
	page_cache_release(page);
	return err;
}
2797 EXPORT_SYMBOL(block_truncate_page);
2798 
2799 /*
2800  * The generic ->writepage function for buffer-backed address_spaces
2801  * this form passes in the end_io handler used to finish the IO.
2802  */
block_write_full_page_endio(struct page * page,get_block_t * get_block,struct writeback_control * wbc,bh_end_io_t * handler)2803 int block_write_full_page_endio(struct page *page, get_block_t *get_block,
2804 			struct writeback_control *wbc, bh_end_io_t *handler)
2805 {
2806 	struct inode * const inode = page->mapping->host;
2807 	loff_t i_size = i_size_read(inode);
2808 	const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2809 	unsigned offset;
2810 
2811 	/* Is the page fully inside i_size? */
2812 	if (page->index < end_index)
2813 		return __block_write_full_page(inode, page, get_block, wbc,
2814 					       handler);
2815 
2816 	/* Is the page fully outside i_size? (truncate in progress) */
2817 	offset = i_size & (PAGE_CACHE_SIZE-1);
2818 	if (page->index >= end_index+1 || !offset) {
2819 		/*
2820 		 * The page may have dirty, unmapped buffers.  For example,
2821 		 * they may have been added in ext3_writepage().  Make them
2822 		 * freeable here, so the page does not leak.
2823 		 */
2824 		do_invalidatepage(page, 0);
2825 		unlock_page(page);
2826 		return 0; /* don't care */
2827 	}
2828 
2829 	/*
2830 	 * The page straddles i_size.  It must be zeroed out on each and every
2831 	 * writepage invocation because it may be mmapped.  "A file is mapped
2832 	 * in multiples of the page size.  For a file that is not a multiple of
2833 	 * the  page size, the remaining memory is zeroed when mapped, and
2834 	 * writes to that region are not written out to the file."
2835 	 */
2836 	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
2837 	return __block_write_full_page(inode, page, get_block, wbc, handler);
2838 }
2839 EXPORT_SYMBOL(block_write_full_page_endio);
2840 
/*
 * The generic ->writepage function for buffer-backed address_spaces.
 *
 * Same as block_write_full_page_endio() with end_buffer_async_write as the
 * IO completion handler; returns whatever that call returns.
 */
int block_write_full_page(struct page *page, get_block_t *get_block,
			struct writeback_control *wbc)
{
	return block_write_full_page_endio(page, get_block, wbc,
					   end_buffer_async_write);
}
2850 EXPORT_SYMBOL(block_write_full_page);
2851 
/*
 * Look up file block @block through @get_block (without creating it) and
 * return the block number it stored in the buffer_head.
 *
 * The return value of get_block() is not checked; b_blocknr is preset to 0,
 * so 0 is returned whenever get_block() leaves it untouched.
 */
sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
			    get_block_t *get_block)
{
	struct inode *inode = mapping->host;
	struct buffer_head map;

	/* Scratch buffer_head: only these three fields are initialised. */
	map.b_size = 1 << inode->i_blkbits;
	map.b_blocknr = 0;
	map.b_state = 0;

	get_block(inode, block, &map, 0);

	return map.b_blocknr;
}
2863 EXPORT_SYMBOL(generic_block_bmap);
2864 
end_bio_bh_io_sync(struct bio * bio,int err)2865 static void end_bio_bh_io_sync(struct bio *bio, int err)
2866 {
2867 	struct buffer_head *bh = bio->bi_private;
2868 
2869 	if (err == -EOPNOTSUPP) {
2870 		set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2871 	}
2872 
2873 	if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
2874 		set_bit(BH_Quiet, &bh->b_state);
2875 
2876 	bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2877 	bio_put(bio);
2878 }
2879 
submit_bh(int rw,struct buffer_head * bh)2880 int submit_bh(int rw, struct buffer_head * bh)
2881 {
2882 	struct bio *bio;
2883 	int ret = 0;
2884 
2885 	BUG_ON(!buffer_locked(bh));
2886 	BUG_ON(!buffer_mapped(bh));
2887 	BUG_ON(!bh->b_end_io);
2888 	BUG_ON(buffer_delay(bh));
2889 	BUG_ON(buffer_unwritten(bh));
2890 
2891 	/*
2892 	 * Only clear out a write error when rewriting
2893 	 */
2894 	if (test_set_buffer_req(bh) && (rw & WRITE))
2895 		clear_buffer_write_io_error(bh);
2896 
2897 	/*
2898 	 * from here on down, it's all bio -- do the initial mapping,
2899 	 * submit_bio -> generic_make_request may further map this bio around
2900 	 */
2901 	bio = bio_alloc(GFP_NOIO, 1);
2902 
2903 	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2904 	bio->bi_bdev = bh->b_bdev;
2905 	bio->bi_io_vec[0].bv_page = bh->b_page;
2906 	bio->bi_io_vec[0].bv_len = bh->b_size;
2907 	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2908 
2909 	bio->bi_vcnt = 1;
2910 	bio->bi_idx = 0;
2911 	bio->bi_size = bh->b_size;
2912 
2913 	bio->bi_end_io = end_bio_bh_io_sync;
2914 	bio->bi_private = bh;
2915 
2916 	bio_get(bio);
2917 	submit_bio(rw, bio);
2918 
2919 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
2920 		ret = -EOPNOTSUPP;
2921 
2922 	bio_put(bio);
2923 	return ret;
2924 }
2925 EXPORT_SYMBOL(submit_bh);
2926 
2927 /**
2928  * ll_rw_block: low-level access to block devices (DEPRECATED)
2929  * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
2930  * @nr: number of &struct buffer_heads in the array
2931  * @bhs: array of pointers to &struct buffer_head
2932  *
2933  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2934  * requests an I/O operation on them, either a %READ or a %WRITE.  The third
2935  * %READA option is described in the documentation for generic_make_request()
2936  * which ll_rw_block() calls.
2937  *
2938  * This function drops any buffer that it cannot get a lock on (with the
2939  * BH_Lock state bit), any buffer that appears to be clean when doing a write
2940  * request, and any buffer that appears to be up-to-date when doing read
2941  * request.  Further it marks as clean buffers that are processed for
2942  * writing (the buffer cache won't assume that they are actually clean
2943  * until the buffer gets unlocked).
2944  *
2945  * ll_rw_block sets b_end_io to a simple completion handler that marks
2946  * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
2947  * any waiters.
2948  *
2949  * All of the buffers must be for the same device, and must also be a
2950  * multiple of the current approved size for the device.
2951  */
ll_rw_block(int rw,int nr,struct buffer_head * bhs[])2952 void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2953 {
2954 	int i;
2955 
2956 	for (i = 0; i < nr; i++) {
2957 		struct buffer_head *bh = bhs[i];
2958 
2959 		if (!trylock_buffer(bh))
2960 			continue;
2961 		if (rw == WRITE) {
2962 			if (test_clear_buffer_dirty(bh)) {
2963 				bh->b_end_io = end_buffer_write_sync;
2964 				get_bh(bh);
2965 				submit_bh(WRITE, bh);
2966 				continue;
2967 			}
2968 		} else {
2969 			if (!buffer_uptodate(bh)) {
2970 				bh->b_end_io = end_buffer_read_sync;
2971 				get_bh(bh);
2972 				submit_bh(rw, bh);
2973 				continue;
2974 			}
2975 		}
2976 		unlock_buffer(bh);
2977 	}
2978 }
2979 EXPORT_SYMBOL(ll_rw_block);
2980 
write_dirty_buffer(struct buffer_head * bh,int rw)2981 void write_dirty_buffer(struct buffer_head *bh, int rw)
2982 {
2983 	lock_buffer(bh);
2984 	if (!test_clear_buffer_dirty(bh)) {
2985 		unlock_buffer(bh);
2986 		return;
2987 	}
2988 	bh->b_end_io = end_buffer_write_sync;
2989 	get_bh(bh);
2990 	submit_bh(rw, bh);
2991 }
2992 EXPORT_SYMBOL(write_dirty_buffer);
2993 
2994 /*
2995  * For a data-integrity writeout, we need to wait upon any in-progress I/O
2996  * and then start new I/O and then wait upon it.  The caller must have a ref on
2997  * the buffer_head.
2998  */
__sync_dirty_buffer(struct buffer_head * bh,int rw)2999 int __sync_dirty_buffer(struct buffer_head *bh, int rw)
3000 {
3001 	int ret = 0;
3002 
3003 	WARN_ON(atomic_read(&bh->b_count) < 1);
3004 	lock_buffer(bh);
3005 	if (test_clear_buffer_dirty(bh)) {
3006 		get_bh(bh);
3007 		bh->b_end_io = end_buffer_write_sync;
3008 		ret = submit_bh(rw, bh);
3009 		wait_on_buffer(bh);
3010 		if (!ret && !buffer_uptodate(bh))
3011 			ret = -EIO;
3012 	} else {
3013 		unlock_buffer(bh);
3014 	}
3015 	return ret;
3016 }
3017 EXPORT_SYMBOL(__sync_dirty_buffer);
3018 
sync_dirty_buffer(struct buffer_head * bh)3019 int sync_dirty_buffer(struct buffer_head *bh)
3020 {
3021 	return __sync_dirty_buffer(bh, WRITE_SYNC);
3022 }
3023 EXPORT_SYMBOL(sync_dirty_buffer);
3024 
3025 /*
3026  * try_to_free_buffers() checks if all the buffers on this particular page
3027  * are unused, and releases them if so.
3028  *
3029  * Exclusion against try_to_free_buffers may be obtained by either
3030  * locking the page or by holding its mapping's private_lock.
3031  *
3032  * If the page is dirty but all the buffers are clean then we need to
3033  * be sure to mark the page clean as well.  This is because the page
3034  * may be against a block device, and a later reattachment of buffers
3035  * to a dirty page will set *all* buffers dirty.  Which would corrupt
3036  * filesystem data on the same device.
3037  *
3038  * The same applies to regular filesystem pages: if all the buffers are
3039  * clean then we set the page clean and proceed.  To do that, we require
3040  * total exclusion from __set_page_dirty_buffers().  That is obtained with
3041  * private_lock.
3042  *
3043  * try_to_free_buffers() is non-blocking.
3044  */
buffer_busy(struct buffer_head * bh)3045 static inline int buffer_busy(struct buffer_head *bh)
3046 {
3047 	return atomic_read(&bh->b_count) |
3048 		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3049 }
3050 
3051 static int
drop_buffers(struct page * page,struct buffer_head ** buffers_to_free)3052 drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3053 {
3054 	struct buffer_head *head = page_buffers(page);
3055 	struct buffer_head *bh;
3056 
3057 	bh = head;
3058 	do {
3059 		if (buffer_write_io_error(bh) && page->mapping)
3060 			set_bit(AS_EIO, &page->mapping->flags);
3061 		if (buffer_busy(bh))
3062 			goto failed;
3063 		bh = bh->b_this_page;
3064 	} while (bh != head);
3065 
3066 	do {
3067 		struct buffer_head *next = bh->b_this_page;
3068 
3069 		if (bh->b_assoc_map)
3070 			__remove_assoc_queue(bh);
3071 		bh = next;
3072 	} while (bh != head);
3073 	*buffers_to_free = head;
3074 	__clear_page_buffers(page);
3075 	return 1;
3076 failed:
3077 	return 0;
3078 }
3079 
try_to_free_buffers(struct page * page)3080 int try_to_free_buffers(struct page *page)
3081 {
3082 	struct address_space * const mapping = page->mapping;
3083 	struct buffer_head *buffers_to_free = NULL;
3084 	int ret = 0;
3085 
3086 	BUG_ON(!PageLocked(page));
3087 	if (PageWriteback(page))
3088 		return 0;
3089 
3090 	if (mapping == NULL) {		/* can this still happen? */
3091 		ret = drop_buffers(page, &buffers_to_free);
3092 		goto out;
3093 	}
3094 
3095 	spin_lock(&mapping->private_lock);
3096 	ret = drop_buffers(page, &buffers_to_free);
3097 
3098 	/*
3099 	 * If the filesystem writes its buffers by hand (eg ext3)
3100 	 * then we can have clean buffers against a dirty page.  We
3101 	 * clean the page here; otherwise the VM will never notice
3102 	 * that the filesystem did any IO at all.
3103 	 *
3104 	 * Also, during truncate, discard_buffer will have marked all
3105 	 * the page's buffers clean.  We discover that here and clean
3106 	 * the page also.
3107 	 *
3108 	 * private_lock must be held over this entire operation in order
3109 	 * to synchronise against __set_page_dirty_buffers and prevent the
3110 	 * dirty bit from being lost.
3111 	 */
3112 	if (ret)
3113 		cancel_dirty_page(page, PAGE_CACHE_SIZE);
3114 	spin_unlock(&mapping->private_lock);
3115 out:
3116 	if (buffers_to_free) {
3117 		struct buffer_head *bh = buffers_to_free;
3118 
3119 		do {
3120 			struct buffer_head *next = bh->b_this_page;
3121 			free_buffer_head(bh);
3122 			bh = next;
3123 		} while (bh != buffers_to_free);
3124 	}
3125 	return ret;
3126 }
3127 EXPORT_SYMBOL(try_to_free_buffers);
3128 
3129 /*
3130  * There are no bdflush tunables left.  But distributions are
3131  * still running obsolete flush daemons, so we terminate them here.
3132  *
3133  * Use of bdflush() is deprecated and will be removed in a future kernel.
3134  * The `flush-X' kernel threads fully replace bdflush daemons and this call.
3135  */
SYSCALL_DEFINE2(bdflush, int, func, long, data)
{
	/* Number of deprecation notices printed so far (capped at 5) */
	static int notices_printed;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Only the first five privileged callers are reported in the log */
	if (notices_printed < 5) {
		notices_printed++;
		printk(KERN_INFO
			"warning: process `%s' used the obsolete bdflush"
			" system call\n", current->comm);
		printk(KERN_INFO "Fix your initscripts?\n");
	}

	/* func == 1 makes the calling process exit; @data is ignored */
	if (func == 1)
		do_exit(0);
	return 0;
}
3155 
3156 /*
3157  * Buffer-head allocation
3158  */
3159 static struct kmem_cache *bh_cachep;
3160 
3161 /*
3162  * Once the number of bh's in the machine exceeds this level, we start
3163  * stripping them in writeback.
3164  */
3165 static int max_buffer_heads;
3166 
3167 int buffer_heads_over_limit;
3168 
3169 struct bh_accounting {
3170 	int nr;			/* Number of live bh's */
3171 	int ratelimit;		/* Limit cacheline bouncing */
3172 };
3173 
3174 static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3175 
recalc_bh_state(void)3176 static void recalc_bh_state(void)
3177 {
3178 	int i;
3179 	int tot = 0;
3180 
3181 	if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
3182 		return;
3183 	__this_cpu_write(bh_accounting.ratelimit, 0);
3184 	for_each_online_cpu(i)
3185 		tot += per_cpu(bh_accounting, i).nr;
3186 	buffer_heads_over_limit = (tot > max_buffer_heads);
3187 }
3188 
/*
 * alloc_buffer_head - allocate a zeroed buffer_head
 * @gfp_flags: allocation flags passed to the slab allocator
 *
 * Returns the new buffer_head with b_assoc_buffers initialised as an
 * empty list, or NULL if the allocation failed.  A successful
 * allocation is counted in this cpu's bh_accounting.
 */
struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
{
	struct buffer_head *bh = kmem_cache_zalloc(bh_cachep, gfp_flags);

	if (!bh)
		return NULL;

	INIT_LIST_HEAD(&bh->b_assoc_buffers);

	/* Keep the per-cpu update and the recalculation on one cpu */
	preempt_disable();
	__this_cpu_inc(bh_accounting.nr);
	recalc_bh_state();
	preempt_enable();

	return bh;
}
EXPORT_SYMBOL(alloc_buffer_head);
3202 
free_buffer_head(struct buffer_head * bh)3203 void free_buffer_head(struct buffer_head *bh)
3204 {
3205 	BUG_ON(!list_empty(&bh->b_assoc_buffers));
3206 	kmem_cache_free(bh_cachep, bh);
3207 	preempt_disable();
3208 	__this_cpu_dec(bh_accounting.nr);
3209 	recalc_bh_state();
3210 	preempt_enable();
3211 }
3212 EXPORT_SYMBOL(free_buffer_head);
3213 
buffer_exit_cpu(int cpu)3214 static void buffer_exit_cpu(int cpu)
3215 {
3216 	int i;
3217 	struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3218 
3219 	for (i = 0; i < BH_LRU_SIZE; i++) {
3220 		brelse(b->bhs[i]);
3221 		b->bhs[i] = NULL;
3222 	}
3223 	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
3224 	per_cpu(bh_accounting, cpu).nr = 0;
3225 }
3226 
buffer_cpu_notify(struct notifier_block * self,unsigned long action,void * hcpu)3227 static int buffer_cpu_notify(struct notifier_block *self,
3228 			      unsigned long action, void *hcpu)
3229 {
3230 	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
3231 		buffer_exit_cpu((unsigned long)hcpu);
3232 	return NOTIFY_OK;
3233 }
3234 
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return 1 if the buffer is up-to-date (the buffer is not left locked
 * by this function), and 0, with the buffer locked, if it is not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	/* Fast path: no locking needed at all */
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);

	/* Still stale now that we hold the lock: keep it for the caller */
	if (!buffer_uptodate(bh))
		return 0;

	/* Became up-to-date while we were getting the lock */
	unlock_buffer(bh);
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
3253 
3254 /**
3255  * bh_submit_read - Submit a locked buffer for reading
3256  * @bh: struct buffer_head
3257  *
3258  * Returns zero on success and -EIO on error.
3259  */
bh_submit_read(struct buffer_head * bh)3260 int bh_submit_read(struct buffer_head *bh)
3261 {
3262 	BUG_ON(!buffer_locked(bh));
3263 
3264 	if (buffer_uptodate(bh)) {
3265 		unlock_buffer(bh);
3266 		return 0;
3267 	}
3268 
3269 	get_bh(bh);
3270 	bh->b_end_io = end_buffer_read_sync;
3271 	submit_bh(READ, bh);
3272 	wait_on_buffer(bh);
3273 	if (buffer_uptodate(bh))
3274 		return 0;
3275 	return -EIO;
3276 }
3277 EXPORT_SYMBOL(bh_submit_read);
3278 
buffer_init(void)3279 void __init buffer_init(void)
3280 {
3281 	int nrpages;
3282 
3283 	bh_cachep = kmem_cache_create("buffer_head",
3284 			sizeof(struct buffer_head), 0,
3285 				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3286 				SLAB_MEM_SPREAD),
3287 				NULL);
3288 
3289 	/*
3290 	 * Limit the bh occupancy to 10% of ZONE_NORMAL
3291 	 */
3292 	nrpages = (nr_free_buffer_pages() * 10) / 100;
3293 	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3294 	hotcpu_notifier(buffer_cpu_notify, 0);
3295 }
3296