1 /*
2  *	linux/mm/filemap.c
3  *
4  * Copyright (C) 1994-2006  Linus Torvalds
5  */
6 
7 /*
8  * This file handles the generic file mmap semantics used by
9  * most "normal" filesystems (but you don't /have/ to use this:
10  * the NFS filesystem used to do this differently, for example)
11  */
12 #include <linux/module.h>
13 #include <linux/slab.h>
14 #include <linux/shm.h>
15 #include <linux/mman.h>
16 #include <linux/locks.h>
17 #include <linux/pagemap.h>
18 #include <linux/swap.h>
19 #include <linux/smp_lock.h>
20 #include <linux/blkdev.h>
21 #include <linux/file.h>
22 #include <linux/swapctl.h>
23 #include <linux/init.h>
24 #include <linux/mm.h>
25 #include <linux/iobuf.h>
26 
27 #include <asm/pgalloc.h>
28 #include <asm/uaccess.h>
29 #include <asm/mman.h>
30 
31 #include <linux/highmem.h>
32 
33 /*
34  * Shared mappings implemented 30.11.1994. It's not fully working yet,
35  * though.
36  *
37  * Shared mappings now work. 15.8.1995  Bruno.
38  *
39  * finished 'unifying' the page and buffer cache and SMP-threaded the
40  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
41  *
42  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
43  */
44 
45 unsigned long page_cache_size;
46 unsigned int page_hash_bits;
47 struct page **page_hash_table;
48 
49 int vm_max_readahead = 31;
50 int vm_min_readahead = 3;
51 EXPORT_SYMBOL(vm_max_readahead);
52 EXPORT_SYMBOL(vm_min_readahead);
53 
54 
55 spinlock_cacheline_t pagecache_lock_cacheline  = {SPIN_LOCK_UNLOCKED};
56 /*
57  * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock
58  *	with the pagecache_lock held.
59  *
60  * Ordering:
61  *	swap_lock ->
62  *		pagemap_lru_lock ->
63  *			pagecache_lock
64  */
65 spinlock_cacheline_t pagemap_lru_lock_cacheline = {SPIN_LOCK_UNLOCKED};
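
/*
 * Illustrative sketch (not part of the original source): a path that needs
 * both locks has to follow the ordering documented above, i.e.
 *
 *	spin_lock(&pagemap_lru_lock);
 *	spin_lock(&pagecache_lock);
 *	... work on both the LRU and the page-cache lists ...
 *	spin_unlock(&pagecache_lock);
 *	spin_unlock(&pagemap_lru_lock);
 *
 * invalidate_inode_pages() below takes the two locks in exactly this order.
 */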
66 
67 #define CLUSTER_PAGES		(1 << page_cluster)
68 #define CLUSTER_OFFSET(x)	(((x) >> page_cluster) << page_cluster)
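
/*
 * Worked example (illustrative, assuming 4K pages and page_cluster == 4):
 * CLUSTER_PAGES is then 16 pages (64K) and CLUSTER_OFFSET() rounds a page
 * index down to the start of its cluster, e.g.
 *
 *	CLUSTER_OFFSET(19) == ((19 >> 4) << 4) == 16
 *
 * so a miss on page 19 makes read_cluster_nonblocking() below consider
 * pages 16..31 for read-in.
 */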
69 
70 static void FASTCALL(add_page_to_hash_queue(struct page * page, struct page **p));
71 static void fastcall add_page_to_hash_queue(struct page * page, struct page **p)
72 {
73 	struct page *next = *p;
74 
75 	*p = page;
76 	page->next_hash = next;
77 	page->pprev_hash = p;
78 	if (next)
79 		next->pprev_hash = &page->next_hash;
80 	if (page->buffers)
81 		PAGE_BUG(page);
82 	inc_nr_cache_pages(page);
83 }
84 
85 static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
86 {
87 	struct list_head *head = &mapping->clean_pages;
88 
89 	mapping->nrpages++;
90 	list_add(&page->list, head);
91 	page->mapping = mapping;
92 }
93 
94 static inline void remove_page_from_inode_queue(struct page * page)
95 {
96 	struct address_space * mapping = page->mapping;
97 
98 	if (mapping->a_ops->removepage)
99 		mapping->a_ops->removepage(page);
100 
101 	list_del(&page->list);
102 	page->mapping = NULL;
103 	wmb();
104 	mapping->nrpages--;
105 	if (!mapping->nrpages)
106 		refile_inode(mapping->host);
107 }
108 
109 static inline void remove_page_from_hash_queue(struct page * page)
110 {
111 	struct page *next = page->next_hash;
112 	struct page **pprev = page->pprev_hash;
113 
114 	if (next)
115 		next->pprev_hash = pprev;
116 	*pprev = next;
117 	page->pprev_hash = NULL;
118 	dec_nr_cache_pages(page);
119 }
120 
121 /*
122  * Remove a page from the page cache and free it. Caller has to make
123  * sure the page is locked and that nobody else uses it - or that usage
124  * is safe.
125  */
126 void __remove_inode_page(struct page *page)
127 {
128 	remove_page_from_inode_queue(page);
129 	remove_page_from_hash_queue(page);
130 }
131 
132 void remove_inode_page(struct page *page)
133 {
134 	if (!PageLocked(page))
135 		PAGE_BUG(page);
136 
137 	spin_lock(&pagecache_lock);
138 	__remove_inode_page(page);
139 	spin_unlock(&pagecache_lock);
140 }
141 
142 static inline int sync_page(struct page *page)
143 {
144 	struct address_space *mapping = page->mapping;
145 
146 	if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
147 		return mapping->a_ops->sync_page(page);
148 	return 0;
149 }
150 
151 /*
152  * Add a page to the dirty page list.
153  */
154 void fastcall set_page_dirty(struct page *page)
155 {
156 	if (!test_and_set_bit(PG_dirty, &page->flags)) {
157 		struct address_space *mapping = page->mapping;
158 
159 		if (mapping) {
160 			spin_lock(&pagecache_lock);
161 			mapping = page->mapping;
162 			if (mapping) {	/* may have been truncated */
163 				list_del(&page->list);
164 				list_add(&page->list, &mapping->dirty_pages);
165 			}
166 			spin_unlock(&pagecache_lock);
167 
168 			if (mapping && mapping->host)
169 				mark_inode_dirty_pages(mapping->host);
170 			if (block_dump)
171 				printk(KERN_DEBUG "%s: dirtied page\n", current->comm);
172 		}
173 	}
174 }
175 
176 /**
177  * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
178  * @inode: the inode whose pages we want to invalidate
179  *
180  * This function only removes the unlocked pages; if you want to
181  * remove all the pages of one inode, you must call truncate_inode_pages.
182  */
183 
184 void invalidate_inode_pages(struct inode * inode)
185 {
186 	struct list_head *head, *curr;
187 	struct page * page;
188 
189 	head = &inode->i_mapping->clean_pages;
190 
191 	spin_lock(&pagemap_lru_lock);
192 	spin_lock(&pagecache_lock);
193 	curr = head->next;
194 
195 	while (curr != head) {
196 		page = list_entry(curr, struct page, list);
197 		curr = curr->next;
198 
199 		/* We cannot invalidate something that is dirty.. */
200 		if (PageDirty(page))
201 			continue;
202 
203 		/* ..or locked */
204 		if (TryLockPage(page))
205 			continue;
206 
207 		if (page->buffers && !try_to_free_buffers(page, 0))
208 			goto unlock;
209 
210 		if (page_count(page) != 1)
211 			goto unlock;
212 
213 		__lru_cache_del(page);
214 		__remove_inode_page(page);
215 		UnlockPage(page);
216 		page_cache_release(page);
217 		continue;
218 unlock:
219 		UnlockPage(page);
220 		continue;
221 	}
222 
223 	spin_unlock(&pagecache_lock);
224 	spin_unlock(&pagemap_lru_lock);
225 }
226 
227 static int do_flushpage(struct page *page, unsigned long offset)
228 {
229 	int (*flushpage) (struct page *, unsigned long);
230 	flushpage = page->mapping->a_ops->flushpage;
231 	if (flushpage)
232 		return (*flushpage)(page, offset);
233 	return block_flushpage(page, offset);
234 }
235 
236 static inline void truncate_partial_page(struct page *page, unsigned partial)
237 {
238 	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
239 	if (page->buffers)
240 		do_flushpage(page, partial);
241 }
242 
243 static void truncate_complete_page(struct page *page)
244 {
245 	/* Leave it on the LRU if it gets converted into anonymous buffers */
246 	if (!page->buffers || do_flushpage(page, 0))
247 		lru_cache_del(page);
248 
249 	/*
250 	 * We remove the page from the page cache _after_ we have
251 	 * destroyed all buffer-cache references to it. Otherwise some
252 	 * other process might think this inode page is not in the
253 	 * page cache and creates a buffer-cache alias to it causing
254 	 * all sorts of fun problems ...
255 	 */
256 	ClearPageDirty(page);
257 	ClearPageUptodate(page);
258 	remove_inode_page(page);
259 	page_cache_release(page);
260 }
261 
262 static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *));
263 static int fastcall truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial)
264 {
265 	struct list_head *curr;
266 	struct page * page;
267 	int unlocked = 0;
268 
269  restart:
270 	curr = head->prev;
271 	while (curr != head) {
272 		unsigned long offset;
273 
274 		page = list_entry(curr, struct page, list);
275 		offset = page->index;
276 
277 		/* Is this one of the pages to truncate? */
278 		if ((offset >= start) || (*partial && (offset + 1) == start)) {
279 			int failed;
280 
281 			page_cache_get(page);
282 			failed = TryLockPage(page);
283 
284 			list_del(head);
285 			if (!failed)
286 				/* Restart after this page */
287 				list_add_tail(head, curr);
288 			else
289 				/* Restart on this page */
290 				list_add(head, curr);
291 
292 			spin_unlock(&pagecache_lock);
293 			unlocked = 1;
294 
295  			if (!failed) {
296 				if (*partial && (offset + 1) == start) {
297 					truncate_partial_page(page, *partial);
298 					*partial = 0;
299 				} else
300 					truncate_complete_page(page);
301 
302 				UnlockPage(page);
303 			} else
304  				wait_on_page(page);
305 
306 			page_cache_release(page);
307 
308 			if (current->need_resched) {
309 				__set_current_state(TASK_RUNNING);
310 				schedule();
311 			}
312 
313 			spin_lock(&pagecache_lock);
314 			goto restart;
315 		}
316 		curr = curr->prev;
317 	}
318 	return unlocked;
319 }
320 
321 
322 /**
323  * truncate_inode_pages - truncate *all* the pages from an offset
324  * @mapping: mapping to truncate
325  * @lstart: offset from which to truncate
326  *
327  * Truncate the page cache at a set offset, removing the pages
328  * that are beyond that offset (and zeroing out partial pages).
329  * If any page is locked we wait for it to become unlocked.
330  */
331 void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
332 {
333 	unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
334 	unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
335 	int unlocked;
336 
337 	spin_lock(&pagecache_lock);
338 	do {
339 		unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial);
340 		unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial);
341 		unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial);
342 	} while (unlocked);
343 	/* Traversed all three lists without dropping the lock */
344 	spin_unlock(&pagecache_lock);
345 }
346 
347 static inline int invalidate_this_page2(struct page * page,
348 					struct list_head * curr,
349 					struct list_head * head)
350 {
351 	int unlocked = 1;
352 
353 	/*
354 	 * The page is locked and we hold the pagecache_lock as well
355 	 * so both page_count(page) and page->buffers stay constant here.
356 	 */
357 	if (page_count(page) == 1 + !!page->buffers) {
358 		/* Restart after this page */
359 		list_del(head);
360 		list_add_tail(head, curr);
361 
362 		page_cache_get(page);
363 		spin_unlock(&pagecache_lock);
364 		truncate_complete_page(page);
365 	} else {
366 		if (page->buffers) {
367 			/* Restart after this page */
368 			list_del(head);
369 			list_add_tail(head, curr);
370 
371 			page_cache_get(page);
372 			spin_unlock(&pagecache_lock);
373 			block_invalidate_page(page);
374 		} else
375 			unlocked = 0;
376 
377 		ClearPageDirty(page);
378 		ClearPageUptodate(page);
379 	}
380 
381 	return unlocked;
382 }
383 
384 static int FASTCALL(invalidate_list_pages2(struct list_head *));
385 static int fastcall invalidate_list_pages2(struct list_head *head)
386 {
387 	struct list_head *curr;
388 	struct page * page;
389 	int unlocked = 0;
390 
391  restart:
392 	curr = head->prev;
393 	while (curr != head) {
394 		page = list_entry(curr, struct page, list);
395 
396 		if (!TryLockPage(page)) {
397 			int __unlocked;
398 
399 			__unlocked = invalidate_this_page2(page, curr, head);
400 			UnlockPage(page);
401 			unlocked |= __unlocked;
402 			if (!__unlocked) {
403 				curr = curr->prev;
404 				continue;
405 			}
406 		} else {
407 			/* Restart on this page */
408 			list_del(head);
409 			list_add(head, curr);
410 
411 			page_cache_get(page);
412 			spin_unlock(&pagecache_lock);
413 			unlocked = 1;
414 			wait_on_page(page);
415 		}
416 
417 		page_cache_release(page);
418 		if (current->need_resched) {
419 			__set_current_state(TASK_RUNNING);
420 			schedule();
421 		}
422 
423 		spin_lock(&pagecache_lock);
424 		goto restart;
425 	}
426 	return unlocked;
427 }
428 
429 /**
430  * invalidate_inode_pages2 - Invalidate all pages of an address_space; pages that
431  * can't be freed because they're mapped only get their dirty and uptodate bits cleared.
432  * @mapping: the address_space whose pages we want to invalidate
433  */
434 void invalidate_inode_pages2(struct address_space * mapping)
435 {
436 	int unlocked;
437 
438 	spin_lock(&pagecache_lock);
439 	do {
440 		unlocked = invalidate_list_pages2(&mapping->clean_pages);
441 		unlocked |= invalidate_list_pages2(&mapping->dirty_pages);
442 		unlocked |= invalidate_list_pages2(&mapping->locked_pages);
443 	} while (unlocked);
444 	spin_unlock(&pagecache_lock);
445 }
446 
447 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
448 {
449 	goto inside;
450 
451 	for (;;) {
452 		page = page->next_hash;
453 inside:
454 		if (!page)
455 			goto not_found;
456 		if (page->mapping != mapping)
457 			continue;
458 		if (page->index == offset)
459 			break;
460 	}
461 
462 not_found:
463 	return page;
464 }
465 
466 static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *))
467 {
468 	struct list_head *curr;
469 	struct page *page;
470 	int retval = 0;
471 
472 	spin_lock(&pagecache_lock);
473 	curr = head->next;
474 	while (curr != head) {
475 		page = list_entry(curr, struct page, list);
476 		curr = curr->next;
477 		if (!page->buffers)
478 			continue;
479 		if (page->index >= end)
480 			continue;
481 		if (page->index < start)
482 			continue;
483 
484 		page_cache_get(page);
485 		spin_unlock(&pagecache_lock);
486 		lock_page(page);
487 
488 		/* The buffers could have been free'd while we waited for the page lock */
489 		if (page->buffers)
490 			retval |= fn(page);
491 
492 		UnlockPage(page);
493 		spin_lock(&pagecache_lock);
494 		curr = page->list.next;
495 		page_cache_release(page);
496 	}
497 	spin_unlock(&pagecache_lock);
498 
499 	return retval;
500 }
501 
502 /*
503  * Two-stage data sync: first start the IO, then go back and
504  * collect the information..
505  */
506 int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
507 {
508 	int retval;
509 
510 	/* writeout dirty buffers on pages from both clean and dirty lists */
511 	retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page);
512 	retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page);
513 	retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page);
514 
515 	/* now wait for locked buffers on pages from both clean and dirty lists */
516 	retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page);
517 	retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page);
518 	retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page);
519 
520 	return retval;
521 }
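
/*
 * Illustrative usage sketch (hypothetical caller, not from this file): an
 * fsync path that wants to flush the byte range [start, end) converts it
 * to page indices first, roughly
 *
 *	unsigned long start_idx = start >> PAGE_CACHE_SHIFT;
 *	unsigned long end_idx = (end + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 *	int err = generic_buffer_fdatasync(inode, start_idx, end_idx);
 *
 * since do_buffer_fdatasync() above compares page->index against the
 * half-open index range [start, end), not against byte offsets.
 */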
522 
523 /*
524  * In-memory filesystems have to fail their
525  * writepage function - and this has to be
526  * worked around in the VM layer..
527  *
528  * We
529  *  - mark the page dirty again (but do NOT
530  *    add it back to the inode dirty list, as
531  *    that would livelock in fdatasync)
532  *  - activate the page so that the page stealer
533  *    doesn't try to write it out over and over
534  *    again.
535  */
536 int fail_writepage(struct page *page)
537 {
538 	/* Only activate on memory-pressure, not fsync.. */
539 	if (PageLaunder(page)) {
540 		activate_page(page);
541 		SetPageReferenced(page);
542 	}
543 
544 	/* Set the page dirty again, unlock */
545 	SetPageDirty(page);
546 	UnlockPage(page);
547 	return 0;
548 }
549 
550 EXPORT_SYMBOL(fail_writepage);
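
/*
 * Illustrative sketch (hypothetical names, not from this file): an
 * in-memory filesystem simply points its writepage method here, e.g.
 *
 *	static struct address_space_operations example_aops = {
 *		readpage:	example_readpage,
 *		writepage:	fail_writepage,
 *	};
 *
 * so the VM redirties and reactivates such pages instead of trying to
 * write them out.
 */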
551 
552 /**
553  *      filemap_fdatawrite - walk the list of dirty pages of the given address space
554  *     	and writepage() each unlocked page (does not wait on locked pages).
555  *
556  *      @mapping: address space structure to write
557  *
558  */
559 int filemap_fdatawrite(struct address_space * mapping)
560 {
561 	int ret = 0;
562 	int (*writepage)(struct page *) = mapping->a_ops->writepage;
563 
564 	spin_lock(&pagecache_lock);
565 
566 	while (!list_empty(&mapping->dirty_pages)) {
567 		struct page *page = list_entry(mapping->dirty_pages.prev, struct page, list);
568 
569 		list_del(&page->list);
570 		list_add(&page->list, &mapping->locked_pages);
571 
572 		if (!PageDirty(page))
573 			continue;
574 
575 		page_cache_get(page);
576 		spin_unlock(&pagecache_lock);
577 
578 		if (!TryLockPage(page)) {
579 			if (PageDirty(page)) {
580 				int err;
581 				ClearPageDirty(page);
582 				err = writepage(page);
583 				if (err && !ret)
584 					ret = err;
585 			} else
586 				UnlockPage(page);
587 		}
588 		page_cache_release(page);
589 		spin_lock(&pagecache_lock);
590 	}
591 	spin_unlock(&pagecache_lock);
592 	return ret;
593 }
594 
595 /**
596  *      filemap_fdatasync - walk the list of dirty pages of the given address space
597  *     	and writepage() all of them.
598  *
599  *      @mapping: address space structure to write
600  *
601  */
602 int filemap_fdatasync(struct address_space * mapping)
603 {
604 	int ret = 0;
605 	int (*writepage)(struct page *) = mapping->a_ops->writepage;
606 
607 	spin_lock(&pagecache_lock);
608 
609         while (!list_empty(&mapping->dirty_pages)) {
610 		struct page *page = list_entry(mapping->dirty_pages.prev, struct page, list);
611 
612 		list_del(&page->list);
613 		list_add(&page->list, &mapping->locked_pages);
614 
615 		if (!PageDirty(page))
616 			continue;
617 
618 		page_cache_get(page);
619 		spin_unlock(&pagecache_lock);
620 
621 		lock_page(page);
622 
623 		if (PageDirty(page)) {
624 			int err;
625 			ClearPageDirty(page);
626 			err = writepage(page);
627 			if (err && !ret)
628 				ret = err;
629 		} else
630 			UnlockPage(page);
631 
632 		page_cache_release(page);
633 		spin_lock(&pagecache_lock);
634 	}
635 	spin_unlock(&pagecache_lock);
636 	return ret;
637 }
638 
639 /**
640  *      filemap_fdatawait - walk the list of locked pages of the given address space
641  *     	and wait for all of them.
642  *
643  *      @mapping: address space structure to wait for
644  *
645  */
646 int filemap_fdatawait(struct address_space * mapping)
647 {
648 	int ret = 0;
649 
650 	spin_lock(&pagecache_lock);
651 
652         while (!list_empty(&mapping->locked_pages)) {
653 		struct page *page = list_entry(mapping->locked_pages.next, struct page, list);
654 
655 		list_del(&page->list);
656 		list_add(&page->list, &mapping->clean_pages);
657 
658 		if (!PageLocked(page))
659 			continue;
660 
661 		page_cache_get(page);
662 		spin_unlock(&pagecache_lock);
663 
664 		___wait_on_page(page);
665 		if (PageError(page))
666 			ret = -EIO;
667 
668 		page_cache_release(page);
669 		spin_lock(&pagecache_lock);
670 	}
671 	spin_unlock(&pagecache_lock);
672 	return ret;
673 }
674 
675 /*
676  * Add a page to the inode page cache.
677  *
678  * The caller must have locked the page and
679  * set all the page flags correctly..
680  */
681 void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
682 {
683 	if (!PageLocked(page))
684 		BUG();
685 
686 	page->index = index;
687 	page_cache_get(page);
688 	spin_lock(&pagecache_lock);
689 	add_page_to_inode_queue(mapping, page);
690 	add_page_to_hash_queue(page, page_hash(mapping, index));
691 	spin_unlock(&pagecache_lock);
692 
693 	lru_cache_add(page);
694 }
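
/*
 * Illustrative usage sketch (hypothetical caller): the page must be
 * allocated, locked and flagged by the caller before it is inserted,
 * roughly
 *
 *	struct page *page = page_cache_alloc(mapping);
 *	if (page) {
 *		LockPage(page);
 *		add_to_page_cache_locked(page, mapping, index);
 *		... fill the page, then SetPageUptodate(page) ...
 *		UnlockPage(page);
 *		page_cache_release(page);
 *	}
 *
 * The final page_cache_release() drops only the caller's own reference;
 * the page cache holds the reference taken above.
 */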
695 
696 /*
697  * This adds a page to the page cache, starting out as locked,
698  * owned by us, but unreferenced, not uptodate and with no errors.
699  */
700 static inline void __add_to_page_cache(struct page * page,
701 	struct address_space *mapping, unsigned long offset,
702 	struct page **hash)
703 {
704 	/*
705 	 * Yes this is inefficient, however it is needed.  The problem
706 	 * is that we could be adding a page to the swap cache while
707 	 * another CPU is also modifying page->flags, so the updates
708 	 * really do need to be atomic.  -- Rik
709 	 */
710 	ClearPageUptodate(page);
711 	ClearPageError(page);
712 	ClearPageDirty(page);
713 	ClearPageReferenced(page);
714 	ClearPageArch1(page);
715 	ClearPageChecked(page);
716 	LockPage(page);
717 	page_cache_get(page);
718 	page->index = offset;
719 	add_page_to_inode_queue(mapping, page);
720 	add_page_to_hash_queue(page, hash);
721 }
722 
723 void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
724 {
725 	spin_lock(&pagecache_lock);
726 	__add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
727 	spin_unlock(&pagecache_lock);
728 	lru_cache_add(page);
729 }
730 
731 int add_to_page_cache_unique(struct page * page,
732 	struct address_space *mapping, unsigned long offset,
733 	struct page **hash)
734 {
735 	int err;
736 	struct page *alias;
737 
738 	spin_lock(&pagecache_lock);
739 	alias = __find_page_nolock(mapping, offset, *hash);
740 
741 	err = 1;
742 	if (!alias) {
743 		__add_to_page_cache(page,mapping,offset,hash);
744 		err = 0;
745 	}
746 
747 	spin_unlock(&pagecache_lock);
748 	if (!err)
749 		lru_cache_add(page);
750 	return err;
751 }
752 
753 /*
754  * This adds the requested page to the page cache if it isn't already there,
755  * and schedules an I/O to read in its contents from disk.
756  */
757 static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
758 static int fastcall page_cache_read(struct file * file, unsigned long offset)
759 {
760 	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
761 	struct page **hash = page_hash(mapping, offset);
762 	struct page *page;
763 
764 	spin_lock(&pagecache_lock);
765 	page = __find_page_nolock(mapping, offset, *hash);
766 	spin_unlock(&pagecache_lock);
767 	if (page)
768 		return 0;
769 
770 	page = page_cache_alloc(mapping);
771 	if (!page)
772 		return -ENOMEM;
773 
774 	if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
775 		int error = mapping->a_ops->readpage(file, page);
776 		page_cache_release(page);
777 		return error;
778 	}
779 	/*
780 	 * We arrive here in the unlikely event that someone
781 	 * raced with us and added our page to the cache first.
782 	 */
783 	page_cache_release(page);
784 	return 0;
785 }
786 
787 /*
788  * Read in an entire cluster at once.  A cluster is usually a 64k-
789  * aligned block that includes the page requested in "offset."
790  */
791 static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset,
792 					     unsigned long filesize));
793 static int fastcall read_cluster_nonblocking(struct file * file, unsigned long offset,
794 	unsigned long filesize)
795 {
796 	unsigned long pages = CLUSTER_PAGES;
797 
798 	offset = CLUSTER_OFFSET(offset);
799 	while ((pages-- > 0) && (offset < filesize)) {
800 		int error = page_cache_read(file, offset);
801 		if (error < 0)
802 			return error;
803 		offset ++;
804 	}
805 
806 	return 0;
807 }
808 
809 /*
810  * Knuth recommends primes in approximately golden ratio to the maximum
811  * integer representable by a machine word for multiplicative hashing.
812  * Chuck Lever verified the effectiveness of this technique:
813  * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
814  *
815  * These primes are chosen to be bit-sparse, that is operations on
816  * them can use shifts and additions instead of multiplications for
817  * machines where multiplications are slow.
818  */
819 #if BITS_PER_LONG == 32
820 /* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
821 #define GOLDEN_RATIO_PRIME 0x9e370001UL
822 #elif BITS_PER_LONG == 64
823 /*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
824 #define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
825 #else
826 #error Define GOLDEN_RATIO_PRIME for your wordsize.
827 #endif
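
/*
 * Worked example (illustrative): hashing a value down to 2^bits buckets
 * with the multiplicative scheme above is simply
 *
 *	hash = (value * GOLDEN_RATIO_PRIME) >> (BITS_PER_LONG - bits);
 *
 * page_waitqueue() below does exactly this multiply (or the equivalent
 * shift-and-add sequence on 64-bit) and then shifts right by the per-zone
 * wait_table_shift to pick a wait queue bucket.
 */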
828 
829 /*
830  * In order to wait for pages to become available there must be
831  * waitqueues associated with pages. By using a hash table of
832  * waitqueues where the bucket discipline is to maintain all
833  * waiters on the same queue and wake all when any of the pages
834  * become available, and for the woken contexts to check to be
835  * sure the appropriate page became available, this saves space
836  * at a cost of "thundering herd" phenomena during rare hash
837  * collisions.
838  */
839 static inline wait_queue_head_t *page_waitqueue(struct page *page)
840 {
841 	const zone_t *zone = page_zone(page);
842 	wait_queue_head_t *wait = zone->wait_table;
843 	unsigned long hash = (unsigned long)page;
844 
845 #if BITS_PER_LONG == 64
846 	/*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
847 	unsigned long n = hash;
848 	n <<= 18;
849 	hash -= n;
850 	n <<= 33;
851 	hash -= n;
852 	n <<= 3;
853 	hash += n;
854 	n <<= 3;
855 	hash -= n;
856 	n <<= 4;
857 	hash += n;
858 	n <<= 2;
859 	hash += n;
860 #else
861 	/* On some cpus multiply is faster, on others gcc will do shifts */
862 	hash *= GOLDEN_RATIO_PRIME;
863 #endif
864 	hash >>= zone->wait_table_shift;
865 
866 	return &wait[hash];
867 }
868 
869 /*
870  * This must be called after every submit_bh with end_io
871  * callbacks that would result into the blkdev layer waking
872  * up the page after a queue unplug.
873  */
874 void fastcall wakeup_page_waiters(struct page * page)
875 {
876 	wait_queue_head_t * head;
877 
878 	head = page_waitqueue(page);
879 	if (waitqueue_active(head))
880 		wake_up(head);
881 }
882 
883 /*
884  * Wait for a page to get unlocked.
885  *
886  * This must be called with the caller "holding" the page,
887  * ie with increased "page->count" so that the page won't
888  * go away during the wait..
889  *
890  * The waiting strategy is to get on a waitqueue determined
891  * by hashing. Waiters will then collide, and the newly woken
892  * task must then determine whether it was woken for the page
893  * it really wanted, and go back to sleep on the waitqueue if
894  * that wasn't it. With the waitqueue semantics, it never leaves
895  * the waitqueue until it calls remove_wait_queue(), so the loop moves
896  * forward one iteration every time there is
897  * (1) a collision
898  * and
899  * (2) one of the colliding pages is woken
900  *
901  * This is the thundering herd problem, but it is expected to
902  * be very rare due to the few pages that are actually being
903  * waited on at any given time and the quality of the hash function.
904  */
905 void ___wait_on_page(struct page *page)
906 {
907 	wait_queue_head_t *waitqueue = page_waitqueue(page);
908 	struct task_struct *tsk = current;
909 	DECLARE_WAITQUEUE(wait, tsk);
910 
911 	add_wait_queue(waitqueue, &wait);
912 	do {
913 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
914 		if (!PageLocked(page))
915 			break;
916 		sync_page(page);
917 		schedule();
918 	} while (PageLocked(page));
919 	__set_task_state(tsk, TASK_RUNNING);
920 	remove_wait_queue(waitqueue, &wait);
921 }
922 
923 /*
924  * unlock_page() is the other half of the story just above
925  * __wait_on_page(). Here a couple of quick checks are done
926  * and a couple of flags are set on the page, and then all
927  * of the waiters for all of the pages in the appropriate
928  * wait queue are woken.
929  */
930 void fastcall unlock_page(struct page *page)
931 {
932 	wait_queue_head_t *waitqueue = page_waitqueue(page);
933 	ClearPageLaunder(page);
934 	smp_mb__before_clear_bit();
935 	if (!test_and_clear_bit(PG_locked, &(page)->flags))
936 		BUG();
937 	smp_mb__after_clear_bit();
938 
939 	/*
940 	 * Although the default semantics of wake_up() are
941 	 * to wake all, here the specific function is used
942 	 * to make it even more explicit that a number of
943 	 * pages are being waited on here.
944 	 */
945 	if (waitqueue_active(waitqueue))
946 		wake_up_all(waitqueue);
947 }
948 
949 /*
950  * Get a lock on the page, assuming we need to sleep
951  * to get it..
952  */
953 static void __lock_page(struct page *page)
954 {
955 	wait_queue_head_t *waitqueue = page_waitqueue(page);
956 	struct task_struct *tsk = current;
957 	DECLARE_WAITQUEUE(wait, tsk);
958 
959 	add_wait_queue_exclusive(waitqueue, &wait);
960 	for (;;) {
961 		set_task_state(tsk, TASK_UNINTERRUPTIBLE);
962 		if (PageLocked(page)) {
963 			sync_page(page);
964 			schedule();
965 		}
966 		if (!TryLockPage(page))
967 			break;
968 	}
969 	__set_task_state(tsk, TASK_RUNNING);
970 	remove_wait_queue(waitqueue, &wait);
971 }
972 
973 /*
974  * Get an exclusive lock on the page, optimistically
975  * assuming it's not locked..
976  */
977 void fastcall lock_page(struct page *page)
978 {
979 	if (TryLockPage(page))
980 		__lock_page(page);
981 }
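
/*
 * Illustrative sketch of the usual revalidation idiom (hypothetical
 * caller): because a page can be removed from the page cache while we
 * sleep in lock_page(), callers normally recheck it once they own the
 * lock:
 *
 *	page_cache_get(page);
 *	lock_page(page);
 *	if (page->mapping != mapping || page->index != index) {
 *		UnlockPage(page);
 *		page_cache_release(page);
 *		goto retry;
 *	}
 *
 * __find_lock_page_helper() below implements exactly this revalidation.
 */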
982 
983 /*
984  * a rather lightweight function, finding and getting a reference to a
985  * hashed page atomically.
986  */
987 struct page * __find_get_page(struct address_space *mapping,
988 			      unsigned long offset, struct page **hash)
989 {
990 	struct page *page;
991 
992 	/*
993 	 * We scan the hash list read-only. Addition to and removal from
994 	 * the hash-list needs a held write-lock.
995 	 */
996 	spin_lock(&pagecache_lock);
997 	page = __find_page_nolock(mapping, offset, *hash);
998 	if (page)
999 		page_cache_get(page);
1000 	spin_unlock(&pagecache_lock);
1001 	return page;
1002 }
1003 
1004 /*
1005  * Same as above, but trylock it instead of incrementing the count.
1006  */
1007 struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
1008 {
1009 	struct page *page;
1010 	struct page **hash = page_hash(mapping, offset);
1011 
1012 	spin_lock(&pagecache_lock);
1013 	page = __find_page_nolock(mapping, offset, *hash);
1014 	if (page) {
1015 		if (TryLockPage(page))
1016 			page = NULL;
1017 	}
1018 	spin_unlock(&pagecache_lock);
1019 	return page;
1020 }
1021 
1022 /*
1023  * Must be called with the pagecache lock held,
1024  * will return with it held (but it may be dropped
1025  * during blocking operations..
1026  */
1027 static struct page * FASTCALL(__find_lock_page_helper(struct address_space *, unsigned long, struct page *));
1028 static struct page * fastcall __find_lock_page_helper(struct address_space *mapping,
1029 					unsigned long offset, struct page *hash)
1030 {
1031 	struct page *page;
1032 
1033 	/*
1034 	 * We scan the hash list read-only. Addition to and removal from
1035 	 * the hash-list needs a held write-lock.
1036 	 */
1037 repeat:
1038 	page = __find_page_nolock(mapping, offset, hash);
1039 	if (page) {
1040 		page_cache_get(page);
1041 		if (TryLockPage(page)) {
1042 			spin_unlock(&pagecache_lock);
1043 			lock_page(page);
1044 			spin_lock(&pagecache_lock);
1045 
1046 			/* Has the page been re-allocated while we slept? */
1047 			if (page->mapping != mapping || page->index != offset) {
1048 				UnlockPage(page);
1049 				page_cache_release(page);
1050 				goto repeat;
1051 			}
1052 		}
1053 	}
1054 	return page;
1055 }
1056 
1057 /*
1058  * Same as the above, but lock the page too, verifying that
1059  * it's still valid once we own it.
1060  */
1061 struct page * __find_lock_page (struct address_space *mapping,
1062 				unsigned long offset, struct page **hash)
1063 {
1064 	struct page *page;
1065 
1066 	spin_lock(&pagecache_lock);
1067 	page = __find_lock_page_helper(mapping, offset, *hash);
1068 	spin_unlock(&pagecache_lock);
1069 	return page;
1070 }
1071 
1072 /*
1073  * Same as above, but create the page if required..
1074  */
1075 struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask)
1076 {
1077 	struct page *page;
1078 	struct page **hash = page_hash(mapping, index);
1079 
1080 	spin_lock(&pagecache_lock);
1081 	page = __find_lock_page_helper(mapping, index, *hash);
1082 	spin_unlock(&pagecache_lock);
1083 	if (!page) {
1084 		struct page *newpage = alloc_page(gfp_mask);
1085 		if (newpage) {
1086 			spin_lock(&pagecache_lock);
1087 			page = __find_lock_page_helper(mapping, index, *hash);
1088 			if (likely(!page)) {
1089 				page = newpage;
1090 				__add_to_page_cache(page, mapping, index, hash);
1091 				newpage = NULL;
1092 			}
1093 			spin_unlock(&pagecache_lock);
1094 			if (newpage == NULL)
1095 				lru_cache_add(page);
1096 			else
1097 				page_cache_release(newpage);
1098 		}
1099 	}
1100 	return page;
1101 }
1102 
1103 /*
1104  * Same as grab_cache_page, but do not wait if the page is unavailable.
1105  * This is intended for speculative data generators, where the data can
1106  * be regenerated if the page couldn't be grabbed.  This routine should
1107  * be safe to call while holding the lock for another page.
1108  */
1109 struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
1110 {
1111 	struct page *page, **hash;
1112 
1113 	hash = page_hash(mapping, index);
1114 	page = __find_get_page(mapping, index, hash);
1115 
1116 	if ( page ) {
1117 		if ( !TryLockPage(page) ) {
1118 			/* Page found and locked */
1119 			/* This test is overly paranoid, but what the heck... */
1120 			if ( unlikely(page->mapping != mapping || page->index != index) ) {
1121 				/* Someone reallocated this page under us. */
1122 				UnlockPage(page);
1123 				page_cache_release(page);
1124 				return NULL;
1125 			} else {
1126 				return page;
1127 			}
1128 		} else {
1129 			/* Page locked by someone else */
1130 			page_cache_release(page);
1131 			return NULL;
1132 		}
1133 	}
1134 
1135 	page = page_cache_alloc(mapping);
1136 	if ( unlikely(!page) )
1137 		return NULL;	/* Failed to allocate a page */
1138 
1139 	if ( unlikely(add_to_page_cache_unique(page, mapping, index, hash)) ) {
1140 		/* Someone else grabbed the page already. */
1141 		page_cache_release(page);
1142 		return NULL;
1143 	}
1144 
1145 	return page;
1146 }
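
/*
 * Illustrative usage sketch (hypothetical caller): a speculative data
 * generator typically does
 *
 *	struct page *page = grab_cache_page_nowait(mapping, index);
 *	if (page) {
 *		... generate the data into the locked page ...
 *		SetPageUptodate(page);
 *		UnlockPage(page);
 *		page_cache_release(page);
 *	}
 *
 * and simply skips the page when NULL is returned, regenerating the data
 * later if it is ever needed.
 */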
1147 
1148 #if 0
1149 #define PROFILE_READAHEAD
1150 #define DEBUG_READAHEAD
1151 #endif
1152 
1153 /*
1154  * Read-ahead profiling information
1155  * --------------------------------
1156  * Every PROFILE_MAXREADCOUNT, the following information is written
1157  * to the syslog:
1158  *   Percentage of asynchronous read-ahead.
1159  *   Average values of the read-ahead context fields.
1160  * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
1161  * to the syslog.
1162  */
1163 
1164 #ifdef PROFILE_READAHEAD
1165 
1166 #define PROFILE_MAXREADCOUNT 1000
1167 
1168 static unsigned long total_reada;
1169 static unsigned long total_async;
1170 static unsigned long total_ramax;
1171 static unsigned long total_ralen;
1172 static unsigned long total_rawin;
1173 
1174 static void profile_readahead(int async, struct file *filp)
1175 {
1176 	unsigned long flags;
1177 
1178 	++total_reada;
1179 	if (async)
1180 		++total_async;
1181 
1182 	total_ramax	+= filp->f_ramax;
1183 	total_ralen	+= filp->f_ralen;
1184 	total_rawin	+= filp->f_rawin;
1185 
1186 	if (total_reada > PROFILE_MAXREADCOUNT) {
1187 		save_flags(flags);
1188 		cli();
1189 		if (!(total_reada > PROFILE_MAXREADCOUNT)) {
1190 			restore_flags(flags);
1191 			return;
1192 		}
1193 
1194 		printk("Readahead average:  max=%ld, len=%ld, win=%ld, async=%ld%%\n",
1195 			total_ramax/total_reada,
1196 			total_ralen/total_reada,
1197 			total_rawin/total_reada,
1198 			(total_async*100)/total_reada);
1199 #ifdef DEBUG_READAHEAD
1200 		printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
1201 			filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
1202 #endif
1203 
1204 		total_reada	= 0;
1205 		total_async	= 0;
1206 		total_ramax	= 0;
1207 		total_ralen	= 0;
1208 		total_rawin	= 0;
1209 
1210 		restore_flags(flags);
1211 	}
1212 }
1213 #endif  /* defined PROFILE_READAHEAD */
1214 
1215 /*
1216  * Read-ahead context:
1217  * -------------------
1218  * The read ahead context fields of the "struct file" are the following:
1219  * - f_raend : position of the first byte after the last page we tried to
1220  *	       read ahead.
1221  * - f_ramax : current read-ahead maximum size.
1222  * - f_ralen : length of the current IO read block we tried to read-ahead.
1223  * - f_rawin : length of the current read-ahead window.
1224  *		if last read-ahead was synchronous then
1225  *			f_rawin = f_ralen
1226  *		otherwise (was asynchronous)
1227  *			f_rawin = previous value of f_ralen + f_ralen
1228  *
1229  * Read-ahead limits:
1230  * ------------------
1231  * MIN_READAHEAD   : minimum read-ahead size when read-ahead is performed.
1232  * MAX_READAHEAD   : maximum read-ahead size when read-ahead is performed.
1233  *
1234  * Synchronous read-ahead benefits:
1235  * --------------------------------
1236  * Using a reasonable IO transfer length from peripheral devices increases
1237  * system performance.
1238  * Reasonable means, in this context, not too large but not too small.
1239  * The actual maximum value is:
1240  *	MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
1241  *      and 32K if defined (4K page size assumed).
1242  *
1243  * Asynchronous read-ahead benefits:
1244  * ---------------------------------
1245  * Overlapping the next read request with user process execution increases
1246  * system performance.
1247  *
1248  * Read-ahead risks:
1249  * -----------------
1250  * We have to guess which further data are needed by the user process.
1251  * If these data are often not really needed, it's bad for system
1252  * performance.
1253  * However, we know that files are often accessed sequentially by
1254  * application programs and it seems that a reasonably good guessing
1255  * strategy is possible.
1256  * We only try to read ahead files that seem to be read sequentially.
1257  *
1258  * Asynchronous read-ahead risks:
1259  * ------------------------------
1260  * In order to maximize overlapping, we must start some asynchronous read
1261  * request from the device, as soon as possible.
1262  * We must be very careful about:
1263  * - The number of effective pending IO read requests.
1264  *   ONE seems to be the only reasonable value.
1265  * - The total memory pool usage for the file access stream.
1266  *   This maximum memory usage is implicitly 2 IO read chunks:
1267  *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
1268  *   64k if defined (4K page size assumed).
1269  */
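
/*
 * Worked example (illustrative): suppose a synchronous read-ahead pass
 * reads 4 pages starting at index 10: then f_ralen = 4, f_rawin = 4 and
 * f_raend = 14.  If the following asynchronous pass reads 8 more pages,
 * f_rawin becomes the previous f_ralen plus the new one (4 + 8 = 12) and
 * f_raend moves on to 22, while f_ramax is doubled (and clamped to the
 * device limit) after every successful pass; see generic_file_readahead()
 * below.
 */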
1270 
1271 static inline int get_max_readahead(struct inode * inode)
1272 {
1273 	if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
1274 		return vm_max_readahead;
1275 	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
1276 }
1277 
1278 static void generic_file_readahead(int reada_ok,
1279 	struct file * filp, struct inode * inode,
1280 	struct page * page)
1281 {
1282 	unsigned long end_index;
1283 	unsigned long index = page->index;
1284 	unsigned long max_ahead, ahead;
1285 	unsigned long raend;
1286 	int max_readahead = get_max_readahead(inode);
1287 
1288 	end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1289 
1290 	raend = filp->f_raend;
1291 	max_ahead = 0;
1292 
1293 /*
1294  * The current page is locked.
1295  * If the current position is inside the previous read IO request, do not
1296  * try to reread previously read ahead pages.
1297  * Otherwise decide whether or not to read ahead some pages synchronously.
1298  * If we are not going to read ahead, set the read ahead context for this
1299  * page only.
1300  */
1301 	if (PageLocked(page)) {
1302 		if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) {
1303 			raend = index;
1304 			if (raend < end_index)
1305 				max_ahead = filp->f_ramax;
1306 			filp->f_rawin = 0;
1307 			filp->f_ralen = 1;
1308 			if (!max_ahead) {
1309 				filp->f_raend  = index + filp->f_ralen;
1310 				filp->f_rawin += filp->f_ralen;
1311 			}
1312 		}
1313 	}
1314 /*
1315  * The current page is not locked.
1316  * If we were reading ahead and,
1317  * if the current max read ahead size is not zero and,
1318  * if the current position is inside the last read-ahead IO request,
1319  *   it is the moment to try to read ahead asynchronously.
1320  * We will later force unplug device in order to force asynchronous read IO.
1321  */
1322 	else if (reada_ok && filp->f_ramax && raend >= 1 &&
1323 		 index <= raend && index + filp->f_ralen >= raend) {
1324 /*
1325  * Add ONE page to max_ahead in order to try to have about the same IO max size
1326  * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
1327  * Compute the position of the last page we have tried to read in order to
1328  * begin to read ahead just at the next page.
1329  */
1330 		raend -= 1;
1331 		if (raend < end_index)
1332 			max_ahead = filp->f_ramax + 1;
1333 
1334 		if (max_ahead) {
1335 			filp->f_rawin = filp->f_ralen;
1336 			filp->f_ralen = 0;
1337 			reada_ok      = 2;
1338 		}
1339 	}
1340 /*
1341  * Try to read ahead pages.
1342  * We hope that ll_rw_blk() plug/unplug, request coalescing and sorting, and
1343  * the scheduler will do a good enough job to avoid too many bad actual IO requests.
1344  */
1345 	ahead = 0;
1346 	while (ahead < max_ahead) {
1347 		unsigned long ra_index = raend + ahead + 1;
1348 
1349 		if (ra_index >= end_index)
1350 			break;
1351 		if (page_cache_read(filp, ra_index) < 0)
1352 			break;
1353 
1354 		ahead++;
1355 	}
1356 /*
1357  * If we tried to read ahead some pages,
1358  * If we tried to read ahead asynchronously,
1359  *   Try to force unplug of the device in order to start an asynchronous
1360  *   read IO request.
1361  * Update the read-ahead context.
1362  * Store the length of the current read-ahead window.
1363  * Double the current max read ahead size.
1364  *   This heuristic avoids doing large IO for files that are not really
1365  *   accessed sequentially.
1366  */
1367 	if (ahead) {
1368 		filp->f_ralen += ahead;
1369 		filp->f_rawin += filp->f_ralen;
1370 		filp->f_raend = raend + ahead + 1;
1371 
1372 		filp->f_ramax += filp->f_ramax;
1373 
1374 		if (filp->f_ramax > max_readahead)
1375 			filp->f_ramax = max_readahead;
1376 
1377 #ifdef PROFILE_READAHEAD
1378 		profile_readahead((reada_ok == 2), filp);
1379 #endif
1380 	}
1381 
1382 	return;
1383 }
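
/*
 * Worked example of the window growth (illustrative): with the default
 * vm_min_readahead = 3 and vm_max_readahead = 31 and no per-device limit,
 * a sequentially read file sees f_ramax grow 3 -> 6 -> 12 -> 24 -> 31
 * pages over successive read-ahead passes: the window ramps up quickly
 * but is clamped by get_max_readahead().
 */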
1384 
1385 /*
1386  * Mark a page as having seen activity.
1387  *
1388  * If it was already so marked, move it to the active queue and drop
1389  * the referenced bit.  Otherwise, just mark it for future action..
1390  */
1391 void fastcall mark_page_accessed(struct page *page)
1392 {
1393 	if (!PageActive(page) && PageReferenced(page)) {
1394 		activate_page(page);
1395 		ClearPageReferenced(page);
1396 	} else
1397 		SetPageReferenced(page);
1398 }
1399 
1400 /*
1401  * This is a generic file read routine, and uses the
1402  * inode->i_op->readpage() function for the actual low-level
1403  * stuff.
1404  *
1405  * This is really ugly. But the goto's actually try to clarify some
1406  * of the logic when it comes to error handling etc.
1407  */
1408 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
1409 {
1410 	struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
1411 	struct inode *inode = mapping->host;
1412 	unsigned long index, offset;
1413 	struct page *cached_page;
1414 	int reada_ok;
1415 	int error;
1416 	int max_readahead = get_max_readahead(inode);
1417 
1418 	cached_page = NULL;
1419 	index = *ppos >> PAGE_CACHE_SHIFT;
1420 	offset = *ppos & ~PAGE_CACHE_MASK;
1421 
1422 /*
1423  * If the current position is outside the previous read-ahead window,
1424  * we reset the current read-ahead context and set read ahead max to zero
1425  * (will be set to just needed value later),
1426  * otherwise, we assume that the file accesses are sequential enough to
1427  * continue read-ahead.
1428  */
1429 	if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
1430 		reada_ok = 0;
1431 		filp->f_raend = 0;
1432 		filp->f_ralen = 0;
1433 		filp->f_ramax = 0;
1434 		filp->f_rawin = 0;
1435 	} else {
1436 		reada_ok = 1;
1437 	}
1438 /*
1439  * Adjust the current value of read-ahead max.
1440  * If the read operation stays within the first half of the first page, force no readahead.
1441  * Otherwise try to increase read ahead max just enough to do the read request.
1442  * Then, at least MIN_READAHEAD if read ahead is ok,
1443  * and at most MAX_READAHEAD in all cases.
1444  */
1445 	if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
1446 		filp->f_ramax = 0;
1447 	} else {
1448 		unsigned long needed;
1449 
1450 		needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
1451 
1452 		if (filp->f_ramax < needed)
1453 			filp->f_ramax = needed;
1454 
1455 		if (reada_ok && filp->f_ramax < vm_min_readahead)
1456 				filp->f_ramax = vm_min_readahead;
1457 		if (filp->f_ramax > max_readahead)
1458 			filp->f_ramax = max_readahead;
1459 	}
1460 
1461 	for (;;) {
1462 		struct page *page, **hash;
1463 		unsigned long end_index, nr, ret;
1464 
1465 		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1466 
1467 		if (index > end_index)
1468 			break;
1469 		nr = PAGE_CACHE_SIZE;
1470 		if (index == end_index) {
1471 			nr = inode->i_size & ~PAGE_CACHE_MASK;
1472 			if (nr <= offset)
1473 				break;
1474 		}
1475 
1476 		nr = nr - offset;
1477 
1478 		/*
1479 		 * Try to find the data in the page cache..
1480 		 */
1481 		hash = page_hash(mapping, index);
1482 
1483 		spin_lock(&pagecache_lock);
1484 		page = __find_page_nolock(mapping, index, *hash);
1485 		if (!page)
1486 			goto no_cached_page;
1487 found_page:
1488 		page_cache_get(page);
1489 		spin_unlock(&pagecache_lock);
1490 
1491 		if (!Page_Uptodate(page))
1492 			goto page_not_up_to_date;
1493 		generic_file_readahead(reada_ok, filp, inode, page);
1494 page_ok:
1495 		/* If users can be writing to this page using arbitrary
1496 		 * virtual addresses, take care about potential aliasing
1497 		 * before reading the page on the kernel side.
1498 		 */
1499 		if (mapping->i_mmap_shared != NULL)
1500 			flush_dcache_page(page);
1501 
1502 		/*
1503 		 * Mark the page accessed if we read the
1504 		 * beginning or we just did an lseek.
1505 		 */
1506 		if (!offset || !filp->f_reada)
1507 			mark_page_accessed(page);
1508 
1509 		/*
1510 		 * Ok, we have the page, and it's up-to-date, so
1511 		 * now we can copy it to user space...
1512 		 *
1513 		 * The actor routine returns how many bytes were actually used..
1514 		 * NOTE! This may not be the same as how much of a user buffer
1515 		 * we filled up (we may be padding etc), so we can only update
1516 		 * "pos" here (the actor routine has to update the user buffer
1517 		 * pointers and the remaining count).
1518 		 */
1519 		ret = actor(desc, page, offset, nr);
1520 		offset += ret;
1521 		index += offset >> PAGE_CACHE_SHIFT;
1522 		offset &= ~PAGE_CACHE_MASK;
1523 
1524 		page_cache_release(page);
1525 		if (ret == nr && desc->count)
1526 			continue;
1527 		break;
1528 
1529 /*
1530  * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1531  */
1532 page_not_up_to_date:
1533 		generic_file_readahead(reada_ok, filp, inode, page);
1534 
1535 		if (Page_Uptodate(page))
1536 			goto page_ok;
1537 
1538 		/* Get exclusive access to the page ... */
1539 		lock_page(page);
1540 
1541 		/* Did it get unhashed before we got the lock? */
1542 		if (!page->mapping) {
1543 			UnlockPage(page);
1544 			page_cache_release(page);
1545 			continue;
1546 		}
1547 
1548 		/* Did somebody else fill it already? */
1549 		if (Page_Uptodate(page)) {
1550 			UnlockPage(page);
1551 			goto page_ok;
1552 		}
1553 
1554 readpage:
1555 		/* ... and start the actual read. The read will unlock the page. */
1556 		error = mapping->a_ops->readpage(filp, page);
1557 
1558 		if (!error) {
1559 			if (Page_Uptodate(page))
1560 				goto page_ok;
1561 
1562 			/* Again, try some read-ahead while waiting for the page to finish.. */
1563 			generic_file_readahead(reada_ok, filp, inode, page);
1564 			wait_on_page(page);
1565 			if (Page_Uptodate(page))
1566 				goto page_ok;
1567 			error = -EIO;
1568 		}
1569 
1570 		/* UHHUH! A synchronous read error occurred. Report it */
1571 		desc->error = error;
1572 		page_cache_release(page);
1573 		break;
1574 
1575 no_cached_page:
1576 		/*
1577 		 * Ok, it wasn't cached, so we need to create a new
1578 		 * page..
1579 		 *
1580 		 * We get here with the page cache lock held.
1581 		 */
1582 		if (!cached_page) {
1583 			spin_unlock(&pagecache_lock);
1584 			cached_page = page_cache_alloc(mapping);
1585 			if (!cached_page) {
1586 				desc->error = -ENOMEM;
1587 				break;
1588 			}
1589 
1590 			/*
1591 			 * Somebody may have added the page while we
1592 			 * dropped the page cache lock. Check for that.
1593 			 */
1594 			spin_lock(&pagecache_lock);
1595 			page = __find_page_nolock(mapping, index, *hash);
1596 			if (page)
1597 				goto found_page;
1598 		}
1599 
1600 		/*
1601 		 * Ok, add the new page to the hash-queues...
1602 		 */
1603 		page = cached_page;
1604 		__add_to_page_cache(page, mapping, index, hash);
1605 		spin_unlock(&pagecache_lock);
1606 		lru_cache_add(page);
1607 		cached_page = NULL;
1608 
1609 		goto readpage;
1610 	}
1611 
1612 	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1613 	filp->f_reada = 1;
1614 	if (cached_page)
1615 		page_cache_release(cached_page);
1616 	UPDATE_ATIME(inode);
1617 }
1618 
1619 static inline int have_mapping_directIO(struct address_space * mapping)
1620 {
1621 	return mapping->a_ops->direct_IO || mapping->a_ops->direct_fileIO;
1622 }
1623 
1624 /* Switch between old and new directIO formats */
1625 static inline int do_call_directIO(int rw, struct file *filp, struct kiobuf *iobuf, unsigned long offset, int blocksize)
1626 {
1627 	struct address_space * mapping = filp->f_dentry->d_inode->i_mapping;
1628 
1629 	if (mapping->a_ops->direct_fileIO)
1630 		return mapping->a_ops->direct_fileIO(rw, filp, iobuf, offset, blocksize);
1631 	return mapping->a_ops->direct_IO(rw, mapping->host, iobuf, offset, blocksize);
1632 }
1633 
1634 /*
1635  * i_sem and i_alloc_sem should be held already.  i_sem may be dropped
1636  * later once we've mapped the new IO.  i_alloc_sem is kept until the IO
1637  * completes.
1638  */
1639 
1640 static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
1641 {
1642 	ssize_t retval, progress;
1643 	int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits;
1644 	ssize_t iosize;
1645 	struct kiobuf * iobuf;
1646 	struct address_space * mapping = filp->f_dentry->d_inode->i_mapping;
1647 	struct inode * inode = mapping->host;
1648 	loff_t size = inode->i_size;
1649 
1650 	new_iobuf = 0;
1651 	iobuf = filp->f_iobuf;
1652 	if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
1653 		/*
1654 		 * A parallel read/write is using the preallocated iobuf
1655 		 * so just run slow and allocate a new one.
1656 		 */
1657 		retval = alloc_kiovec(1, &iobuf);
1658 		if (retval)
1659 			goto out;
1660 		new_iobuf = 1;
1661 	}
1662 
1663 	blocksize = 1 << inode->i_blkbits;
1664 	blocksize_bits = inode->i_blkbits;
1665 	blocksize_mask = blocksize - 1;
1666 	chunk_size = KIO_MAX_ATOMIC_IO << 10;
1667 
1668 	retval = -EINVAL;
1669 	if ((offset & blocksize_mask) || (count & blocksize_mask) || ((unsigned long) buf & blocksize_mask))
1670 		goto out_free;
1671 	if (!have_mapping_directIO(mapping))
1672 		goto out_free;
1673 
1674 	if ((rw == READ) && (offset + count > size))
1675 		count = size - offset;
1676 
1677 	/*
1678 	 * Flush only the _data_ to disk, metadata must remain
1679 	 * completely asynchronous or performance will go to /dev/null.
1680 	 */
1681 	retval = filemap_fdatasync(mapping);
1682 	if (retval == 0)
1683 		retval = fsync_inode_data_buffers(inode);
1684 	if (retval == 0)
1685 		retval = filemap_fdatawait(mapping);
1686 	if (retval < 0)
1687 		goto out_free;
1688 
1689 	progress = retval = 0;
1690 	while (count > 0) {
1691 		iosize = count;
1692 		if (iosize > chunk_size)
1693 			iosize = chunk_size;
1694 
1695 		retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
1696 		if (retval)
1697 			break;
1698 
1699 		retval = do_call_directIO(rw, filp, iobuf, (offset+progress) >> blocksize_bits, blocksize);
1700 
1701 		if (rw == READ && retval > 0)
1702 			mark_dirty_kiobuf(iobuf, retval);
1703 
1704 		if (retval >= 0) {
1705 			count -= retval;
1706 			buf += retval;
1707 			/* warning: weird semantics here, we're reporting a read behind the end of the file */
1708 			progress += retval;
1709 		}
1710 
1711 		unmap_kiobuf(iobuf);
1712 
1713 		if (retval != iosize)
1714 			break;
1715 	}
1716 
1717 	if (progress)
1718 		retval = progress;
1719 
1720  out_free:
1721 	if (!new_iobuf)
1722 		clear_bit(0, &filp->f_iobuf_lock);
1723 	else
1724 		free_kiovec(1, &iobuf);
1725  out:
1726 	return retval;
1727 }
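
/*
 * Note on the -EINVAL check above (illustrative numbers): with a 512-byte
 * block size, every O_DIRECT transfer must have the file offset, the byte
 * count and the user buffer address all 512-byte aligned.  A 4096-byte
 * read at offset 512 into an aligned buffer is accepted, while a 100-byte
 * read, or a read into an unaligned buffer, fails with -EINVAL before any
 * IO is started.
 */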
1728 
1729 int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1730 {
1731 	char *kaddr;
1732 	unsigned long left, count = desc->count;
1733 
1734 	if (size > count)
1735 		size = count;
1736 
1737 	kaddr = kmap(page);
1738 	left = __copy_to_user(desc->buf, kaddr + offset, size);
1739 	kunmap(page);
1740 
1741 	if (left) {
1742 		size -= left;
1743 		desc->error = -EFAULT;
1744 	}
1745 	desc->count = count - size;
1746 	desc->written += size;
1747 	desc->buf += size;
1748 	return size;
1749 }
1750 
1751 inline ssize_t do_generic_direct_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1752 {
1753 	ssize_t retval;
1754 	loff_t pos = *ppos;
1755 
1756 	retval = generic_file_direct_IO(READ, filp, buf, count, pos);
1757 	if (retval > 0)
1758 		*ppos = pos + retval;
1759 	return retval;
1760 }
1761 
1762 /*
1763  * This is the "read()" routine for all filesystems
1764  * that can use the page cache directly.
1765  */
1766 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1767 {
1768 	ssize_t retval;
1769 
1770 	if ((ssize_t) count < 0)
1771 		return -EINVAL;
1772 
1773 	if (filp->f_flags & O_DIRECT)
1774 		goto o_direct;
1775 
1776 	retval = -EFAULT;
1777 	if (access_ok(VERIFY_WRITE, buf, count)) {
1778 		retval = 0;
1779 
1780 		if (count) {
1781 			read_descriptor_t desc;
1782 
1783 			desc.written = 0;
1784 			desc.count = count;
1785 			desc.buf = buf;
1786 			desc.error = 0;
1787 			do_generic_file_read(filp, ppos, &desc, file_read_actor);
1788 
1789 			retval = desc.written;
1790 			if (!retval)
1791 				retval = desc.error;
1792 		}
1793 	}
1794  out:
1795 	return retval;
1796 
1797  o_direct:
1798 	{
1799 		loff_t size;
1800 		struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
1801 		struct inode *inode = mapping->host;
1802 
1803 		retval = 0;
1804 		if (!count)
1805 			goto out; /* skip atime */
1806 		down_read(&inode->i_alloc_sem);
1807 		down(&inode->i_sem);
1808 		size = inode->i_size;
1809 		if (*ppos < size)
1810 			retval = do_generic_direct_read(filp, buf, count, ppos);
1811 		up(&inode->i_sem);
1812 		up_read(&inode->i_alloc_sem);
1813 		UPDATE_ATIME(filp->f_dentry->d_inode);
1814 		goto out;
1815 	}
1816 }
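
/*
 * Userspace sketch for the O_DIRECT path above (illustrative only;
 * the file name and sizes are made up, and O_DIRECT may require
 * _GNU_SOURCE with glibc).  generic_file_direct_IO() returns -EINVAL
 * unless the buffer, the length and the file offset are all aligned
 * to the filesystem block size, so callers typically use an aligned
 * buffer, e.g. from memalign(3):
 *
 *	#include <fcntl.h>
 *	#include <malloc.h>
 *	#include <unistd.h>
 *
 *	int fd = open("/data/big.file", O_RDONLY | O_DIRECT);
 *	char *buf = memalign(4096, 65536);	// 4k-aligned, 64k buffer
 *	ssize_t n = read(fd, buf, 65536);	// offset 0 is aligned too
 */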
1817 
1818 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
1819 {
1820 	ssize_t written;
1821 	unsigned long count = desc->count;
1822 	struct file *file = (struct file *) desc->buf;
1823 
1824 	if (size > count)
1825 		size = count;
1826 
1827  	if (file->f_op->sendpage) {
1828  		written = file->f_op->sendpage(file, page, offset,
1829 					       size, &file->f_pos, size<count);
1830 	} else {
1831 		char *kaddr;
1832 		mm_segment_t old_fs;
1833 
1834 		old_fs = get_fs();
1835 		set_fs(KERNEL_DS);
1836 
1837 		kaddr = kmap(page);
1838 		written = file->f_op->write(file, kaddr + offset, size, &file->f_pos);
1839 		kunmap(page);
1840 
1841 		set_fs(old_fs);
1842 	}
1843 	if (written < 0) {
1844 		desc->error = written;
1845 		written = 0;
1846 	}
1847 	desc->count = count - written;
1848 	desc->written += written;
1849 	return written;
1850 }
1851 
1852 static ssize_t common_sendfile(int out_fd, int in_fd, loff_t *offset, size_t count)
1853 {
1854 	ssize_t retval;
1855 	struct file * in_file, * out_file;
1856 	struct inode * in_inode, * out_inode;
1857 
1858 	/*
1859 	 * Get input file, and verify that it is ok..
1860 	 */
1861 	retval = -EBADF;
1862 	in_file = fget(in_fd);
1863 	if (!in_file)
1864 		goto out;
1865 	if (!(in_file->f_mode & FMODE_READ))
1866 		goto fput_in;
1867 	retval = -EINVAL;
1868 	in_inode = in_file->f_dentry->d_inode;
1869 	if (!in_inode)
1870 		goto fput_in;
1871 	if (!in_inode->i_mapping->a_ops->readpage)
1872 		goto fput_in;
1873 	retval = rw_verify_area(READ, in_file, &in_file->f_pos, count);
1874 	if (retval)
1875 		goto fput_in;
1876 
1877 	/*
1878 	 * Get output file, and verify that it is ok..
1879 	 */
1880 	retval = -EBADF;
1881 	out_file = fget(out_fd);
1882 	if (!out_file)
1883 		goto fput_in;
1884 	if (!(out_file->f_mode & FMODE_WRITE))
1885 		goto fput_out;
1886 	retval = -EINVAL;
1887 	if (!out_file->f_op || !out_file->f_op->write)
1888 		goto fput_out;
1889 	out_inode = out_file->f_dentry->d_inode;
1890 	retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
1891 	if (retval)
1892 		goto fput_out;
1893 
1894 	retval = 0;
1895 	if (count) {
1896 		read_descriptor_t desc;
1897 
1898 		if (!offset)
1899 			offset = &in_file->f_pos;
1900 
1901 		desc.written = 0;
1902 		desc.count = count;
1903 		desc.buf = (char *) out_file;
1904 		desc.error = 0;
1905 		do_generic_file_read(in_file, offset, &desc, file_send_actor);
1906 
1907 		retval = desc.written;
1908 		if (!retval)
1909 			retval = desc.error;
1910 	}
1911 
1912 fput_out:
1913 	fput(out_file);
1914 fput_in:
1915 	fput(in_file);
1916 out:
1917 	return retval;
1918 }
1919 
1920 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1921 {
1922 	loff_t pos, *ppos = NULL;
1923 	ssize_t ret;
1924 	if (offset) {
1925 		off_t off;
1926 		if (unlikely(get_user(off, offset)))
1927 			return -EFAULT;
1928 		pos = off;
1929 		ppos = &pos;
1930 	}
1931 	ret = common_sendfile(out_fd, in_fd, ppos, count);
1932 	if (offset)
1933 		put_user((off_t)pos, offset);
1934 	return ret;
1935 }
1936 
1937 asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, loff_t *offset, size_t count)
1938 {
1939 	loff_t pos, *ppos = NULL;
1940 	ssize_t ret;
1941 	if (offset) {
1942 		if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1943 			return -EFAULT;
1944 		ppos = &pos;
1945 	}
1946 	ret = common_sendfile(out_fd, in_fd, ppos, count);
1947 	if (offset)
1948 		put_user(pos, offset);
1949 	return ret;
1950 }
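
/*
 * Userspace sketch for the sendfile(2)/sendfile64(2) entry points
 * above (illustrative only; the descriptors and byte count are made
 * up).  The input side must be a page-cache backed file with a
 * ->readpage method; the output side only needs ->write (or
 * ->sendpage), a socket being the usual case:
 *
 *	#include <sys/sendfile.h>
 *
 *	off_t off = 0;
 *	ssize_t sent = sendfile(sock_fd, file_fd, &off, nbytes);
 *	// on success, off has been advanced by the amount copied
 */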
1951 
1952 static ssize_t do_readahead(struct file *file, unsigned long index, unsigned long nr)
1953 {
1954 	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
1955 	unsigned long max;
1956 
1957 	if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1958 		return -EINVAL;
1959 
1960 	/* Limit it to the size of the file.. */
1961 	max = (mapping->host->i_size + ~PAGE_CACHE_MASK) >> PAGE_CACHE_SHIFT;
1962 	if (index > max)
1963 		return 0;
1964 	max -= index;
1965 	if (nr > max)
1966 		nr = max;
1967 
1968 	/* And limit it to a sane percentage of the inactive list.. */
1969 	max = (nr_free_pages() + nr_inactive_pages) / 2;
1970 	if (nr > max)
1971 		nr = max;
1972 
1973 	while (nr) {
1974 		page_cache_read(file, index);
1975 		index++;
1976 		nr--;
1977 	}
1978 	return 0;
1979 }
1980 
1981 asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1982 {
1983 	ssize_t ret;
1984 	struct file *file;
1985 
1986 	ret = -EBADF;
1987 	file = fget(fd);
1988 	if (file) {
1989 		if (file->f_mode & FMODE_READ) {
1990 			unsigned long start = offset >> PAGE_CACHE_SHIFT;
1991 			unsigned long len = (count + ((long)offset & ~PAGE_CACHE_MASK)) >> PAGE_CACHE_SHIFT;
1992 			ret = do_readahead(file, start, len);
1993 		}
1994 		fput(file);
1995 	}
1996 	return ret;
1997 }
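
/*
 * Userspace sketch for readahead(2) (illustrative only; the file
 * name is made up and the glibc wrapper, where present, may need
 * _GNU_SOURCE):
 *
 *	#include <fcntl.h>
 *
 *	int fd = open("/var/lib/db/table.dat", O_RDONLY);
 *	readahead(fd, 0, 1 << 20);	// prime the first 1 MiB of page cache
 *	// ... subsequent read()s of that range should rarely block on disk
 */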
1998 
1999 /*
2000  * Read-ahead and flush behind for MADV_SEQUENTIAL areas.  Since we are
2001  * sure this is sequential access, we don't need a flexible read-ahead
2002  * window size -- we can always use a large fixed size window.
2003  */
2004 static void nopage_sequential_readahead(struct vm_area_struct * vma,
2005 	unsigned long pgoff, unsigned long filesize)
2006 {
2007 	unsigned long ra_window;
2008 
2009 	ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
2010 	ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
2011 
2012 	/* vm_raend is zero if we haven't read ahead in this area yet.  */
2013 	if (vma->vm_raend == 0)
2014 		vma->vm_raend = vma->vm_pgoff + ra_window;
2015 
2016 	/*
2017 	 * If we've just faulted the page half-way through our window,
2018 	 * then schedule reads for the next window, and release the
2019 	 * pages in the previous window.
2020 	 */
2021 	if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
2022 		unsigned long start = vma->vm_pgoff + vma->vm_raend;
2023 		unsigned long end = start + ra_window;
2024 
2025 		if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
2026 			end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
2027 		if (start > end)
2028 			return;
2029 
2030 		while ((start < end) && (start < filesize)) {
2031 			if (read_cluster_nonblocking(vma->vm_file,
2032 							start, filesize) < 0)
2033 				break;
2034 			start += CLUSTER_PAGES;
2035 		}
2036 		run_task_queue(&tq_disk);
2037 
2038 		/* if we're far enough past the beginning of this area,
2039 		   recycle pages that are in the previous window. */
2040 		if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
2041 			unsigned long window = ra_window << PAGE_SHIFT;
2042 
2043 			end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
2044 			end -= window + window;
2045 			filemap_sync(vma, end - window, window, MS_INVALIDATE);
2046 		}
2047 
2048 		vma->vm_raend += ra_window;
2049 	}
2050 
2051 	return;
2052 }
2053 
2054 /*
2055  * filemap_nopage() is invoked via the vma operations vector for a
2056  * mapped memory region to read in file data during a page fault.
2057  *
2058  * The goto's are kind of ugly, but this streamlines the normal case of having
2059  * it in the page cache, and handles the special cases reasonably without
2060  * having a lot of duplicated code.
2061  */
2062 struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused)
2063 {
2064 	int error;
2065 	struct file *file = area->vm_file;
2066 	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
2067 	struct inode *inode = mapping->host;
2068 	struct page *page, **hash;
2069 	unsigned long size, pgoff, endoff;
2070 
2071 	pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
2072 	endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
2073 
2074 retry_all:
2075 	/*
2076 	 * An external ptracer can access pages that normally aren't
2077 	 * accessible..
2078 	 */
2079 	size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2080 	if ((pgoff >= size) && (area->vm_mm == current->mm))
2081 		return NULL;
2082 
2083 	/* The "size" of the file, as far as mmap is concerned, isn't bigger than the mapping */
2084 	if (size > endoff)
2085 		size = endoff;
2086 
2087 	/*
2088 	 * Do we have something in the page cache already?
2089 	 */
2090 	hash = page_hash(mapping, pgoff);
2091 retry_find:
2092 	page = __find_get_page(mapping, pgoff, hash);
2093 	if (!page)
2094 		goto no_cached_page;
2095 
2096 	/*
2097 	 * Ok, found a page in the page cache, now we need to check
2098 	 * that it's up-to-date.
2099 	 */
2100 	if (!Page_Uptodate(page))
2101 		goto page_not_uptodate;
2102 
2103 success:
2104  	/*
2105 	 * Try read-ahead for sequential areas.
2106 	 */
2107 	if (VM_SequentialReadHint(area))
2108 		nopage_sequential_readahead(area, pgoff, size);
2109 
2110 	/*
2111 	 * Found the page and have a reference on it, need to check sharing
2112 	 * and possibly copy it over to another page..
2113 	 */
2114 	mark_page_accessed(page);
2115 	flush_page_to_ram(page);
2116 	return page;
2117 
2118 no_cached_page:
2119 	/*
2120 	 * If the requested offset is within our file, try to read a whole
2121 	 * cluster of pages at once.
2122 	 *
2123 	 * Otherwise, we're off the end of a privately mapped file,
2124 	 * so we need to map a zero page.
2125 	 */
2126 	if ((pgoff < size) && !VM_RandomReadHint(area))
2127 		error = read_cluster_nonblocking(file, pgoff, size);
2128 	else
2129 		error = page_cache_read(file, pgoff);
2130 
2131 	/*
2132 	 * The page we want has now been added to the page cache.
2133 	 * In the unlikely event that someone removed it in the
2134 	 * meantime, we'll just come back here and read it again.
2135 	 */
2136 	if (error >= 0)
2137 		goto retry_find;
2138 
2139 	/*
2140 	 * An error return from page_cache_read can result if the
2141 	 * system is low on memory, or a problem occurs while trying
2142 	 * to schedule I/O.
2143 	 */
2144 	if (error == -ENOMEM)
2145 		return NOPAGE_OOM;
2146 	return NULL;
2147 
2148 page_not_uptodate:
2149 	lock_page(page);
2150 
2151 	/* Did it get unhashed while we waited for it? */
2152 	if (!page->mapping) {
2153 		UnlockPage(page);
2154 		page_cache_release(page);
2155 		goto retry_all;
2156 	}
2157 
2158 	/* Did somebody else get it up-to-date? */
2159 	if (Page_Uptodate(page)) {
2160 		UnlockPage(page);
2161 		goto success;
2162 	}
2163 
2164 	if (!mapping->a_ops->readpage(file, page)) {
2165 		wait_on_page(page);
2166 		if (Page_Uptodate(page))
2167 			goto success;
2168 	}
2169 
2170 	/*
2171 	 * Umm, take care of errors if the page isn't up-to-date.
2172 	 * Try to re-read it _once_. We do this synchronously,
2173 	 * because there really aren't any performance issues here
2174 	 * and we need to check for errors.
2175 	 */
2176 	lock_page(page);
2177 
2178 	/* Somebody truncated the page on us? */
2179 	if (!page->mapping) {
2180 		UnlockPage(page);
2181 		page_cache_release(page);
2182 		goto retry_all;
2183 	}
2184 
2185 	/* Somebody else successfully read it in? */
2186 	if (Page_Uptodate(page)) {
2187 		UnlockPage(page);
2188 		goto success;
2189 	}
2190 	ClearPageError(page);
2191 	if (!mapping->a_ops->readpage(file, page)) {
2192 		wait_on_page(page);
2193 		if (Page_Uptodate(page))
2194 			goto success;
2195 	}
2196 
2197 	/*
2198 	 * Things didn't work out. Return zero to tell the
2199 	 * mm layer so, possibly freeing the page cache page first.
2200 	 */
2201 	page_cache_release(page);
2202 	return NULL;
2203 }
2204 
2205 /* Called with mm->page_table_lock held to protect against other
2206  * threads/the swapper from ripping pte's out from under us.
2207  */
2208 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
2209 	unsigned long address, unsigned int flags)
2210 {
2211 	pte_t pte = *ptep;
2212 
2213 	if (pte_present(pte)) {
2214 		struct page *page = pte_page(pte);
2215 		if (VALID_PAGE(page) && !PageReserved(page) && ptep_test_and_clear_dirty(ptep)) {
2216 			flush_tlb_page(vma, address);
2217 			set_page_dirty(page);
2218 		}
2219 	}
2220 	return 0;
2221 }
2222 
2223 static inline int filemap_sync_pte_range(pmd_t * pmd,
2224 	unsigned long address, unsigned long size,
2225 	struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
2226 {
2227 	pte_t * pte;
2228 	unsigned long end;
2229 	int error;
2230 
2231 	if (pmd_none(*pmd))
2232 		return 0;
2233 	if (pmd_bad(*pmd)) {
2234 		pmd_ERROR(*pmd);
2235 		pmd_clear(pmd);
2236 		return 0;
2237 	}
2238 	pte = pte_offset(pmd, address);
2239 	offset += address & PMD_MASK;
2240 	address &= ~PMD_MASK;
2241 	end = address + size;
2242 	if (end > PMD_SIZE)
2243 		end = PMD_SIZE;
2244 	error = 0;
2245 	do {
2246 		error |= filemap_sync_pte(pte, vma, address + offset, flags);
2247 		address += PAGE_SIZE;
2248 		pte++;
2249 	} while (address && (address < end));
2250 	return error;
2251 }
2252 
2253 static inline int filemap_sync_pmd_range(pgd_t * pgd,
2254 	unsigned long address, unsigned long size,
2255 	struct vm_area_struct *vma, unsigned int flags)
2256 {
2257 	pmd_t * pmd;
2258 	unsigned long offset, end;
2259 	int error;
2260 
2261 	if (pgd_none(*pgd))
2262 		return 0;
2263 	if (pgd_bad(*pgd)) {
2264 		pgd_ERROR(*pgd);
2265 		pgd_clear(pgd);
2266 		return 0;
2267 	}
2268 	pmd = pmd_offset(pgd, address);
2269 	offset = address & PGDIR_MASK;
2270 	address &= ~PGDIR_MASK;
2271 	end = address + size;
2272 	if (end > PGDIR_SIZE)
2273 		end = PGDIR_SIZE;
2274 	error = 0;
2275 	do {
2276 		error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
2277 		address = (address + PMD_SIZE) & PMD_MASK;
2278 		pmd++;
2279 	} while (address && (address < end));
2280 	return error;
2281 }
2282 
2283 int filemap_sync(struct vm_area_struct * vma, unsigned long address,
2284 	size_t size, unsigned int flags)
2285 {
2286 	pgd_t * dir;
2287 	unsigned long end = address + size;
2288 	int error = 0;
2289 
2290 	/* Acquire the lock early; it may be possible to avoid dropping
2291 	 * and reacquiring it repeatedly.
2292 	 */
2293 	spin_lock(&vma->vm_mm->page_table_lock);
2294 
2295 	dir = pgd_offset(vma->vm_mm, address);
2296 	flush_cache_range(vma->vm_mm, end - size, end);
2297 	if (address >= end)
2298 		BUG();
2299 	do {
2300 		error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
2301 		address = (address + PGDIR_SIZE) & PGDIR_MASK;
2302 		dir++;
2303 	} while (address && (address < end));
2304 	flush_tlb_range(vma->vm_mm, end - size, end);
2305 
2306 	spin_unlock(&vma->vm_mm->page_table_lock);
2307 
2308 	return error;
2309 }
2310 
2311 static struct vm_operations_struct generic_file_vm_ops = {
2312 	nopage:		filemap_nopage,
2313 };
2314 
2315 /* This is used for a general mmap of a disk file */
2316 
2317 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
2318 {
2319 	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
2320 	struct inode *inode = mapping->host;
2321 
2322 	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
2323 		if (!mapping->a_ops->writepage)
2324 			return -EINVAL;
2325 	}
2326 	if (!mapping->a_ops->readpage)
2327 		return -ENOEXEC;
2328 	UPDATE_ATIME(inode);
2329 	vma->vm_ops = &generic_file_vm_ops;
2330 	return 0;
2331 }
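
/*
 * A filesystem normally just plugs generic_file_mmap() into its
 * file_operations; a minimal sketch (the "example" identifiers are
 * made up):
 *
 *	static struct file_operations example_file_ops = {
 *		read:	generic_file_read,
 *		write:	generic_file_write,
 *		mmap:	generic_file_mmap,
 *	};
 */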
2332 
2333 /*
2334  * The msync() system call.
2335  */
2336 
2337 /*
2338  * MS_SYNC syncs the entire file - including mappings.
2339  *
2340  * MS_ASYNC initiates writeout of just the dirty mapped data.
2341  * This provides no guarantee of file integrity - things like indirect
2342  * blocks may not have started writeout.  MS_ASYNC is primarily useful
2343  * where the application knows that it has finished with the data and
2344  * wishes to intelligently schedule its own I/O traffic.
2345  */
2346 static int msync_interval(struct vm_area_struct * vma,
2347 	unsigned long start, unsigned long end, int flags)
2348 {
2349 	int ret = 0;
2350 	struct file * file = vma->vm_file;
2351 
2352 	if ( (flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED) )
2353 		return -EBUSY;
2354 
2355 	if (file && (vma->vm_flags & VM_SHARED)) {
2356 		ret = filemap_sync(vma, start, end-start, flags);
2357 
2358 		if (!ret && (flags & (MS_SYNC|MS_ASYNC))) {
2359 			struct inode * inode = file->f_dentry->d_inode;
2360 
2361 			down(&inode->i_sem);
2362 			ret = filemap_fdatasync(inode->i_mapping);
2363 			if (flags & MS_SYNC) {
2364 				int err;
2365 
2366 				if (file->f_op && file->f_op->fsync) {
2367 					err = file->f_op->fsync(file, file->f_dentry, 1);
2368 					if (err && !ret)
2369 						ret = err;
2370 				}
2371 				err = filemap_fdatawait(inode->i_mapping);
2372 				if (err && !ret)
2373 					ret = err;
2374 			}
2375 			up(&inode->i_sem);
2376 		}
2377 	}
2378 	return ret;
2379 }
2380 
2381 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
2382 {
2383 	unsigned long end;
2384 	struct vm_area_struct * vma;
2385 	int unmapped_error, error = -EINVAL;
2386 
2387 	down_read(&current->mm->mmap_sem);
2388 	if (start & ~PAGE_MASK)
2389 		goto out;
2390 	len = (len + ~PAGE_MASK) & PAGE_MASK;
2391 	end = start + len;
2392 	if (end < start)
2393 		goto out;
2394 	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
2395 		goto out;
2396 	if ((flags & MS_ASYNC) && (flags & MS_SYNC))
2397 		goto out;
2398 
2399 	error = 0;
2400 	if (end == start)
2401 		goto out;
2402 	/*
2403 	 * If the interval [start,end) covers some unmapped address ranges,
2404 	 * just ignore them, but return -ENOMEM at the end.
2405 	 */
2406 	vma = find_vma(current->mm, start);
2407 	unmapped_error = 0;
2408 	for (;;) {
2409 		/* Still start < end. */
2410 		error = -ENOMEM;
2411 		if (!vma)
2412 			goto out;
2413 		/* Here start < vma->vm_end. */
2414 		if (start < vma->vm_start) {
2415 			unmapped_error = -ENOMEM;
2416 			start = vma->vm_start;
2417 		}
2418 		/* Here vma->vm_start <= start < vma->vm_end. */
2419 		if (end <= vma->vm_end) {
2420 			if (start < end) {
2421 				error = msync_interval(vma, start, end, flags);
2422 				if (error)
2423 					goto out;
2424 			}
2425 			error = unmapped_error;
2426 			goto out;
2427 		}
2428 		/* Here vma->vm_start <= start < vma->vm_end < end. */
2429 		error = msync_interval(vma, start, vma->vm_end, flags);
2430 		if (error)
2431 			goto out;
2432 		start = vma->vm_end;
2433 		vma = vma->vm_next;
2434 	}
2435 out:
2436 	up_read(&current->mm->mmap_sem);
2437 	return error;
2438 }
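
/*
 * Userspace sketch for msync(2) on a shared file mapping
 * (illustrative only; fd and len are made up):
 *
 *	#include <sys/mman.h>
 *
 *	char *map = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_SHARED, fd, 0);
 *	map[0] = 'x';			// dirty the first page
 *	msync(map, len, MS_ASYNC);	// start writeout, don't wait
 *	msync(map, len, MS_SYNC);	// write back and wait for it
 */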
2439 
2440 static inline void setup_read_behavior(struct vm_area_struct * vma,
2441 	int behavior)
2442 {
2443 	VM_ClearReadHint(vma);
2444 	switch(behavior) {
2445 		case MADV_SEQUENTIAL:
2446 			vma->vm_flags |= VM_SEQ_READ;
2447 			break;
2448 		case MADV_RANDOM:
2449 			vma->vm_flags |= VM_RAND_READ;
2450 			break;
2451 		default:
2452 			break;
2453 	}
2454 	return;
2455 }
2456 
2457 static long madvise_fixup_start(struct vm_area_struct * vma,
2458 	unsigned long end, int behavior)
2459 {
2460 	struct vm_area_struct * n;
2461 	struct mm_struct * mm = vma->vm_mm;
2462 
2463 	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2464 	if (!n)
2465 		return -EAGAIN;
2466 	*n = *vma;
2467 	n->vm_end = end;
2468 	setup_read_behavior(n, behavior);
2469 	n->vm_raend = 0;
2470 	if (n->vm_file)
2471 		get_file(n->vm_file);
2472 	if (n->vm_ops && n->vm_ops->open)
2473 		n->vm_ops->open(n);
2474 	vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
2475 	lock_vma_mappings(vma);
2476 	spin_lock(&mm->page_table_lock);
2477 	vma->vm_start = end;
2478 	__insert_vm_struct(mm, n);
2479 	spin_unlock(&mm->page_table_lock);
2480 	unlock_vma_mappings(vma);
2481 	return 0;
2482 }
2483 
2484 static long madvise_fixup_end(struct vm_area_struct * vma,
2485 	unsigned long start, int behavior)
2486 {
2487 	struct vm_area_struct * n;
2488 	struct mm_struct * mm = vma->vm_mm;
2489 
2490 	n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2491 	if (!n)
2492 		return -EAGAIN;
2493 	*n = *vma;
2494 	n->vm_start = start;
2495 	n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
2496 	setup_read_behavior(n, behavior);
2497 	n->vm_raend = 0;
2498 	if (n->vm_file)
2499 		get_file(n->vm_file);
2500 	if (n->vm_ops && n->vm_ops->open)
2501 		n->vm_ops->open(n);
2502 	lock_vma_mappings(vma);
2503 	spin_lock(&mm->page_table_lock);
2504 	vma->vm_end = start;
2505 	__insert_vm_struct(mm, n);
2506 	spin_unlock(&mm->page_table_lock);
2507 	unlock_vma_mappings(vma);
2508 	return 0;
2509 }
2510 
2511 static long madvise_fixup_middle(struct vm_area_struct * vma,
2512 	unsigned long start, unsigned long end, int behavior)
2513 {
2514 	struct vm_area_struct * left, * right;
2515 	struct mm_struct * mm = vma->vm_mm;
2516 
2517 	left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2518 	if (!left)
2519 		return -EAGAIN;
2520 	right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2521 	if (!right) {
2522 		kmem_cache_free(vm_area_cachep, left);
2523 		return -EAGAIN;
2524 	}
2525 	*left = *vma;
2526 	*right = *vma;
2527 	left->vm_end = start;
2528 	right->vm_start = end;
2529 	right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
2530 	left->vm_raend = 0;
2531 	right->vm_raend = 0;
2532 	if (vma->vm_file)
2533 		atomic_add(2, &vma->vm_file->f_count);
2534 
2535 	if (vma->vm_ops && vma->vm_ops->open) {
2536 		vma->vm_ops->open(left);
2537 		vma->vm_ops->open(right);
2538 	}
2539 	vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
2540 	vma->vm_raend = 0;
2541 	lock_vma_mappings(vma);
2542 	spin_lock(&mm->page_table_lock);
2543 	vma->vm_start = start;
2544 	vma->vm_end = end;
2545 	setup_read_behavior(vma, behavior);
2546 	__insert_vm_struct(mm, left);
2547 	__insert_vm_struct(mm, right);
2548 	spin_unlock(&mm->page_table_lock);
2549 	unlock_vma_mappings(vma);
2550 	return 0;
2551 }
2552 
2553 /*
2554  * We can potentially split a vm area into separate
2555  * areas, each area with its own behavior.
2556  */
2557 static long madvise_behavior(struct vm_area_struct * vma,
2558 	unsigned long start, unsigned long end, int behavior)
2559 {
2560 	int error = 0;
2561 
2562 	/* This caps the number of vma's this process can own */
2563 	if (vma->vm_mm->map_count > max_map_count)
2564 		return -ENOMEM;
2565 
2566 	if (start == vma->vm_start) {
2567 		if (end == vma->vm_end) {
2568 			setup_read_behavior(vma, behavior);
2569 			vma->vm_raend = 0;
2570 		} else
2571 			error = madvise_fixup_start(vma, end, behavior);
2572 	} else {
2573 		if (end == vma->vm_end)
2574 			error = madvise_fixup_end(vma, start, behavior);
2575 		else
2576 			error = madvise_fixup_middle(vma, start, end, behavior);
2577 	}
2578 
2579 	return error;
2580 }
2581 
2582 /*
2583  * Schedule all required I/O operations, then run the disk queue
2584  * to make sure they are started.  Do not wait for completion.
2585  */
2586 static long madvise_willneed(struct vm_area_struct * vma,
2587 	unsigned long start, unsigned long end)
2588 {
2589 	long error = -EBADF;
2590 	struct file * file;
2591 	struct inode * inode;
2592 	unsigned long size;
2593 
2594 	/* Doesn't work if there's no mapped file. */
2595 	if (!vma->vm_file)
2596 		return error;
2597 	file = vma->vm_file;
2598 	inode = file->f_dentry->d_inode;
2599 	if (!inode->i_mapping->a_ops->readpage)
2600 		return error;
2601 	size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2602 
2603 	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2604 	if (end > vma->vm_end)
2605 		end = vma->vm_end;
2606 	end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2607 
2608 	error = -EIO;
2609 
2610 	/* round to cluster boundaries if this isn't a "random" area. */
2611 	if (!VM_RandomReadHint(vma)) {
2612 		start = CLUSTER_OFFSET(start);
2613 		end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
2614 
2615 		while ((start < end) && (start < size)) {
2616 			error = read_cluster_nonblocking(file, start, size);
2617 			start += CLUSTER_PAGES;
2618 			if (error < 0)
2619 				break;
2620 		}
2621 	} else {
2622 		while ((start < end) && (start < size)) {
2623 			error = page_cache_read(file, start);
2624 			start++;
2625 			if (error < 0)
2626 				break;
2627 		}
2628 	}
2629 
2630 	/* Don't wait for someone else to push these requests. */
2631 	run_task_queue(&tq_disk);
2632 
2633 	return error;
2634 }
2635 
2636 /*
2637  * Application no longer needs these pages.  If the pages are dirty,
2638  * it's OK to just throw them away.  The app will be more careful about
2639  * data it wants to keep.  Be sure to free swap resources too.  The
2640  * zap_page_range call sets things up for refill_inactive to actually free
2641  * these pages later if no one else has touched them in the meantime,
2642  * although we could add these pages to a global reuse list for
2643  * refill_inactive to pick up before reclaiming other pages.
2644  *
2645  * NB: This interface discards data rather than pushes it out to swap,
2646  * as some implementations do.  This has performance implications for
2647  * applications like large transactional databases which want to discard
2648  * pages in anonymous maps after committing to backing store the data
2649  * that was kept in them.  There is no reason to write this data out to
2650  * the swap area if the application is discarding it.
2651  *
2652  * An interface that causes the system to free clean pages and flush
2653  * dirty pages is already available as msync(MS_INVALIDATE).
2654  */
2655 static long madvise_dontneed(struct vm_area_struct * vma,
2656 	unsigned long start, unsigned long end)
2657 {
2658 	if (vma->vm_flags & VM_LOCKED)
2659 		return -EINVAL;
2660 
2661 	zap_page_range(vma->vm_mm, start, end - start);
2662 	return 0;
2663 }
2664 
2665 static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
2666 	unsigned long end, int behavior)
2667 {
2668 	long error = -EBADF;
2669 
2670 	switch (behavior) {
2671 	case MADV_NORMAL:
2672 	case MADV_SEQUENTIAL:
2673 	case MADV_RANDOM:
2674 		error = madvise_behavior(vma, start, end, behavior);
2675 		break;
2676 
2677 	case MADV_WILLNEED:
2678 		error = madvise_willneed(vma, start, end);
2679 		break;
2680 
2681 	case MADV_DONTNEED:
2682 		error = madvise_dontneed(vma, start, end);
2683 		break;
2684 
2685 	default:
2686 		error = -EINVAL;
2687 		break;
2688 	}
2689 
2690 	return error;
2691 }
2692 
2693 /*
2694  * The madvise(2) system call.
2695  *
2696  * Applications can use madvise() to advise the kernel how it should
2697  * handle paging I/O in this VM area.  The idea is to help the kernel
2698  * use appropriate read-ahead and caching techniques.  The information
2699  * provided is advisory only, and can be safely disregarded by the
2700  * kernel without affecting the correct operation of the application.
2701  *
2702  * behavior values:
2703  *  MADV_NORMAL - the default behavior is to read clusters.  This
2704  *		results in some read-ahead and read-behind.
2705  *  MADV_RANDOM - the system should read the minimum amount of data
2706  *		on any access, since it is unlikely that the appli-
2707  *		cation will need more than what it asks for.
2708  *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
2709  *		once, so they can be aggressively read ahead, and
2710  *		can be freed soon after they are accessed.
2711  *  MADV_WILLNEED - the application is notifying the system to read
2712  *		some pages ahead.
2713  *  MADV_DONTNEED - the application is finished with the given range,
2714  *		so the kernel can free resources associated with it.
2715  *
2716  * return values:
2717  *  zero    - success
2718  *  -EINVAL - start + len < 0, start is not page-aligned,
2719  *		"behavior" is not a valid value, or application
2720  *		is attempting to release locked or shared pages.
2721  *  -ENOMEM - addresses in the specified range are not currently
2722  *		mapped, or are outside the AS of the process.
2723  *  -EIO    - an I/O error occurred while paging in data.
2724  *  -EBADF  - map exists, but area maps something that isn't a file.
2725  *  -EAGAIN - a kernel resource was temporarily unavailable.
2726  */
2727 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
2728 {
2729 	unsigned long end;
2730 	struct vm_area_struct * vma;
2731 	int unmapped_error = 0;
2732 	int error = -EINVAL;
2733 
2734 	down_write(&current->mm->mmap_sem);
2735 
2736 	if (start & ~PAGE_MASK)
2737 		goto out;
2738 	len = (len + ~PAGE_MASK) & PAGE_MASK;
2739 	end = start + len;
2740 	if (end < start)
2741 		goto out;
2742 
2743 	error = 0;
2744 	if (end == start)
2745 		goto out;
2746 
2747 	/*
2748 	 * If the interval [start,end) covers some unmapped address
2749 	 * ranges, just ignore them, but return -ENOMEM at the end.
2750 	 */
2751 	vma = find_vma(current->mm, start);
2752 	for (;;) {
2753 		/* Still start < end. */
2754 		error = -ENOMEM;
2755 		if (!vma)
2756 			goto out;
2757 
2758 		/* Here start < vma->vm_end. */
2759 		if (start < vma->vm_start) {
2760 			unmapped_error = -ENOMEM;
2761 			start = vma->vm_start;
2762 		}
2763 
2764 		/* Here vma->vm_start <= start < vma->vm_end. */
2765 		if (end <= vma->vm_end) {
2766 			if (start < end) {
2767 				error = madvise_vma(vma, start, end,
2768 							behavior);
2769 				if (error)
2770 					goto out;
2771 			}
2772 			error = unmapped_error;
2773 			goto out;
2774 		}
2775 
2776 		/* Here vma->vm_start <= start < vma->vm_end < end. */
2777 		error = madvise_vma(vma, start, vma->vm_end, behavior);
2778 		if (error)
2779 			goto out;
2780 		start = vma->vm_end;
2781 		vma = vma->vm_next;
2782 	}
2783 
2784 out:
2785 	up_write(&current->mm->mmap_sem);
2786 	return error;
2787 }
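
/*
 * Userspace sketch for madvise(2) (illustrative only; the mapping
 * and length are made up).  MADV_SEQUENTIAL enables the fixed-window
 * readahead done by nopage_sequential_readahead() above:
 *
 *	#include <sys/mman.h>
 *
 *	void *map = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *	madvise(map, len, MADV_SEQUENTIAL);	// will stream through it once
 *	madvise(map, len, MADV_WILLNEED);	// or: start readahead right away
 */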
2788 
2789 /*
2790  * Later we can get more picky about what "in core" means precisely.
2791  * For now, simply check to see if the page is in the page cache,
2792  * and is up to date; i.e. that no page-in operation would be required
2793  * at this time if an application were to map and access this page.
2794  */
2795 static unsigned char mincore_page(struct vm_area_struct * vma,
2796 	unsigned long pgoff)
2797 {
2798 	unsigned char present = 0;
2799 	struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping;
2800 	struct page * page, ** hash = page_hash(as, pgoff);
2801 
2802 	spin_lock(&pagecache_lock);
2803 	page = __find_page_nolock(as, pgoff, *hash);
2804 	if ((page) && (Page_Uptodate(page)))
2805 		present = 1;
2806 	spin_unlock(&pagecache_lock);
2807 
2808 	return present;
2809 }
2810 
2811 /*
2812  * Do a chunk of "sys_mincore()". We've already checked
2813  * all the arguments, we hold the mmap semaphore: we should
2814  * just return the amount of info we're asked for.
2815  */
2816 static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages)
2817 {
2818 	unsigned long i, nr, pgoff;
2819 	struct vm_area_struct *vma = find_vma(current->mm, addr);
2820 
2821 	/*
2822 	 * find_vma() didn't find anything above us, or we're
2823 	 * in an unmapped hole in the address space: ENOMEM.
2824 	 */
2825 	if (!vma || addr < vma->vm_start)
2826 		return -ENOMEM;
2827 
2828 	/*
2829 	 * Ok, got it. But check whether it's a segment we support
2830 	 * mincore() on. Right now, we don't do any anonymous mappings.
2831 	 *
2832 	 * FIXME: This is just stupid. And returning ENOMEM is
2833 	 * stupid too. We should just look at the page tables. But
2834 	 * this is what we've traditionally done, so we'll just
2835 	 * continue doing it.
2836 	 */
2837 	if (!vma->vm_file)
2838 		return -ENOMEM;
2839 
2840 	/*
2841 	 * Calculate how many pages there are left in the vma, and
2842 	 * what the pgoff is for our address.
2843 	 */
2844 	nr = (vma->vm_end - addr) >> PAGE_SHIFT;
2845 	if (nr > pages)
2846 		nr = pages;
2847 
2848 	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
2849 	pgoff += vma->vm_pgoff;
2850 
2851 	/* And then we just fill the sucker in.. */
2852 	for (i = 0 ; i < nr; i++, pgoff++)
2853 		vec[i] = mincore_page(vma, pgoff);
2854 
2855 	return nr;
2856 }
2857 
2858 /*
2859  * The mincore(2) system call.
2860  *
2861  * mincore() returns the memory residency status of the pages in the
2862  * current process's address space specified by [addr, addr + len).
2863  * The status is returned in a vector of bytes.  The least significant
2864  * bit of each byte is 1 if the referenced page is in memory, otherwise
2865  * it is zero.
2866  *
2867  * Because the status of a page can change after mincore() checks it
2868  * but before it returns to the application, the returned vector may
2869  * contain stale information.  Only locked pages are guaranteed to
2870  * remain in memory.
2871  *
2872  * return values:
2873  *  zero    - success
2874  *  -EFAULT - vec points to an illegal address
2875  *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE
2876  *  -ENOMEM - Addresses in the range [addr, addr + len] are
2877  *		invalid for the address space of this process, or
2878  *		specify one or more pages which are not currently
2879  *		mapped
2880  *  -EAGAIN - A kernel resource was temporarily unavailable.
2881  */
2882 asmlinkage long sys_mincore(unsigned long start, size_t len, unsigned char *vec)
2883 {
2884 	long retval;
2885 	unsigned long pages;
2886 	unsigned char *tmp;
2887 
2888 	/* Check the start address: needs to be page-aligned.. */
2889  	if (start & ~PAGE_CACHE_MASK)
2890 		return -EINVAL;
2891 
2892 	/* ..and we need to be passed a valid user-space range */
2893 	if (!access_ok(VERIFY_READ, (void *) start, len))
2894 		return -ENOMEM;
2895 
2896 	/* This also avoids any overflows on PAGE_CACHE_ALIGN */
2897 	pages = len >> PAGE_SHIFT;
2898 	pages += (len & ~PAGE_MASK) != 0;
2899 
2900 	if (!access_ok(VERIFY_WRITE, vec, pages))
2901 		return -EFAULT;
2902 
2903 	tmp = (void *) __get_free_page(GFP_USER);
2904 	if (!tmp)
2905 		return -EAGAIN;
2906 
2907 	retval = 0;
2908 	while (pages) {
2909 		/*
2910 		 * Do at most PAGE_SIZE entries per iteration, due to
2911 		 * the temporary buffer size.
2912 		 */
2913 		down_read(&current->mm->mmap_sem);
2914 		retval = do_mincore(start, tmp, min(pages, PAGE_SIZE));
2915 		up_read(&current->mm->mmap_sem);
2916 
2917 		if (retval <= 0)
2918 			break;
2919 		if (copy_to_user(vec, tmp, retval)) {
2920 			retval = -EFAULT;
2921 			break;
2922 		}
2923 		pages -= retval;
2924 		vec += retval;
2925 		start += retval << PAGE_SHIFT;
2926 		retval = 0;
2927 	}
2928 	free_page((unsigned long) tmp);
2929 	return retval;
2930 }
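
/*
 * Userspace sketch for mincore(2) (illustrative only; map and len
 * are made up).  One status byte is returned per page, bit 0 set if
 * the page is resident:
 *
 *	#include <sys/mman.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *	#include <unistd.h>
 *
 *	size_t pages = (len + getpagesize() - 1) / getpagesize();
 *	unsigned char *vec = malloc(pages);
 *	if (mincore(map, len, vec) == 0 && (vec[0] & 1))
 *		printf("first page is resident\n");
 */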
2931 
2932 static inline
2933 struct page *__read_cache_page(struct address_space *mapping,
2934 				unsigned long index,
2935 				int (*filler)(void *,struct page*),
2936 				void *data)
2937 {
2938 	struct page **hash = page_hash(mapping, index);
2939 	struct page *page, *cached_page = NULL;
2940 	int err;
2941 repeat:
2942 	page = __find_get_page(mapping, index, hash);
2943 	if (!page) {
2944 		if (!cached_page) {
2945 			cached_page = page_cache_alloc(mapping);
2946 			if (!cached_page)
2947 				return ERR_PTR(-ENOMEM);
2948 		}
2949 		page = cached_page;
2950 		if (add_to_page_cache_unique(page, mapping, index, hash))
2951 			goto repeat;
2952 		cached_page = NULL;
2953 		err = filler(data, page);
2954 		if (err < 0) {
2955 			page_cache_release(page);
2956 			page = ERR_PTR(err);
2957 		}
2958 	}
2959 	if (cached_page)
2960 		page_cache_release(cached_page);
2961 	return page;
2962 }
2963 
2964 /*
2965  * Read into the page cache. If a page already exists,
2966  * and Page_Uptodate() is not set, try to fill the page.
2967  */
2968 struct page *read_cache_page(struct address_space *mapping,
2969 				unsigned long index,
2970 				int (*filler)(void *,struct page*),
2971 				void *data)
2972 {
2973 	struct page *page;
2974 	int err;
2975 
2976 retry:
2977 	page = __read_cache_page(mapping, index, filler, data);
2978 	if (IS_ERR(page))
2979 		goto out;
2980 	mark_page_accessed(page);
2981 	if (Page_Uptodate(page))
2982 		goto out;
2983 
2984 	lock_page(page);
2985 	if (!page->mapping) {
2986 		UnlockPage(page);
2987 		page_cache_release(page);
2988 		goto retry;
2989 	}
2990 	if (Page_Uptodate(page)) {
2991 		UnlockPage(page);
2992 		goto out;
2993 	}
2994 	err = filler(data, page);
2995 	if (err < 0) {
2996 		page_cache_release(page);
2997 		page = ERR_PTR(err);
2998 	}
2999  out:
3000 	return page;
3001 }
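
/*
 * Sketch of a read_cache_page() caller (illustrative only; the
 * filler and its use of ->readpage are an example, not an interface
 * defined here).  The filler is handed the opaque data pointer and a
 * locked, not-up-to-date page and is expected to start the read;
 * callers then wait for the page and check Page_Uptodate():
 *
 *	static int example_filler(void *data, struct page *page)
 *	{
 *		struct file *file = data;
 *		return page->mapping->a_ops->readpage(file, page);
 *	}
 *
 *	page = read_cache_page(mapping, index, example_filler, file);
 *	if (!IS_ERR(page)) {
 *		wait_on_page(page);
 *		if (!Page_Uptodate(page))
 *			... handle the read error ...
 *	}
 */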
3002 
3003 static inline struct page * __grab_cache_page(struct address_space *mapping,
3004 				unsigned long index, struct page **cached_page)
3005 {
3006 	struct page *page, **hash = page_hash(mapping, index);
3007 repeat:
3008 	page = __find_lock_page(mapping, index, hash);
3009 	if (!page) {
3010 		if (!*cached_page) {
3011 			*cached_page = page_cache_alloc(mapping);
3012 			if (!*cached_page)
3013 				return NULL;
3014 		}
3015 		page = *cached_page;
3016 		if (add_to_page_cache_unique(page, mapping, index, hash))
3017 			goto repeat;
3018 		*cached_page = NULL;
3019 	}
3020 	return page;
3021 }
3022 
3023 inline void remove_suid(struct inode *inode)
3024 {
3025 	unsigned int mode;
3026 
3027 	/* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
3028 	mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
3029 
3030 	/* was any of the uid bits set? */
3031 	mode &= inode->i_mode;
3032 	if (mode && !capable(CAP_FSETID)) {
3033 		inode->i_mode &= ~mode;
3034 		mark_inode_dirty(inode);
3035 	}
3036 }
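
/*
 * Worked example of the bit trick above: S_IXGRP is 00010 and S_ISGID
 * is 02000, so S_ISGID/S_IXGRP is 0200 and (i_mode & S_IXGRP) * 0200
 * yields S_ISGID exactly when the group-execute bit is set.  For a
 * mode 2755 file the mask is therefore S_ISUID|S_ISGID (and only
 * S_ISGID survives the &= with i_mode); for a mode 4644 file it is
 * just S_ISUID.  Whatever remains is then cleared from the inode.
 */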
3037 
3038 /*
3039  * precheck_file_write():
3040  * Check the conditions on a file descriptor prior to beginning a write
3041  * on it.  Contains the common precheck code for both buffered and direct
3042  * IO.
3043  */
3044 int precheck_file_write(struct file *file, struct inode *inode,
3045 			size_t *count, loff_t *ppos)
3046 {
3047 	ssize_t		err;
3048 	unsigned long	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
3049 	loff_t		pos = *ppos;
3050 
3051 	err = -EINVAL;
3052 	if (pos < 0)
3053 		goto out;
3054 
3055 	err = file->f_error;
3056 	if (err) {
3057 		file->f_error = 0;
3058 		goto out;
3059 	}
3060 
3061 	/* FIXME: this is for backwards compatibility with 2.4 */
3062 	if (!S_ISBLK(inode->i_mode) && (file->f_flags & O_APPEND))
3063 		*ppos = pos = inode->i_size;
3064 
3065 	/*
3066 	 * Check whether we've reached the file size limit.
3067 	 */
3068 	err = -EFBIG;
3069 
3070 	if (!S_ISBLK(inode->i_mode) && limit != RLIM_INFINITY) {
3071 		if (pos >= limit) {
3072 			send_sig(SIGXFSZ, current, 0);
3073 			goto out;
3074 		}
3075 		if (pos > 0xFFFFFFFFULL || *count > limit - (u32)pos) {
3076 			/* send_sig(SIGXFSZ, current, 0); */
3077 			*count = limit - (u32)pos;
3078 		}
3079 	}
3080 
3081 	/*
3082 	 *	LFS rule
3083 	 */
3084 	if ( pos + *count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
3085 		if (pos >= MAX_NON_LFS) {
3086 			send_sig(SIGXFSZ, current, 0);
3087 			goto out;
3088 		}
3089 		if (*count > MAX_NON_LFS - (u32)pos) {
3090 			/* send_sig(SIGXFSZ, current, 0); */
3091 			*count = MAX_NON_LFS - (u32)pos;
3092 		}
3093 	}
3094 
3095 	/*
3096 	 *	Are we about to exceed the fs block limit ?
3097 	 *
3098 	 *	If we have already written some data it becomes a short write.
3099 	 *	If we have exceeded the limit without writing any data we
3100 	 *	send a signal and return EFBIG.
3101 	 *
3102 	 *	Linus' frestrict idea will clean these up nicely..
3103 	 */
3104 
3105 	if (!S_ISBLK(inode->i_mode)) {
3106 		if (pos >= inode->i_sb->s_maxbytes)
3107 		{
3108 			if (*count || pos > inode->i_sb->s_maxbytes) {
3109 				send_sig(SIGXFSZ, current, 0);
3110 				err = -EFBIG;
3111 				goto out;
3112 			}
3113 			/* zero-length writes at ->s_maxbytes are OK */
3114 		}
3115 
3116 		if (pos + *count > inode->i_sb->s_maxbytes)
3117 			*count = inode->i_sb->s_maxbytes - pos;
3118 	} else {
3119 		if (is_read_only(inode->i_rdev)) {
3120 			err = -EPERM;
3121 			goto out;
3122 		}
3123 		if (pos >= inode->i_size) {
3124 			if (*count || pos > inode->i_size) {
3125 				err = -ENOSPC;
3126 				goto out;
3127 			}
3128 		}
3129 
3130 		if (pos + *count > inode->i_size)
3131 			*count = inode->i_size - pos;
3132 	}
3133 
3134 	err = 0;
3135 out:
3136 	return err;
3137 }
3138 
3139 /*
3140  * Write to a file through the page cache.
3141  *
3142  * We currently put everything into the page cache prior to writing it.
3143  * This is not a problem when writing full pages. With partial pages,
3144  * however, we first have to read the data into the cache, then
3145  * dirty the page, and finally schedule it for writing. Alternatively, we
3146  * could write-through just the portion of data that would go into that
3147  * page, but that would kill performance for applications that write data
3148  * line by line, and it's prone to race conditions.
3149  *
3150  * Note that this routine doesn't try to keep track of dirty pages. Each
3151  * file system has to do this all by itself, unfortunately.
3152  *							okir@monad.swb.de
3153  */
3154 ssize_t
3155 do_generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
3156 {
3157 	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
3158 	struct inode	*inode = mapping->host;
3159 	loff_t		pos;
3160 	struct page	*page, *cached_page;
3161 	ssize_t		written;
3162 	long		status = 0;
3163 	ssize_t		err;
3164 	unsigned	bytes;
3165 
3166 	cached_page = NULL;
3167 	pos = *ppos;
3168 	written = 0;
3169 
3170 	err = precheck_file_write(file, inode, &count, &pos);
3171 	if (err != 0 || count == 0)
3172 		goto out;
3173 
3174 	remove_suid(inode);
3175 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
3176 	mark_inode_dirty_sync(inode);
3177 
3178 	do {
3179 		unsigned long index, offset;
3180 		long page_fault;
3181 		char *kaddr;
3182 
3183 		/*
3184 		 * Try to find the page in the cache. If it isn't there,
3185 		 * allocate a free page.
3186 		 */
3187 		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
3188 		index = pos >> PAGE_CACHE_SHIFT;
3189 		bytes = PAGE_CACHE_SIZE - offset;
3190 		if (bytes > count)
3191 			bytes = count;
3192 
3193 		/*
3194 		 * Bring in the user page that we will copy from _first_.
3195 		 * Otherwise there's a nasty deadlock on copying from the
3196 		 * same page as we're writing to, without it being marked
3197 		 * up-to-date.
3198 		 */
3199 		{ volatile unsigned char dummy;
3200 			__get_user(dummy, buf);
3201 			__get_user(dummy, buf+bytes-1);
3202 		}
3203 
3204 		status = -ENOMEM;	/* we'll assign it later anyway */
3205 		page = __grab_cache_page(mapping, index, &cached_page);
3206 		if (!page)
3207 			break;
3208 
3209 		/* We have exclusive IO access to the page.. */
3210 		if (!PageLocked(page)) {
3211 			PAGE_BUG(page);
3212 		}
3213 
3214 		kaddr = kmap(page);
3215 		status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
3216 		if (status)
3217 			goto sync_failure;
3218 		page_fault = __copy_from_user(kaddr+offset, buf, bytes);
3219 		flush_dcache_page(page);
3220 		status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
3221 		if (page_fault)
3222 			goto fail_write;
3223 		if (!status)
3224 			status = bytes;
3225 
3226 		if (status >= 0) {
3227 			written += status;
3228 			count -= status;
3229 			pos += status;
3230 			buf += status;
3231 		}
3232 unlock:
3233 		kunmap(page);
3234 		/* Mark it unlocked again and drop the page.. */
3235 		SetPageReferenced(page);
3236 		UnlockPage(page);
3237 		page_cache_release(page);
3238 
3239 		if (status < 0)
3240 			break;
3241 	} while (count);
3242 done:
3243 	*ppos = pos;
3244 
3245 	if (cached_page)
3246 		page_cache_release(cached_page);
3247 
3248 	/* For now, when the user asks for O_SYNC, we'll actually
3249 	 * provide O_DSYNC. */
3250 	if (status >= 0) {
3251 		if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
3252 			status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA);
3253 	}
3254 
3255 	err = written ? written : status;
3256 out:
3257 
3258 	return err;
3259 fail_write:
3260 	status = -EFAULT;
3261 	goto unlock;
3262 
3263 sync_failure:
3264 	/*
3265 	 * If blocksize < pagesize, prepare_write() may have instantiated a
3266 	 * few blocks outside i_size.  Trim these off again.
3267 	 */
3268 	kunmap(page);
3269 	UnlockPage(page);
3270 	page_cache_release(page);
3271 	if (pos + bytes > inode->i_size)
3272 		vmtruncate(inode, inode->i_size);
3273 	goto done;
3274 }
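
/*
 * The ->prepare_write()/->commit_write() pair used above comes from
 * the filesystem's address_space_operations.  A minimal sketch of a
 * typical block-based setup (the "example" names are made up; the
 * generic helpers live in fs/buffer.c):
 *
 *	static int example_prepare_write(struct file *file, struct page *page,
 *					 unsigned from, unsigned to)
 *	{
 *		return block_prepare_write(page, from, to, example_get_block);
 *	}
 *
 *	static struct address_space_operations example_aops = {
 *		readpage:	example_readpage,
 *		writepage:	example_writepage,
 *		prepare_write:	example_prepare_write,
 *		commit_write:	generic_commit_write,
 *	};
 */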
3275 
3276 ssize_t
3277 do_generic_direct_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
3278 {
3279 	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
3280 	struct inode	*inode = mapping->host;
3281 	loff_t		pos;
3282 	ssize_t		written;
3283 	long		status = 0;
3284 	ssize_t		err;
3285 
3286 	pos = *ppos;
3287 	written = 0;
3288 
3289 	err = precheck_file_write(file, inode, &count, &pos);
3290 	if (err != 0 || count == 0)
3291 		goto out;
3292 
3293 	if (!(file->f_flags & O_DIRECT))
3294 		BUG();
3295 
3296 	remove_suid(inode);
3297 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
3298 	mark_inode_dirty_sync(inode);
3299 
3300 	written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
3301 	if (written > 0) {
3302 		loff_t end = pos + written;
3303 		if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
3304 			inode->i_size = end;
3305 			mark_inode_dirty(inode);
3306 		}
3307 		*ppos = end;
3308 		invalidate_inode_pages2(mapping);
3309 	}
3310 	/*
3311 	 * Sync the fs metadata but not the minor inode changes and
3312 	 * of course not the data as we did direct DMA for the IO.
3313 	 */
3314 	if (written >= 0 && (file->f_flags & O_SYNC))
3315 		status = generic_osync_inode(inode, OSYNC_METADATA);
3316 
3317 	err = written ? written : status;
3318 out:
3319 	return err;
3320 }
3321 
3322 static int do_odirect_fallback(struct file *file, struct inode *inode,
3323 			       const char *buf, size_t count, loff_t *ppos)
3324 {
3325 	ssize_t ret;
3326 	int err;
3327 
3328 	down(&inode->i_sem);
3329 	ret = do_generic_file_write(file, buf, count, ppos);
3330 	if (ret > 0) {
3331 		err = do_fdatasync(file);
3332 		if (err)
3333 			ret = err;
3334 	}
3335 	up(&inode->i_sem);
3336 	return ret;
3337 }
3338 
3339 ssize_t
3340 generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
3341 {
3342 	struct inode	*inode = file->f_dentry->d_inode->i_mapping->host;
3343 	ssize_t		err;
3344 
3345 	if ((ssize_t) count < 0)
3346 		return -EINVAL;
3347 
3348 	if (!access_ok(VERIFY_READ, buf, count))
3349 		return -EFAULT;
3350 
3351 	if (file->f_flags & O_DIRECT) {
3352 		/* do_generic_direct_write may drop i_sem during the
3353 		   actual IO */
3354 		down_read(&inode->i_alloc_sem);
3355 		down(&inode->i_sem);
3356 		err = do_generic_direct_write(file, buf, count, ppos);
3357 		up(&inode->i_sem);
3358 		up_read(&inode->i_alloc_sem);
3359 		if (unlikely(err == -ENOTBLK))
3360 			err = do_odirect_fallback(file, inode, buf, count, ppos);
3361 	} else {
3362 		down(&inode->i_sem);
3363 		err = do_generic_file_write(file, buf, count, ppos);
3364 		up(&inode->i_sem);
3365 	}
3366 
3367 	return err;
3368 }
3369 
3370 void __init page_cache_init(unsigned long mempages)
3371 {
3372 	unsigned long htable_size, order;
3373 
3374 	htable_size = mempages;
3375 	htable_size *= sizeof(struct page *);
3376 	for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
3377 		;
3378 
3379 	do {
3380 		unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
3381 
3382 		page_hash_bits = 0;
3383 		while((tmp >>= 1UL) != 0UL)
3384 			page_hash_bits++;
3385 
3386 		page_hash_table = (struct page **)
3387 			__get_free_pages(GFP_ATOMIC, order);
3388 	} while(page_hash_table == NULL && --order > 0);
3389 
3390 	printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
3391 	       (1 << page_hash_bits), order, (PAGE_SIZE << order));
3392 	if (!page_hash_table)
3393 		panic("Failed to allocate page hash table\n");
3394 	memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
3395 }
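
/*
 * Sizing example for the loop above (assuming 4 KiB pages and 4-byte
 * pointers): with mempages = 32768, i.e. 128 MiB of RAM, htable_size
 * is 32768 * 4 = 128 KiB, the order loop stops at 5 (PAGE_SIZE << 5 ==
 * 128 KiB), the table holds 32768 buckets and page_hash_bits ends up
 * as 15.
 */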
3396