1 /*
2 * linux/mm/filemap.c
3 *
4 * Copyright (C) 1994-2006 Linus Torvalds
5 */
6
7 /*
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
11 */
12 #include <linux/module.h>
13 #include <linux/slab.h>
14 #include <linux/shm.h>
15 #include <linux/mman.h>
16 #include <linux/locks.h>
17 #include <linux/pagemap.h>
18 #include <linux/swap.h>
19 #include <linux/smp_lock.h>
20 #include <linux/blkdev.h>
21 #include <linux/file.h>
22 #include <linux/swapctl.h>
23 #include <linux/init.h>
24 #include <linux/mm.h>
25 #include <linux/iobuf.h>
26
27 #include <asm/pgalloc.h>
28 #include <asm/uaccess.h>
29 #include <asm/mman.h>
30
31 #include <linux/highmem.h>
32
33 /*
34 * Shared mappings implemented 30.11.1994. It's not fully working yet,
35 * though.
36 *
37 * Shared mappings now work. 15.8.1995 Bruno.
38 *
39 * finished 'unifying' the page and buffer cache and SMP-threaded the
40 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
41 *
42 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
43 */
44
45 unsigned long page_cache_size;
46 unsigned int page_hash_bits;
47 struct page **page_hash_table;
48
49 int vm_max_readahead = 31;
50 int vm_min_readahead = 3;
51 EXPORT_SYMBOL(vm_max_readahead);
52 EXPORT_SYMBOL(vm_min_readahead);
53
54
55 spinlock_cacheline_t pagecache_lock_cacheline = {SPIN_LOCK_UNLOCKED};
56 /*
57 * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock
58 * with the pagecache_lock held.
59 *
60 * Ordering:
61 * swap_lock ->
62 * pagemap_lru_lock ->
63 * pagecache_lock
64 */
65 spinlock_cacheline_t pagemap_lru_lock_cacheline = {SPIN_LOCK_UNLOCKED};
66
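/*
 * Illustrative sketch (not part of the original file): when both the LRU
 * lock and the page cache lock are needed, they must be taken in the
 * order documented above -- pagemap_lru_lock first, then pagecache_lock --
 * exactly as invalidate_inode_pages() below does. The function name is
 * hypothetical.
 */
#if 0
static void example_lock_both(void)
{
	spin_lock(&pagemap_lru_lock);
	spin_lock(&pagecache_lock);
	/* ... walk or modify both the LRU and the page cache here ... */
	spin_unlock(&pagecache_lock);
	spin_unlock(&pagemap_lru_lock);
}
#endif
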
67 #define CLUSTER_PAGES (1 << page_cluster)
68 #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster)
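
/*
 * Worked example (assuming page_cluster == 4 and 4K pages): CLUSTER_PAGES
 * is 1 << 4 = 16 pages, i.e. a 64K cluster, and CLUSTER_OFFSET(35) is
 * (35 >> 4) << 4 = 32, the index of the first page of the cluster that
 * contains page 35.
 */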
69
70 static void FASTCALL(add_page_to_hash_queue(struct page * page, struct page **p));
71 static void fastcall add_page_to_hash_queue(struct page * page, struct page **p)
72 {
73 struct page *next = *p;
74
75 *p = page;
76 page->next_hash = next;
77 page->pprev_hash = p;
78 if (next)
79 next->pprev_hash = &page->next_hash;
80 if (page->buffers)
81 PAGE_BUG(page);
82 inc_nr_cache_pages(page);
83 }
84
85 static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
86 {
87 struct list_head *head = &mapping->clean_pages;
88
89 mapping->nrpages++;
90 list_add(&page->list, head);
91 page->mapping = mapping;
92 }
93
94 static inline void remove_page_from_inode_queue(struct page * page)
95 {
96 struct address_space * mapping = page->mapping;
97
98 if (mapping->a_ops->removepage)
99 mapping->a_ops->removepage(page);
100
101 list_del(&page->list);
102 page->mapping = NULL;
103 wmb();
104 mapping->nrpages--;
105 if (!mapping->nrpages)
106 refile_inode(mapping->host);
107 }
108
109 static inline void remove_page_from_hash_queue(struct page * page)
110 {
111 struct page *next = page->next_hash;
112 struct page **pprev = page->pprev_hash;
113
114 if (next)
115 next->pprev_hash = pprev;
116 *pprev = next;
117 page->pprev_hash = NULL;
118 dec_nr_cache_pages(page);
119 }
120
121 /*
122 * Remove a page from the page cache and free it. Caller has to make
123 * sure the page is locked and that nobody else uses it - or that usage
124 * is safe.
125 */
126 void __remove_inode_page(struct page *page)
127 {
128 remove_page_from_inode_queue(page);
129 remove_page_from_hash_queue(page);
130 }
131
132 void remove_inode_page(struct page *page)
133 {
134 if (!PageLocked(page))
135 PAGE_BUG(page);
136
137 spin_lock(&pagecache_lock);
138 __remove_inode_page(page);
139 spin_unlock(&pagecache_lock);
140 }
141
142 static inline int sync_page(struct page *page)
143 {
144 struct address_space *mapping = page->mapping;
145
146 if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
147 return mapping->a_ops->sync_page(page);
148 return 0;
149 }
150
151 /*
152 * Add a page to the dirty page list.
153 */
154 void fastcall set_page_dirty(struct page *page)
155 {
156 if (!test_and_set_bit(PG_dirty, &page->flags)) {
157 struct address_space *mapping = page->mapping;
158
159 if (mapping) {
160 spin_lock(&pagecache_lock);
161 mapping = page->mapping;
162 if (mapping) { /* may have been truncated */
163 list_del(&page->list);
164 list_add(&page->list, &mapping->dirty_pages);
165 }
166 spin_unlock(&pagecache_lock);
167
168 if (mapping && mapping->host)
169 mark_inode_dirty_pages(mapping->host);
170 if (block_dump)
171 printk(KERN_DEBUG "%s: dirtied page\n", current->comm);
172 }
173 }
174 }
175
176 /**
177 * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
178 * @inode: the inode whose pages we want to invalidate
179 *
180 * This function only removes the unlocked pages; if you want to
181 * remove all the pages of one inode, you must call truncate_inode_pages.
182 */
183
184 void invalidate_inode_pages(struct inode * inode)
185 {
186 struct list_head *head, *curr;
187 struct page * page;
188
189 head = &inode->i_mapping->clean_pages;
190
191 spin_lock(&pagemap_lru_lock);
192 spin_lock(&pagecache_lock);
193 curr = head->next;
194
195 while (curr != head) {
196 page = list_entry(curr, struct page, list);
197 curr = curr->next;
198
199 /* We cannot invalidate a dirty page.. */
200 if (PageDirty(page))
201 continue;
202
203 /* ..or locked */
204 if (TryLockPage(page))
205 continue;
206
207 if (page->buffers && !try_to_free_buffers(page, 0))
208 goto unlock;
209
210 if (page_count(page) != 1)
211 goto unlock;
212
213 __lru_cache_del(page);
214 __remove_inode_page(page);
215 UnlockPage(page);
216 page_cache_release(page);
217 continue;
218 unlock:
219 UnlockPage(page);
220 continue;
221 }
222
223 spin_unlock(&pagecache_lock);
224 spin_unlock(&pagemap_lru_lock);
225 }
226
227 static int do_flushpage(struct page *page, unsigned long offset)
228 {
229 int (*flushpage) (struct page *, unsigned long);
230 flushpage = page->mapping->a_ops->flushpage;
231 if (flushpage)
232 return (*flushpage)(page, offset);
233 return block_flushpage(page, offset);
234 }
235
236 static inline void truncate_partial_page(struct page *page, unsigned partial)
237 {
238 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
239 if (page->buffers)
240 do_flushpage(page, partial);
241 }
242
243 static void truncate_complete_page(struct page *page)
244 {
245 /* Leave it on the LRU if it gets converted into anonymous buffers */
246 if (!page->buffers || do_flushpage(page, 0))
247 lru_cache_del(page);
248
249 /*
250 * We remove the page from the page cache _after_ we have
251 * destroyed all buffer-cache references to it. Otherwise some
252 * other process might think this inode page is not in the
253 * page cache and creates a buffer-cache alias to it causing
254 * all sorts of fun problems ...
255 */
256 ClearPageDirty(page);
257 ClearPageUptodate(page);
258 remove_inode_page(page);
259 page_cache_release(page);
260 }
261
262 static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *));
263 static int fastcall truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial)
264 {
265 struct list_head *curr;
266 struct page * page;
267 int unlocked = 0;
268
269 restart:
270 curr = head->prev;
271 while (curr != head) {
272 unsigned long offset;
273
274 page = list_entry(curr, struct page, list);
275 offset = page->index;
276
277 /* Is this one of the pages to truncate? */
278 if ((offset >= start) || (*partial && (offset + 1) == start)) {
279 int failed;
280
281 page_cache_get(page);
282 failed = TryLockPage(page);
283
284 list_del(head);
285 if (!failed)
286 /* Restart after this page */
287 list_add_tail(head, curr);
288 else
289 /* Restart on this page */
290 list_add(head, curr);
291
292 spin_unlock(&pagecache_lock);
293 unlocked = 1;
294
295 if (!failed) {
296 if (*partial && (offset + 1) == start) {
297 truncate_partial_page(page, *partial);
298 *partial = 0;
299 } else
300 truncate_complete_page(page);
301
302 UnlockPage(page);
303 } else
304 wait_on_page(page);
305
306 page_cache_release(page);
307
308 if (current->need_resched) {
309 __set_current_state(TASK_RUNNING);
310 schedule();
311 }
312
313 spin_lock(&pagecache_lock);
314 goto restart;
315 }
316 curr = curr->prev;
317 }
318 return unlocked;
319 }
320
321
322 /**
323 * truncate_inode_pages - truncate *all* the pages from an offset
324 * @mapping: mapping to truncate
325 * @lstart: offset from which to truncate
326 *
327 * Truncate the page cache at a set offset, removing the pages
328 * that are beyond that offset (and zeroing out partial pages).
329 * If any page is locked we wait for it to become unlocked.
330 */
331 void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
332 {
333 unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
334 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
335 int unlocked;
336
337 spin_lock(&pagecache_lock);
338 do {
339 unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial);
340 unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial);
341 unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial);
342 } while (unlocked);
343 /* Traversed all three lists without dropping the lock */
344 spin_unlock(&pagecache_lock);
345 }
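
/*
 * Worked example (assuming PAGE_CACHE_SIZE == 4096): truncating to
 * lstart == 6000 gives start = (6000 + 4095) >> 12 = 2 and
 * partial = 6000 & 4095 = 1904, so every page with index >= 2 is dropped
 * and page 1 is zeroed from byte 1904 up to the end of the page.
 */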
346
347 static inline int invalidate_this_page2(struct page * page,
348 struct list_head * curr,
349 struct list_head * head)
350 {
351 int unlocked = 1;
352
353 /*
354 * The page is locked and we hold the pagecache_lock as well
355 * so both page_count(page) and page->buffers stay constant here.
356 */
357 if (page_count(page) == 1 + !!page->buffers) {
358 /* Restart after this page */
359 list_del(head);
360 list_add_tail(head, curr);
361
362 page_cache_get(page);
363 spin_unlock(&pagecache_lock);
364 truncate_complete_page(page);
365 } else {
366 if (page->buffers) {
367 /* Restart after this page */
368 list_del(head);
369 list_add_tail(head, curr);
370
371 page_cache_get(page);
372 spin_unlock(&pagecache_lock);
373 block_invalidate_page(page);
374 } else
375 unlocked = 0;
376
377 ClearPageDirty(page);
378 ClearPageUptodate(page);
379 }
380
381 return unlocked;
382 }
383
384 static int FASTCALL(invalidate_list_pages2(struct list_head *));
385 static int fastcall invalidate_list_pages2(struct list_head *head)
386 {
387 struct list_head *curr;
388 struct page * page;
389 int unlocked = 0;
390
391 restart:
392 curr = head->prev;
393 while (curr != head) {
394 page = list_entry(curr, struct page, list);
395
396 if (!TryLockPage(page)) {
397 int __unlocked;
398
399 __unlocked = invalidate_this_page2(page, curr, head);
400 UnlockPage(page);
401 unlocked |= __unlocked;
402 if (!__unlocked) {
403 curr = curr->prev;
404 continue;
405 }
406 } else {
407 /* Restart on this page */
408 list_del(head);
409 list_add(head, curr);
410
411 page_cache_get(page);
412 spin_unlock(&pagecache_lock);
413 unlocked = 1;
414 wait_on_page(page);
415 }
416
417 page_cache_release(page);
418 if (current->need_resched) {
419 __set_current_state(TASK_RUNNING);
420 schedule();
421 }
422
423 spin_lock(&pagecache_lock);
424 goto restart;
425 }
426 return unlocked;
427 }
428
429 /**
430 * invalidate_inode_pages2 - Clear all the dirty bits around if it can't
431 * free the pages because they're mapped.
432 * @mapping: the address_space whose pages we want to invalidate
433 */
434 void invalidate_inode_pages2(struct address_space * mapping)
435 {
436 int unlocked;
437
438 spin_lock(&pagecache_lock);
439 do {
440 unlocked = invalidate_list_pages2(&mapping->clean_pages);
441 unlocked |= invalidate_list_pages2(&mapping->dirty_pages);
442 unlocked |= invalidate_list_pages2(&mapping->locked_pages);
443 } while (unlocked);
444 spin_unlock(&pagecache_lock);
445 }
446
447 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
448 {
449 goto inside;
450
451 for (;;) {
452 page = page->next_hash;
453 inside:
454 if (!page)
455 goto not_found;
456 if (page->mapping != mapping)
457 continue;
458 if (page->index == offset)
459 break;
460 }
461
462 not_found:
463 return page;
464 }
465
466 static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *))
467 {
468 struct list_head *curr;
469 struct page *page;
470 int retval = 0;
471
472 spin_lock(&pagecache_lock);
473 curr = head->next;
474 while (curr != head) {
475 page = list_entry(curr, struct page, list);
476 curr = curr->next;
477 if (!page->buffers)
478 continue;
479 if (page->index >= end)
480 continue;
481 if (page->index < start)
482 continue;
483
484 page_cache_get(page);
485 spin_unlock(&pagecache_lock);
486 lock_page(page);
487
488 /* The buffers could have been free'd while we waited for the page lock */
489 if (page->buffers)
490 retval |= fn(page);
491
492 UnlockPage(page);
493 spin_lock(&pagecache_lock);
494 curr = page->list.next;
495 page_cache_release(page);
496 }
497 spin_unlock(&pagecache_lock);
498
499 return retval;
500 }
501
502 /*
503 * Two-stage data sync: first start the IO, then go back and
504 * collect the information..
505 */
506 int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
507 {
508 int retval;
509
510 /* writeout dirty buffers on pages from both clean and dirty lists */
511 retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page);
512 retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page);
513 retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page);
514
515 /* now wait for locked buffers on pages from both clean and dirty lists */
516 retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page);
517 retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page);
518 retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page);
519
520 return retval;
521 }
522
523 /*
524 * In-memory filesystems have to fail their
525 * writepage function - and this has to be
526 * worked around in the VM layer..
527 *
528 * We
529 * - mark the page dirty again (but do NOT
530 * add it back to the inode dirty list, as
531 * that would livelock in fdatasync)
532 * - activate the page so that the page stealer
533 * doesn't try to write it out over and over
534 * again.
535 */
536 int fail_writepage(struct page *page)
537 {
538 /* Only activate on memory-pressure, not fsync.. */
539 if (PageLaunder(page)) {
540 activate_page(page);
541 SetPageReferenced(page);
542 }
543
544 /* Set the page dirty again, unlock */
545 SetPageDirty(page);
546 UnlockPage(page);
547 return 0;
548 }
549
550 EXPORT_SYMBOL(fail_writepage);
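
/*
 * Illustrative sketch (not part of the original file): an in-memory
 * filesystem typically points its writepage method at fail_writepage(),
 * leaving the other operations to its own handlers. The "example_*"
 * names below are hypothetical and only show the pattern.
 */
#if 0
static struct address_space_operations example_in_memory_aops = {
	readpage:	example_readpage,	/* filesystem-specific read */
	writepage:	fail_writepage,		/* pages can never be written back */
};
#endif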
551
552 /**
553 * filemap_fdatawrite - walk the list of dirty pages of the given address space
554 * and writepage() each unlocked page (does not wait on locked pages).
555 *
556 * @mapping: address space structure to write
557 *
558 */
559 int filemap_fdatawrite(struct address_space * mapping)
560 {
561 int ret = 0;
562 int (*writepage)(struct page *) = mapping->a_ops->writepage;
563
564 spin_lock(&pagecache_lock);
565
566 while (!list_empty(&mapping->dirty_pages)) {
567 struct page *page = list_entry(mapping->dirty_pages.prev, struct page, list);
568
569 list_del(&page->list);
570 list_add(&page->list, &mapping->locked_pages);
571
572 if (!PageDirty(page))
573 continue;
574
575 page_cache_get(page);
576 spin_unlock(&pagecache_lock);
577
578 if (!TryLockPage(page)) {
579 if (PageDirty(page)) {
580 int err;
581 ClearPageDirty(page);
582 err = writepage(page);
583 if (err && !ret)
584 ret = err;
585 } else
586 UnlockPage(page);
587 }
588 page_cache_release(page);
589 spin_lock(&pagecache_lock);
590 }
591 spin_unlock(&pagecache_lock);
592 return ret;
593 }
594
595 /**
596 * filemap_fdatasync - walk the list of dirty pages of the given address space
597 * and writepage() all of them.
598 *
599 * @mapping: address space structure to write
600 *
601 */
602 int filemap_fdatasync(struct address_space * mapping)
603 {
604 int ret = 0;
605 int (*writepage)(struct page *) = mapping->a_ops->writepage;
606
607 spin_lock(&pagecache_lock);
608
609 while (!list_empty(&mapping->dirty_pages)) {
610 struct page *page = list_entry(mapping->dirty_pages.prev, struct page, list);
611
612 list_del(&page->list);
613 list_add(&page->list, &mapping->locked_pages);
614
615 if (!PageDirty(page))
616 continue;
617
618 page_cache_get(page);
619 spin_unlock(&pagecache_lock);
620
621 lock_page(page);
622
623 if (PageDirty(page)) {
624 int err;
625 ClearPageDirty(page);
626 err = writepage(page);
627 if (err && !ret)
628 ret = err;
629 } else
630 UnlockPage(page);
631
632 page_cache_release(page);
633 spin_lock(&pagecache_lock);
634 }
635 spin_unlock(&pagecache_lock);
636 return ret;
637 }
638
639 /**
640 * filemap_fdatawait - walk the list of locked pages of the given address space
641 * and wait for all of them.
642 *
643 * @mapping: address space structure to wait for
644 *
645 */
646 int filemap_fdatawait(struct address_space * mapping)
647 {
648 int ret = 0;
649
650 spin_lock(&pagecache_lock);
651
652 while (!list_empty(&mapping->locked_pages)) {
653 struct page *page = list_entry(mapping->locked_pages.next, struct page, list);
654
655 list_del(&page->list);
656 list_add(&page->list, &mapping->clean_pages);
657
658 if (!PageLocked(page))
659 continue;
660
661 page_cache_get(page);
662 spin_unlock(&pagecache_lock);
663
664 ___wait_on_page(page);
665 if (PageError(page))
666 ret = -EIO;
667
668 page_cache_release(page);
669 spin_lock(&pagecache_lock);
670 }
671 spin_unlock(&pagecache_lock);
672 return ret;
673 }
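
/*
 * Illustrative sketch (not part of the original file): a caller that wants
 * all dirty pagecache data of a mapping on disk chains the two helpers
 * above, much as generic_file_direct_IO() does further down in this file
 * (which also flushes the inode's data buffers in between). The function
 * name is hypothetical.
 */
#if 0
static int example_flush_mapping(struct address_space *mapping)
{
	int err = filemap_fdatasync(mapping);	/* start writepage() on dirty pages */

	if (err == 0)
		err = filemap_fdatawait(mapping);	/* wait for the writes to complete */
	return err;
}
#endif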
674
675 /*
676 * Add a page to the inode page cache.
677 *
678 * The caller must have locked the page and
679 * set all the page flags correctly..
680 */
681 void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
682 {
683 if (!PageLocked(page))
684 BUG();
685
686 page->index = index;
687 page_cache_get(page);
688 spin_lock(&pagecache_lock);
689 add_page_to_inode_queue(mapping, page);
690 add_page_to_hash_queue(page, page_hash(mapping, index));
691 spin_unlock(&pagecache_lock);
692
693 lru_cache_add(page);
694 }
695
696 /*
697 * This adds a page to the page cache, starting out as locked,
698 * owned by us, but unreferenced, not uptodate and with no errors.
699 */
700 static inline void __add_to_page_cache(struct page * page,
701 struct address_space *mapping, unsigned long offset,
702 struct page **hash)
703 {
704 /*
705 * Yes this is inefficient, however it is needed. The problem
706 * is that we could be adding a page to the swap cache while
707 * another CPU is also modifying page->flags, so the updates
708 * really do need to be atomic. -- Rik
709 */
710 ClearPageUptodate(page);
711 ClearPageError(page);
712 ClearPageDirty(page);
713 ClearPageReferenced(page);
714 ClearPageArch1(page);
715 ClearPageChecked(page);
716 LockPage(page);
717 page_cache_get(page);
718 page->index = offset;
719 add_page_to_inode_queue(mapping, page);
720 add_page_to_hash_queue(page, hash);
721 }
722
723 void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
724 {
725 spin_lock(&pagecache_lock);
726 __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
727 spin_unlock(&pagecache_lock);
728 lru_cache_add(page);
729 }
730
731 int add_to_page_cache_unique(struct page * page,
732 struct address_space *mapping, unsigned long offset,
733 struct page **hash)
734 {
735 int err;
736 struct page *alias;
737
738 spin_lock(&pagecache_lock);
739 alias = __find_page_nolock(mapping, offset, *hash);
740
741 err = 1;
742 if (!alias) {
743 __add_to_page_cache(page,mapping,offset,hash);
744 err = 0;
745 }
746
747 spin_unlock(&pagecache_lock);
748 if (!err)
749 lru_cache_add(page);
750 return err;
751 }
752
753 /*
754 * This adds the requested page to the page cache if it isn't already there,
755 * and schedules an I/O to read in its contents from disk.
756 */
757 static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
758 static int fastcall page_cache_read(struct file * file, unsigned long offset)
759 {
760 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
761 struct page **hash = page_hash(mapping, offset);
762 struct page *page;
763
764 spin_lock(&pagecache_lock);
765 page = __find_page_nolock(mapping, offset, *hash);
766 spin_unlock(&pagecache_lock);
767 if (page)
768 return 0;
769
770 page = page_cache_alloc(mapping);
771 if (!page)
772 return -ENOMEM;
773
774 if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
775 int error = mapping->a_ops->readpage(file, page);
776 page_cache_release(page);
777 return error;
778 }
779 /*
780 * We arrive here in the unlikely event that someone
781 * raced with us and added our page to the cache first.
782 */
783 page_cache_release(page);
784 return 0;
785 }
786
787 /*
788 * Read in an entire cluster at once. A cluster is usually a 64k-
789 * aligned block that includes the page requested in "offset."
790 */
791 static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset,
792 unsigned long filesize));
793 static int fastcall read_cluster_nonblocking(struct file * file, unsigned long offset,
794 unsigned long filesize)
795 {
796 unsigned long pages = CLUSTER_PAGES;
797
798 offset = CLUSTER_OFFSET(offset);
799 while ((pages-- > 0) && (offset < filesize)) {
800 int error = page_cache_read(file, offset);
801 if (error < 0)
802 return error;
803 offset ++;
804 }
805
806 return 0;
807 }
808
809 /*
810 * Knuth recommends primes in approximately golden ratio to the maximum
811 * integer representable by a machine word for multiplicative hashing.
812 * Chuck Lever verified the effectiveness of this technique:
813 * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
814 *
815 * These primes are chosen to be bit-sparse, that is operations on
816 * them can use shifts and additions instead of multiplications for
817 * machines where multiplications are slow.
818 */
819 #if BITS_PER_LONG == 32
820 /* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
821 #define GOLDEN_RATIO_PRIME 0x9e370001UL
822 #elif BITS_PER_LONG == 64
823 /* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
824 #define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
825 #else
826 #error Define GOLDEN_RATIO_PRIME for your wordsize.
827 #endif
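
/*
 * Illustrative sketch (not part of the original file): the classic
 * multiplicative hash built on this prime. "table_bits" is a hypothetical
 * parameter naming log2 of the table size; page_waitqueue() below applies
 * the same idea with the shift taken from the zone's wait table.
 */
#if 0
static inline unsigned long example_golden_hash(unsigned long key, unsigned int table_bits)
{
	/* keep the top table_bits bits of the multiplied key */
	return (key * GOLDEN_RATIO_PRIME) >> (BITS_PER_LONG - table_bits);
}
#endif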
828
829 /*
830 * In order to wait for pages to become available there must be
831 * waitqueues associated with pages. By using a hash table of
832 * waitqueues where the bucket discipline is to maintain all
833 * waiters on the same queue and wake all when any of the pages
834 * become available, and for the woken contexts to check to be
835 * sure the appropriate page became available, this saves space
836 * at a cost of "thundering herd" phenomena during rare hash
837 * collisions.
838 */
839 static inline wait_queue_head_t *page_waitqueue(struct page *page)
840 {
841 const zone_t *zone = page_zone(page);
842 wait_queue_head_t *wait = zone->wait_table;
843 unsigned long hash = (unsigned long)page;
844
845 #if BITS_PER_LONG == 64
846 /* Sigh, gcc can't optimise this alone like it does for 32 bits. */
847 unsigned long n = hash;
848 n <<= 18;
849 hash -= n;
850 n <<= 33;
851 hash -= n;
852 n <<= 3;
853 hash += n;
854 n <<= 3;
855 hash -= n;
856 n <<= 4;
857 hash += n;
858 n <<= 2;
859 hash += n;
860 #else
861 /* On some cpus multiply is faster, on others gcc will do shifts */
862 hash *= GOLDEN_RATIO_PRIME;
863 #endif
864 hash >>= zone->wait_table_shift;
865
866 return &wait[hash];
867 }
868
869 /*
870 * This must be called after every submit_bh with end_io
871 * callbacks that would result in the blkdev layer waking
872 * up the page after a queue unplug.
873 */
874 void fastcall wakeup_page_waiters(struct page * page)
875 {
876 wait_queue_head_t * head;
877
878 head = page_waitqueue(page);
879 if (waitqueue_active(head))
880 wake_up(head);
881 }
882
883 /*
884 * Wait for a page to get unlocked.
885 *
886 * This must be called with the caller "holding" the page,
887 * ie with increased "page->count" so that the page won't
888 * go away during the wait..
889 *
890 * The waiting strategy is to get on a waitqueue determined
891 * by hashing. Waiters will then collide, and the newly woken
892 * task must then determine whether it was woken for the page
893 * it really wanted, and go back to sleep on the waitqueue if
894 * that wasn't it. With the waitqueue semantics, it never leaves
895 * the waitqueue until it explicitly removes itself, so the loop
896 * moves forward one iteration every time there is
897 * (1) a collision
898 * and
899 * (2) one of the colliding pages is woken
900 *
901 * This is the thundering herd problem, but it is expected to
902 * be very rare due to the few pages that are actually being
903 * waited on at any given time and the quality of the hash function.
904 */
905 void ___wait_on_page(struct page *page)
906 {
907 wait_queue_head_t *waitqueue = page_waitqueue(page);
908 struct task_struct *tsk = current;
909 DECLARE_WAITQUEUE(wait, tsk);
910
911 add_wait_queue(waitqueue, &wait);
912 do {
913 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
914 if (!PageLocked(page))
915 break;
916 sync_page(page);
917 schedule();
918 } while (PageLocked(page));
919 __set_task_state(tsk, TASK_RUNNING);
920 remove_wait_queue(waitqueue, &wait);
921 }
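
/*
 * Illustrative sketch (not part of the original file): as the comment
 * above requires, a waiter must hold a reference on the page across the
 * wait, as filemap_fdatawait() does. The function name is hypothetical.
 */
#if 0
static void example_wait_for_page(struct page *page)
{
	page_cache_get(page);		/* keep the page from being freed under us */
	wait_on_page(page);		/* sleeps only while the page is locked */
	page_cache_release(page);
}
#endif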
922
923 /*
924 * unlock_page() is the other half of the story just above
925 * __wait_on_page(). Here a couple of quick checks are done
926 * and a couple of flags are set on the page, and then all
927 * of the waiters for all of the pages in the appropriate
928 * wait queue are woken.
929 */
930 void fastcall unlock_page(struct page *page)
931 {
932 wait_queue_head_t *waitqueue = page_waitqueue(page);
933 ClearPageLaunder(page);
934 smp_mb__before_clear_bit();
935 if (!test_and_clear_bit(PG_locked, &(page)->flags))
936 BUG();
937 smp_mb__after_clear_bit();
938
939 /*
940 * Although the default semantics of wake_up() are
941 * to wake all, here the specific function is used
942 * to make it even more explicit that a number of
943 * pages are being waited on.
944 */
945 if (waitqueue_active(waitqueue))
946 wake_up_all(waitqueue);
947 }
948
949 /*
950 * Get a lock on the page, assuming we need to sleep
951 * to get it..
952 */
953 static void __lock_page(struct page *page)
954 {
955 wait_queue_head_t *waitqueue = page_waitqueue(page);
956 struct task_struct *tsk = current;
957 DECLARE_WAITQUEUE(wait, tsk);
958
959 add_wait_queue_exclusive(waitqueue, &wait);
960 for (;;) {
961 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
962 if (PageLocked(page)) {
963 sync_page(page);
964 schedule();
965 }
966 if (!TryLockPage(page))
967 break;
968 }
969 __set_task_state(tsk, TASK_RUNNING);
970 remove_wait_queue(waitqueue, &wait);
971 }
972
973 /*
974 * Get an exclusive lock on the page, optimistically
975 * assuming it's not locked..
976 */
977 void fastcall lock_page(struct page *page)
978 {
979 if (TryLockPage(page))
980 __lock_page(page);
981 }
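
/*
 * Note on usage (added for clarity): because lock_page() may sleep, the
 * page can be truncated or reused before the lock is obtained, so callers
 * re-check page->mapping (and page->index) after locking, as
 * __find_lock_page_helper() and do_generic_file_read() below both do.
 */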
982
983 /*
984 * a rather lightweight function, finding and getting a reference to a
985 * hashed page atomically.
986 */
987 struct page * __find_get_page(struct address_space *mapping,
988 unsigned long offset, struct page **hash)
989 {
990 struct page *page;
991
992 /*
993 * We scan the hash list read-only. Addition to and removal from
994 * the hash-list needs a held write-lock.
995 */
996 spin_lock(&pagecache_lock);
997 page = __find_page_nolock(mapping, offset, *hash);
998 if (page)
999 page_cache_get(page);
1000 spin_unlock(&pagecache_lock);
1001 return page;
1002 }
1003
1004 /*
1005 * Same as above, but trylock it instead of incrementing the count.
1006 */
1007 struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
1008 {
1009 struct page *page;
1010 struct page **hash = page_hash(mapping, offset);
1011
1012 spin_lock(&pagecache_lock);
1013 page = __find_page_nolock(mapping, offset, *hash);
1014 if (page) {
1015 if (TryLockPage(page))
1016 page = NULL;
1017 }
1018 spin_unlock(&pagecache_lock);
1019 return page;
1020 }
1021
1022 /*
1023 * Must be called with the pagecache lock held,
1024 * will return with it held (but it may be dropped
1025 * during blocking operations).
1026 */
1027 static struct page * FASTCALL(__find_lock_page_helper(struct address_space *, unsigned long, struct page *));
1028 static struct page * fastcall __find_lock_page_helper(struct address_space *mapping,
1029 unsigned long offset, struct page *hash)
1030 {
1031 struct page *page;
1032
1033 /*
1034 * We scan the hash list read-only. Addition to and removal from
1035 * the hash-list needs a held write-lock.
1036 */
1037 repeat:
1038 page = __find_page_nolock(mapping, offset, hash);
1039 if (page) {
1040 page_cache_get(page);
1041 if (TryLockPage(page)) {
1042 spin_unlock(&pagecache_lock);
1043 lock_page(page);
1044 spin_lock(&pagecache_lock);
1045
1046 /* Has the page been re-allocated while we slept? */
1047 if (page->mapping != mapping || page->index != offset) {
1048 UnlockPage(page);
1049 page_cache_release(page);
1050 goto repeat;
1051 }
1052 }
1053 }
1054 return page;
1055 }
1056
1057 /*
1058 * Same as the above, but lock the page too, verifying that
1059 * it's still valid once we own it.
1060 */
1061 struct page * __find_lock_page(struct address_space *mapping,
1062 unsigned long offset, struct page **hash)
1063 {
1064 struct page *page;
1065
1066 spin_lock(&pagecache_lock);
1067 page = __find_lock_page_helper(mapping, offset, *hash);
1068 spin_unlock(&pagecache_lock);
1069 return page;
1070 }
1071
1072 /*
1073 * Same as above, but create the page if required..
1074 */
1075 struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask)
1076 {
1077 struct page *page;
1078 struct page **hash = page_hash(mapping, index);
1079
1080 spin_lock(&pagecache_lock);
1081 page = __find_lock_page_helper(mapping, index, *hash);
1082 spin_unlock(&pagecache_lock);
1083 if (!page) {
1084 struct page *newpage = alloc_page(gfp_mask);
1085 if (newpage) {
1086 spin_lock(&pagecache_lock);
1087 page = __find_lock_page_helper(mapping, index, *hash);
1088 if (likely(!page)) {
1089 page = newpage;
1090 __add_to_page_cache(page, mapping, index, hash);
1091 newpage = NULL;
1092 }
1093 spin_unlock(&pagecache_lock);
1094 if (newpage == NULL)
1095 lru_cache_add(page);
1096 else
1097 page_cache_release(newpage);
1098 }
1099 }
1100 return page;
1101 }
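
/*
 * Illustrative sketch (not part of the original file): typical use of
 * find_or_create_page() to obtain a locked, referenced page at an index.
 * The function name is hypothetical and the gfp mask is assumed to be the
 * mapping's own.
 */
#if 0
static struct page *example_grab_page(struct address_space *mapping, unsigned long index)
{
	struct page *page = find_or_create_page(mapping, index, mapping->gfp_mask);

	if (!page)
		return NULL;			/* allocation failed */
	/*
	 * The page comes back locked and with an extra reference; the
	 * caller must UnlockPage() and page_cache_release() it when done.
	 */
	return page;
}
#endif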
1102
1103 /*
1104 * Same as grab_cache_page, but do not wait if the page is unavailable.
1105 * This is intended for speculative data generators, where the data can
1106 * be regenerated if the page couldn't be grabbed. This routine should
1107 * be safe to call while holding the lock for another page.
1108 */
1109 struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
1110 {
1111 struct page *page, **hash;
1112
1113 hash = page_hash(mapping, index);
1114 page = __find_get_page(mapping, index, hash);
1115
1116 if ( page ) {
1117 if ( !TryLockPage(page) ) {
1118 /* Page found and locked */
1119 /* This test is overly paranoid, but what the heck... */
1120 if ( unlikely(page->mapping != mapping || page->index != index) ) {
1121 /* Someone reallocated this page under us. */
1122 UnlockPage(page);
1123 page_cache_release(page);
1124 return NULL;
1125 } else {
1126 return page;
1127 }
1128 } else {
1129 /* Page locked by someone else */
1130 page_cache_release(page);
1131 return NULL;
1132 }
1133 }
1134
1135 page = page_cache_alloc(mapping);
1136 if ( unlikely(!page) )
1137 return NULL; /* Failed to allocate a page */
1138
1139 if ( unlikely(add_to_page_cache_unique(page, mapping, index, hash)) ) {
1140 /* Someone else grabbed the page already. */
1141 page_cache_release(page);
1142 return NULL;
1143 }
1144
1145 return page;
1146 }
1147
1148 #if 0
1149 #define PROFILE_READAHEAD
1150 #define DEBUG_READAHEAD
1151 #endif
1152
1153 /*
1154 * Read-ahead profiling information
1155 * --------------------------------
1156 * Every PROFILE_MAXREADCOUNT, the following information is written
1157 * to the syslog:
1158 * Percentage of asynchronous read-ahead.
1159 * Average values of the read-ahead context fields.
1160 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
1161 * to the syslog.
1162 */
1163
1164 #ifdef PROFILE_READAHEAD
1165
1166 #define PROFILE_MAXREADCOUNT 1000
1167
1168 static unsigned long total_reada;
1169 static unsigned long total_async;
1170 static unsigned long total_ramax;
1171 static unsigned long total_ralen;
1172 static unsigned long total_rawin;
1173
1174 static void profile_readahead(int async, struct file *filp)
1175 {
1176 unsigned long flags;
1177
1178 ++total_reada;
1179 if (async)
1180 ++total_async;
1181
1182 total_ramax += filp->f_ramax;
1183 total_ralen += filp->f_ralen;
1184 total_rawin += filp->f_rawin;
1185
1186 if (total_reada > PROFILE_MAXREADCOUNT) {
1187 save_flags(flags);
1188 cli();
1189 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
1190 restore_flags(flags);
1191 return;
1192 }
1193
1194 printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
1195 total_ramax/total_reada,
1196 total_ralen/total_reada,
1197 total_rawin/total_reada,
1198 (total_async*100)/total_reada);
1199 #ifdef DEBUG_READAHEAD
1200 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
1201 filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
1202 #endif
1203
1204 total_reada = 0;
1205 total_async = 0;
1206 total_ramax = 0;
1207 total_ralen = 0;
1208 total_rawin = 0;
1209
1210 restore_flags(flags);
1211 }
1212 }
1213 #endif /* defined PROFILE_READAHEAD */
1214
1215 /*
1216 * Read-ahead context:
1217 * -------------------
1218 * The read ahead context fields of the "struct file" are the following:
1219 * - f_raend : position of the first byte after the last page we tried to
1220 * read ahead.
1221 * - f_ramax : current read-ahead maximum size.
1222 * - f_ralen : length of the current IO read block we tried to read-ahead.
1223 * - f_rawin : length of the current read-ahead window.
1224 * if last read-ahead was synchronous then
1225 * f_rawin = f_ralen
1226 * otherwise (was asynchronous)
1227 * f_rawin = previous value of f_ralen + f_ralen
1228 *
1229 * Read-ahead limits:
1230 * ------------------
1231 * MIN_READAHEAD : minimum read-ahead size when reading ahead.
1232 * MAX_READAHEAD : maximum read-ahead size when reading ahead.
1233 *
1234 * Synchronous read-ahead benefits:
1235 * --------------------------------
1236 * Using a reasonable IO transfer length from peripheral devices increases system
1237 * performance.
1238 * Reasonable means, in this context, not too large but not too small.
1239 * The actual maximum value is:
1240 * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
1241 * and 32K if defined (4K page size assumed).
1242 *
1243 * Asynchronous read-ahead benefits:
1244 * ---------------------------------
1245 * Overlapping the next read request with user process execution increases
1246 * system performance.
1247 *
1248 * Read-ahead risks:
1249 * -----------------
1250 * We have to guess which further data are needed by the user process.
1251 * If these data are often not really needed, it's bad for system
1252 * performance.
1253 * However, we know that files are often accessed sequentially by
1254 * application programs and it seems that it is possible to have some good
1255 * strategy in that guessing.
1256 * We only try to read-ahead files that seem to be read sequentially.
1257 *
1258 * Asynchronous read-ahead risks:
1259 * ------------------------------
1260 * In order to maximize overlapping, we must start some asynchronous read
1261 * request from the device, as soon as possible.
1262 * We must be very careful about:
1263 * - The number of effective pending IO read requests.
1264 * ONE seems to be the only reasonable value.
1265 * - The total memory pool usage for the file access stream.
1266 * This maximum memory usage is implicitly 2 IO read chunks:
1267 * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
1268 * 64k if defined (4K page size assumed).
1269 */
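
/*
 * Numeric example of the window bookkeeping above: if the previous
 * read-ahead pass set f_ralen = 4 pages and the current asynchronous pass
 * reads 8 more, then f_ralen becomes 8 and f_rawin = 4 + 8 = 12 pages,
 * the window used to decide whether a later access is still sequential
 * enough to keep reading ahead.
 */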
1270
1271 static inline int get_max_readahead(struct inode * inode)
1272 {
1273 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
1274 return vm_max_readahead;
1275 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
1276 }
1277
1278 static void generic_file_readahead(int reada_ok,
1279 struct file * filp, struct inode * inode,
1280 struct page * page)
1281 {
1282 unsigned long end_index;
1283 unsigned long index = page->index;
1284 unsigned long max_ahead, ahead;
1285 unsigned long raend;
1286 int max_readahead = get_max_readahead(inode);
1287
1288 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1289
1290 raend = filp->f_raend;
1291 max_ahead = 0;
1292
1293 /*
1294 * The current page is locked.
1295 * If the current position is inside the previous read IO request, do not
1296 * try to reread previously read ahead pages.
1297 * Otherwise decide whether or not to read ahead some pages synchronously.
1298 * If we are not going to read ahead, set the read ahead context for this
1299 * page only.
1300 */
1301 if (PageLocked(page)) {
1302 if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) {
1303 raend = index;
1304 if (raend < end_index)
1305 max_ahead = filp->f_ramax;
1306 filp->f_rawin = 0;
1307 filp->f_ralen = 1;
1308 if (!max_ahead) {
1309 filp->f_raend = index + filp->f_ralen;
1310 filp->f_rawin += filp->f_ralen;
1311 }
1312 }
1313 }
1314 /*
1315 * The current page is not locked.
1316 * If we were reading ahead and,
1317 * if the current max read ahead size is not zero and,
1318 * if the current position is inside the last read-ahead IO request,
1319 * it is the moment to try to read ahead asynchronously.
1320 * We will later force an unplug of the device in order to force asynchronous read IO.
1321 */
1322 else if (reada_ok && filp->f_ramax && raend >= 1 &&
1323 index <= raend && index + filp->f_ralen >= raend) {
1324 /*
1325 * Add ONE page to max_ahead in order to try to have about the same IO max size
1326 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
1327 * Compute the position of the last page we have tried to read in order to
1328 * begin to read ahead just at the next page.
1329 */
1330 raend -= 1;
1331 if (raend < end_index)
1332 max_ahead = filp->f_ramax + 1;
1333
1334 if (max_ahead) {
1335 filp->f_rawin = filp->f_ralen;
1336 filp->f_ralen = 0;
1337 reada_ok = 2;
1338 }
1339 }
1340 /*
1341 * Try to read ahead pages.
1342 * We hope that ll_rw_blk() plug/unplug, coalescence, request sorting and the
1343 * scheduler will work well enough for us to avoid too many bad actual IO requests.
1344 */
1345 ahead = 0;
1346 while (ahead < max_ahead) {
1347 unsigned long ra_index = raend + ahead + 1;
1348
1349 if (ra_index >= end_index)
1350 break;
1351 if (page_cache_read(filp, ra_index) < 0)
1352 break;
1353
1354 ahead++;
1355 }
1356 /*
1357 * If we tried to read ahead some pages,
1358 * If we tried to read ahead asynchronously,
1359 * Try to force unplug of the device in order to start an asynchronous
1360 * read IO request.
1361 * Update the read-ahead context.
1362 * Store the length of the current read-ahead window.
1363 * Double the current max read ahead size.
1364 * That heuristic avoids doing large IO for files that are not really
1365 * accessed sequentially.
1366 */
1367 if (ahead) {
1368 filp->f_ralen += ahead;
1369 filp->f_rawin += filp->f_ralen;
1370 filp->f_raend = raend + ahead + 1;
1371
1372 filp->f_ramax += filp->f_ramax;
1373
1374 if (filp->f_ramax > max_readahead)
1375 filp->f_ramax = max_readahead;
1376
1377 #ifdef PROFILE_READAHEAD
1378 profile_readahead((reada_ok == 2), filp);
1379 #endif
1380 }
1381
1382 return;
1383 }
1384
1385 /*
1386 * Mark a page as having seen activity.
1387 *
1388 * If it was already so marked, move it to the active queue and drop
1389 * the referenced bit. Otherwise, just mark it for future action..
1390 */
1391 void fastcall mark_page_accessed(struct page *page)
1392 {
1393 if (!PageActive(page) && PageReferenced(page)) {
1394 activate_page(page);
1395 ClearPageReferenced(page);
1396 } else
1397 SetPageReferenced(page);
1398 }
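
/*
 * Worked example: the first call on an inactive page only sets
 * PG_referenced; a second call before that bit is cleared moves the page
 * to the active list and clears the bit again, so a page needs two
 * separate accesses to be promoted.
 */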
1399
1400 /*
1401 * This is a generic file read routine, and uses the
1402 * inode->i_op->readpage() function for the actual low-level
1403 * stuff.
1404 *
1405 * This is really ugly. But the goto's actually try to clarify some
1406 * of the logic when it comes to error handling etc.
1407 */
1408 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
1409 {
1410 struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
1411 struct inode *inode = mapping->host;
1412 unsigned long index, offset;
1413 struct page *cached_page;
1414 int reada_ok;
1415 int error;
1416 int max_readahead = get_max_readahead(inode);
1417
1418 cached_page = NULL;
1419 index = *ppos >> PAGE_CACHE_SHIFT;
1420 offset = *ppos & ~PAGE_CACHE_MASK;
1421
1422 /*
1423 * If the current position is outside the previous read-ahead window,
1424 * we reset the current read-ahead context and set read ahead max to zero
1425 * (will be set to just needed value later),
1426 * otherwise, we assume that the file accesses are sequential enough to
1427 * continue read-ahead.
1428 */
1429 if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
1430 reada_ok = 0;
1431 filp->f_raend = 0;
1432 filp->f_ralen = 0;
1433 filp->f_ramax = 0;
1434 filp->f_rawin = 0;
1435 } else {
1436 reada_ok = 1;
1437 }
1438 /*
1439 * Adjust the current value of read-ahead max.
1440 * If the read operation stays within the first half page, force no readahead.
1441 * Otherwise try to increase read ahead max just enough to do the read request.
1442 * Then, at least MIN_READAHEAD if read ahead is ok,
1443 * and at most MAX_READAHEAD in all cases.
1444 */
1445 if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
1446 filp->f_ramax = 0;
1447 } else {
1448 unsigned long needed;
1449
1450 needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
1451
1452 if (filp->f_ramax < needed)
1453 filp->f_ramax = needed;
1454
1455 if (reada_ok && filp->f_ramax < vm_min_readahead)
1456 filp->f_ramax = vm_min_readahead;
1457 if (filp->f_ramax > max_readahead)
1458 filp->f_ramax = max_readahead;
1459 }
1460
1461 for (;;) {
1462 struct page *page, **hash;
1463 unsigned long end_index, nr, ret;
1464
1465 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1466
1467 if (index > end_index)
1468 break;
1469 nr = PAGE_CACHE_SIZE;
1470 if (index == end_index) {
1471 nr = inode->i_size & ~PAGE_CACHE_MASK;
1472 if (nr <= offset)
1473 break;
1474 }
1475
1476 nr = nr - offset;
1477
1478 /*
1479 * Try to find the data in the page cache..
1480 */
1481 hash = page_hash(mapping, index);
1482
1483 spin_lock(&pagecache_lock);
1484 page = __find_page_nolock(mapping, index, *hash);
1485 if (!page)
1486 goto no_cached_page;
1487 found_page:
1488 page_cache_get(page);
1489 spin_unlock(&pagecache_lock);
1490
1491 if (!Page_Uptodate(page))
1492 goto page_not_up_to_date;
1493 generic_file_readahead(reada_ok, filp, inode, page);
1494 page_ok:
1495 /* If users can be writing to this page using arbitrary
1496 * virtual addresses, take care about potential aliasing
1497 * before reading the page on the kernel side.
1498 */
1499 if (mapping->i_mmap_shared != NULL)
1500 flush_dcache_page(page);
1501
1502 /*
1503 * Mark the page accessed if we read the
1504 * beginning or we just did an lseek.
1505 */
1506 if (!offset || !filp->f_reada)
1507 mark_page_accessed(page);
1508
1509 /*
1510 * Ok, we have the page, and it's up-to-date, so
1511 * now we can copy it to user space...
1512 *
1513 * The actor routine returns how many bytes were actually used..
1514 * NOTE! This may not be the same as how much of a user buffer
1515 * we filled up (we may be padding etc), so we can only update
1516 * "pos" here (the actor routine has to update the user buffer
1517 * pointers and the remaining count).
1518 */
1519 ret = actor(desc, page, offset, nr);
1520 offset += ret;
1521 index += offset >> PAGE_CACHE_SHIFT;
1522 offset &= ~PAGE_CACHE_MASK;
1523
1524 page_cache_release(page);
1525 if (ret == nr && desc->count)
1526 continue;
1527 break;
1528
1529 /*
1530 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1531 */
1532 page_not_up_to_date:
1533 generic_file_readahead(reada_ok, filp, inode, page);
1534
1535 if (Page_Uptodate(page))
1536 goto page_ok;
1537
1538 /* Get exclusive access to the page ... */
1539 lock_page(page);
1540
1541 /* Did it get unhashed before we got the lock? */
1542 if (!page->mapping) {
1543 UnlockPage(page);
1544 page_cache_release(page);
1545 continue;
1546 }
1547
1548 /* Did somebody else fill it already? */
1549 if (Page_Uptodate(page)) {
1550 UnlockPage(page);
1551 goto page_ok;
1552 }
1553
1554 readpage:
1555 /* ... and start the actual read. The read will unlock the page. */
1556 error = mapping->a_ops->readpage(filp, page);
1557
1558 if (!error) {
1559 if (Page_Uptodate(page))
1560 goto page_ok;
1561
1562 /* Again, try some read-ahead while waiting for the page to finish.. */
1563 generic_file_readahead(reada_ok, filp, inode, page);
1564 wait_on_page(page);
1565 if (Page_Uptodate(page))
1566 goto page_ok;
1567 error = -EIO;
1568 }
1569
1570 /* UHHUH! A synchronous read error occurred. Report it */
1571 desc->error = error;
1572 page_cache_release(page);
1573 break;
1574
1575 no_cached_page:
1576 /*
1577 * Ok, it wasn't cached, so we need to create a new
1578 * page..
1579 *
1580 * We get here with the page cache lock held.
1581 */
1582 if (!cached_page) {
1583 spin_unlock(&pagecache_lock);
1584 cached_page = page_cache_alloc(mapping);
1585 if (!cached_page) {
1586 desc->error = -ENOMEM;
1587 break;
1588 }
1589
1590 /*
1591 * Somebody may have added the page while we
1592 * dropped the page cache lock. Check for that.
1593 */
1594 spin_lock(&pagecache_lock);
1595 page = __find_page_nolock(mapping, index, *hash);
1596 if (page)
1597 goto found_page;
1598 }
1599
1600 /*
1601 * Ok, add the new page to the hash-queues...
1602 */
1603 page = cached_page;
1604 __add_to_page_cache(page, mapping, index, hash);
1605 spin_unlock(&pagecache_lock);
1606 lru_cache_add(page);
1607 cached_page = NULL;
1608
1609 goto readpage;
1610 }
1611
1612 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1613 filp->f_reada = 1;
1614 if (cached_page)
1615 page_cache_release(cached_page);
1616 UPDATE_ATIME(inode);
1617 }
1618
1619 static inline int have_mapping_directIO(struct address_space * mapping)
1620 {
1621 return mapping->a_ops->direct_IO || mapping->a_ops->direct_fileIO;
1622 }
1623
1624 /* Switch between old and new directIO formats */
1625 static inline int do_call_directIO(int rw, struct file *filp, struct kiobuf *iobuf, unsigned long offset, int blocksize)
1626 {
1627 struct address_space * mapping = filp->f_dentry->d_inode->i_mapping;
1628
1629 if (mapping->a_ops->direct_fileIO)
1630 return mapping->a_ops->direct_fileIO(rw, filp, iobuf, offset, blocksize);
1631 return mapping->a_ops->direct_IO(rw, mapping->host, iobuf, offset, blocksize);
1632 }
1633
1634 /*
1635 * i_sem and i_alloc_sem should be held already. i_sem may be dropped
1636 * later once we've mapped the new IO. i_alloc_sem is kept until the IO
1637 * completes.
1638 */
1639
1640 static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
1641 {
1642 ssize_t retval, progress;
1643 int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits;
1644 ssize_t iosize;
1645 struct kiobuf * iobuf;
1646 struct address_space * mapping = filp->f_dentry->d_inode->i_mapping;
1647 struct inode * inode = mapping->host;
1648 loff_t size = inode->i_size;
1649
1650 new_iobuf = 0;
1651 iobuf = filp->f_iobuf;
1652 if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
1653 /*
1654 * A parallel read/write is using the preallocated iobuf
1655 * so just run slow and allocate a new one.
1656 */
1657 retval = alloc_kiovec(1, &iobuf);
1658 if (retval)
1659 goto out;
1660 new_iobuf = 1;
1661 }
1662
1663 blocksize = 1 << inode->i_blkbits;
1664 blocksize_bits = inode->i_blkbits;
1665 blocksize_mask = blocksize - 1;
1666 chunk_size = KIO_MAX_ATOMIC_IO << 10;
1667
1668 retval = -EINVAL;
1669 if ((offset & blocksize_mask) || (count & blocksize_mask) || ((unsigned long) buf & blocksize_mask))
1670 goto out_free;
1671 if (!have_mapping_directIO(mapping))
1672 goto out_free;
1673
1674 if ((rw == READ) && (offset + count > size))
1675 count = size - offset;
1676
1677 /*
1678 * Flush to disk exclusively the _data_; metadata must remain
1679 * completely asynchronous or performance will go to /dev/null.
1680 */
1681 retval = filemap_fdatasync(mapping);
1682 if (retval == 0)
1683 retval = fsync_inode_data_buffers(inode);
1684 if (retval == 0)
1685 retval = filemap_fdatawait(mapping);
1686 if (retval < 0)
1687 goto out_free;
1688
1689 progress = retval = 0;
1690 while (count > 0) {
1691 iosize = count;
1692 if (iosize > chunk_size)
1693 iosize = chunk_size;
1694
1695 retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
1696 if (retval)
1697 break;
1698
1699 retval = do_call_directIO(rw, filp, iobuf, (offset+progress) >> blocksize_bits, blocksize);
1700
1701 if (rw == READ && retval > 0)
1702 mark_dirty_kiobuf(iobuf, retval);
1703
1704 if (retval >= 0) {
1705 count -= retval;
1706 buf += retval;
1707 /* warning: weird semantics here, we're reporting a read behind the end of the file */
1708 progress += retval;
1709 }
1710
1711 unmap_kiobuf(iobuf);
1712
1713 if (retval != iosize)
1714 break;
1715 }
1716
1717 if (progress)
1718 retval = progress;
1719
1720 out_free:
1721 if (!new_iobuf)
1722 clear_bit(0, &filp->f_iobuf_lock);
1723 else
1724 free_kiovec(1, &iobuf);
1725 out:
1726 return retval;
1727 }
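
/*
 * Worked example of the -EINVAL alignment check above (assuming a
 * 512-byte block size, i.e. i_blkbits == 9): the file offset, the
 * transfer count and the user buffer address must all be multiples of
 * 512, so an O_DIRECT read of 4096 bytes at offset 512 into a 512-byte
 * aligned buffer is accepted, while the same read at offset 100 is
 * rejected.
 */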
1728
1729 int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1730 {
1731 char *kaddr;
1732 unsigned long left, count = desc->count;
1733
1734 if (size > count)
1735 size = count;
1736
1737 kaddr = kmap(page);
1738 left = __copy_to_user(desc->buf, kaddr + offset, size);
1739 kunmap(page);
1740
1741 if (left) {
1742 size -= left;
1743 desc->error = -EFAULT;
1744 }
1745 desc->count = count - size;
1746 desc->written += size;
1747 desc->buf += size;
1748 return size;
1749 }
1750
1751 inline ssize_t do_generic_direct_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1752 {
1753 ssize_t retval;
1754 loff_t pos = *ppos;
1755
1756 retval = generic_file_direct_IO(READ, filp, buf, count, pos);
1757 if (retval > 0)
1758 *ppos = pos + retval;
1759 return retval;
1760 }
1761
1762 /*
1763 * This is the "read()" routine for all filesystems
1764 * that can use the page cache directly.
1765 */
1766 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1767 {
1768 ssize_t retval;
1769
1770 if ((ssize_t) count < 0)
1771 return -EINVAL;
1772
1773 if (filp->f_flags & O_DIRECT)
1774 goto o_direct;
1775
1776 retval = -EFAULT;
1777 if (access_ok(VERIFY_WRITE, buf, count)) {
1778 retval = 0;
1779
1780 if (count) {
1781 read_descriptor_t desc;
1782
1783 desc.written = 0;
1784 desc.count = count;
1785 desc.buf = buf;
1786 desc.error = 0;
1787 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1788
1789 retval = desc.written;
1790 if (!retval)
1791 retval = desc.error;
1792 }
1793 }
1794 out:
1795 return retval;
1796
1797 o_direct:
1798 {
1799 loff_t size;
1800 struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
1801 struct inode *inode = mapping->host;
1802
1803 retval = 0;
1804 if (!count)
1805 goto out; /* skip atime */
1806 down_read(&inode->i_alloc_sem);
1807 down(&inode->i_sem);
1808 size = inode->i_size;
1809 if (*ppos < size)
1810 retval = do_generic_direct_read(filp, buf, count, ppos);
1811 up(&inode->i_sem);
1812 up_read(&inode->i_alloc_sem);
1813 UPDATE_ATIME(filp->f_dentry->d_inode);
1814 goto out;
1815 }
1816 }
1817
1818 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1819 {
1820 ssize_t written;
1821 unsigned long count = desc->count;
1822 struct file *file = (struct file *) desc->buf;
1823
1824 if (size > count)
1825 size = count;
1826
1827 if (file->f_op->sendpage) {
1828 written = file->f_op->sendpage(file, page, offset,
1829 size, &file->f_pos, size<count);
1830 } else {
1831 char *kaddr;
1832 mm_segment_t old_fs;
1833
1834 old_fs = get_fs();
1835 set_fs(KERNEL_DS);
1836
1837 kaddr = kmap(page);
1838 written = file->f_op->write(file, kaddr + offset, size, &file->f_pos);
1839 kunmap(page);
1840
1841 set_fs(old_fs);
1842 }
1843 if (written < 0) {
1844 desc->error = written;
1845 written = 0;
1846 }
1847 desc->count = count - written;
1848 desc->written += written;
1849 return written;
1850 }
1851
1852 static ssize_t common_sendfile(int out_fd, int in_fd, loff_t *offset, size_t count)
1853 {
1854 ssize_t retval;
1855 struct file * in_file, * out_file;
1856 struct inode * in_inode, * out_inode;
1857
1858 /*
1859 * Get input file, and verify that it is ok..
1860 */
1861 retval = -EBADF;
1862 in_file = fget(in_fd);
1863 if (!in_file)
1864 goto out;
1865 if (!(in_file->f_mode & FMODE_READ))
1866 goto fput_in;
1867 retval = -EINVAL;
1868 in_inode = in_file->f_dentry->d_inode;
1869 if (!in_inode)
1870 goto fput_in;
1871 if (!in_inode->i_mapping->a_ops->readpage)
1872 goto fput_in;
1873 retval = rw_verify_area(READ, in_file, &in_file->f_pos, count);
1874 if (retval)
1875 goto fput_in;
1876
1877 /*
1878 * Get output file, and verify that it is ok..
1879 */
1880 retval = -EBADF;
1881 out_file = fget(out_fd);
1882 if (!out_file)
1883 goto fput_in;
1884 if (!(out_file->f_mode & FMODE_WRITE))
1885 goto fput_out;
1886 retval = -EINVAL;
1887 if (!out_file->f_op || !out_file->f_op->write)
1888 goto fput_out;
1889 out_inode = out_file->f_dentry->d_inode;
1890 retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
1891 if (retval)
1892 goto fput_out;
1893
1894 retval = 0;
1895 if (count) {
1896 read_descriptor_t desc;
1897
1898 if (!offset)
1899 offset = &in_file->f_pos;
1900
1901 desc.written = 0;
1902 desc.count = count;
1903 desc.buf = (char *) out_file;
1904 desc.error = 0;
1905 do_generic_file_read(in_file, offset, &desc, file_send_actor);
1906
1907 retval = desc.written;
1908 if (!retval)
1909 retval = desc.error;
1910 }
1911
1912 fput_out:
1913 fput(out_file);
1914 fput_in:
1915 fput(in_file);
1916 out:
1917 return retval;
1918 }
1919
1920 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1921 {
1922 loff_t pos, *ppos = NULL;
1923 ssize_t ret;
1924 if (offset) {
1925 off_t off;
1926 if (unlikely(get_user(off, offset)))
1927 return -EFAULT;
1928 pos = off;
1929 ppos = &pos;
1930 }
1931 ret = common_sendfile(out_fd, in_fd, ppos, count);
1932 if (offset)
1933 put_user((off_t)pos, offset);
1934 return ret;
1935 }
1936
1937 asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, loff_t *offset, size_t count)
1938 {
1939 loff_t pos, *ppos = NULL;
1940 ssize_t ret;
1941 if (offset) {
1942 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1943 return -EFAULT;
1944 ppos = &pos;
1945 }
1946 ret = common_sendfile(out_fd, in_fd, ppos, count);
1947 if (offset)
1948 put_user(pos, offset);
1949 return ret;
1950 }
1951
1952 static ssize_t do_readahead(struct file *file, unsigned long index, unsigned long nr)
1953 {
1954 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
1955 unsigned long max;
1956
1957 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1958 return -EINVAL;
1959
1960 /* Limit it to the size of the file.. */
1961 max = (mapping->host->i_size + ~PAGE_CACHE_MASK) >> PAGE_CACHE_SHIFT;
1962 if (index > max)
1963 return 0;
1964 max -= index;
1965 if (nr > max)
1966 nr = max;
1967
1968 /* And limit it to a sane percentage of the inactive list.. */
1969 max = (nr_free_pages() + nr_inactive_pages) / 2;
1970 if (nr > max)
1971 nr = max;
1972
1973 while (nr) {
1974 page_cache_read(file, index);
1975 index++;
1976 nr--;
1977 }
1978 return 0;
1979 }
1980
1981 asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1982 {
1983 ssize_t ret;
1984 struct file *file;
1985
1986 ret = -EBADF;
1987 file = fget(fd);
1988 if (file) {
1989 if (file->f_mode & FMODE_READ) {
1990 unsigned long start = offset >> PAGE_CACHE_SHIFT;
1991 unsigned long len = (count + ((long)offset & ~PAGE_CACHE_MASK)) >> PAGE_CACHE_SHIFT;
1992 ret = do_readahead(file, start, len);
1993 }
1994 fput(file);
1995 }
1996 return ret;
1997 }
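/*
 * Illustrative user-space sketch (not part of this file): readahead(2)
 * as implemented above only populates the page cache for the given
 * byte range, it transfers no data to user space.  A typical call
 * through the glibc wrapper looks like:
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *
 *	readahead(fd, 0, 16 * 1024 * 1024);	// warm the first 16MB
 */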
1998
1999 /*
2000 * Read-ahead and flush behind for MADV_SEQUENTIAL areas. Since we are
2001 * sure this is sequential access, we don't need a flexible read-ahead
2002 * window size -- we can always use a large fixed size window.
2003 */
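/*
 * A rough worked example of the fixed window below (numbers are only
 * illustrative): if get_max_readahead() returned 31 pages and
 * page_cluster were 4, CLUSTER_PAGES would be 16 and ra_window would
 * round up to CLUSTER_OFFSET(31 + 15) = 32 pages.  Each time the
 * faulting offset reaches the halfway mark of the current window,
 * another 32 pages are scheduled for read-in and, once we are far
 * enough into the area, the window two steps behind is dropped again
 * via filemap_sync(..., MS_INVALIDATE).
 */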
2004 static void nopage_sequential_readahead(struct vm_area_struct * vma,
2005 unsigned long pgoff, unsigned long filesize)
2006 {
2007 unsigned long ra_window;
2008
2009 ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
2010 ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
2011
2012 /* vm_raend is zero if we haven't read ahead in this area yet. */
2013 if (vma->vm_raend == 0)
2014 vma->vm_raend = vma->vm_pgoff + ra_window;
2015
2016 /*
2017 * If we've just faulted the page half-way through our window,
2018 * then schedule reads for the next window, and release the
2019 * pages in the previous window.
2020 */
2021 if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
2022 unsigned long start = vma->vm_pgoff + vma->vm_raend;
2023 unsigned long end = start + ra_window;
2024
2025 if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
2026 end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
2027 if (start > end)
2028 return;
2029
2030 while ((start < end) && (start < filesize)) {
2031 if (read_cluster_nonblocking(vma->vm_file,
2032 start, filesize) < 0)
2033 break;
2034 start += CLUSTER_PAGES;
2035 }
2036 run_task_queue(&tq_disk);
2037
2038 /* if we're far enough past the beginning of this area,
2039 recycle pages that are in the previous window. */
2040 if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
2041 unsigned long window = ra_window << PAGE_SHIFT;
2042
2043 end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
2044 end -= window + window;
2045 filemap_sync(vma, end - window, window, MS_INVALIDATE);
2046 }
2047
2048 vma->vm_raend += ra_window;
2049 }
2050
2051 return;
2052 }
2053
2054 /*
2055 * filemap_nopage() is invoked via the vma operations vector for a
2056 * mapped memory region to read in file data during a page fault.
2057 *
2058 * The goto's are kind of ugly, but this streamlines the normal case of having
2059 * it in the page cache, and handles the special cases reasonably without
2060 * having a lot of duplicated code.
2061 */
2062 struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused)
2063 {
2064 int error;
2065 struct file *file = area->vm_file;
2066 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
2067 struct inode *inode = mapping->host;
2068 struct page *page, **hash;
2069 unsigned long size, pgoff, endoff;
2070
2071 pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
2072 endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
2073
2074 retry_all:
2075 /*
2076 * An external ptracer can access pages that normally aren't
2077 * accessible..
2078 */
2079 size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2080 if ((pgoff >= size) && (area->vm_mm == current->mm))
2081 return NULL;
2082
2083 /* The "size" of the file, as far as mmap is concerned, isn't bigger than the mapping */
2084 if (size > endoff)
2085 size = endoff;
2086
2087 /*
2088 * Do we have something in the page cache already?
2089 */
2090 hash = page_hash(mapping, pgoff);
2091 retry_find:
2092 page = __find_get_page(mapping, pgoff, hash);
2093 if (!page)
2094 goto no_cached_page;
2095
2096 /*
2097 * Ok, found a page in the page cache, now we need to check
2098 * that it's up-to-date.
2099 */
2100 if (!Page_Uptodate(page))
2101 goto page_not_uptodate;
2102
2103 success:
2104 /*
2105 * Try read-ahead for sequential areas.
2106 */
2107 if (VM_SequentialReadHint(area))
2108 nopage_sequential_readahead(area, pgoff, size);
2109
2110 /*
2111 * Found the page and have a reference on it, need to check sharing
2112 * and possibly copy it over to another page..
2113 */
2114 mark_page_accessed(page);
2115 flush_page_to_ram(page);
2116 return page;
2117
2118 no_cached_page:
2119 /*
2120 * If the requested offset is within our file, try to read a whole
2121 * cluster of pages at once.
2122 *
2123 * Otherwise, we're off the end of a privately mapped file,
2124 * so we need to map a zero page.
2125 */
2126 if ((pgoff < size) && !VM_RandomReadHint(area))
2127 error = read_cluster_nonblocking(file, pgoff, size);
2128 else
2129 error = page_cache_read(file, pgoff);
2130
2131 /*
2132 * The page we want has now been added to the page cache.
2133 * In the unlikely event that someone removed it in the
2134 * meantime, we'll just come back here and read it again.
2135 */
2136 if (error >= 0)
2137 goto retry_find;
2138
2139 /*
2140 * An error return from page_cache_read can result if the
2141 * system is low on memory, or a problem occurs while trying
2142 * to schedule I/O.
2143 */
2144 if (error == -ENOMEM)
2145 return NOPAGE_OOM;
2146 return NULL;
2147
2148 page_not_uptodate:
2149 lock_page(page);
2150
2151 /* Did it get unhashed while we waited for it? */
2152 if (!page->mapping) {
2153 UnlockPage(page);
2154 page_cache_release(page);
2155 goto retry_all;
2156 }
2157
2158 /* Did somebody else get it up-to-date? */
2159 if (Page_Uptodate(page)) {
2160 UnlockPage(page);
2161 goto success;
2162 }
2163
2164 if (!mapping->a_ops->readpage(file, page)) {
2165 wait_on_page(page);
2166 if (Page_Uptodate(page))
2167 goto success;
2168 }
2169
2170 /*
2171 * Umm, take care of errors if the page isn't up-to-date.
2172 * Try to re-read it _once_. We do this synchronously,
2173 * because there really aren't any performance issues here
2174 * and we need to check for errors.
2175 */
2176 lock_page(page);
2177
2178 /* Somebody truncated the page on us? */
2179 if (!page->mapping) {
2180 UnlockPage(page);
2181 page_cache_release(page);
2182 goto retry_all;
2183 }
2184
2185 /* Somebody else successfully read it in? */
2186 if (Page_Uptodate(page)) {
2187 UnlockPage(page);
2188 goto success;
2189 }
2190 ClearPageError(page);
2191 if (!mapping->a_ops->readpage(file, page)) {
2192 wait_on_page(page);
2193 if (Page_Uptodate(page))
2194 goto success;
2195 }
2196
2197 /*
2198 * Things didn't work out. Return zero to tell the
2199 * mm layer so, possibly freeing the page cache page first.
2200 */
2201 page_cache_release(page);
2202 return NULL;
2203 }
2204
2205 /* Called with mm->page_table_lock held to protect against other
2206 * threads/the swapper from ripping pte's out from under us.
2207 */
2208 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
2209 unsigned long address, unsigned int flags)
2210 {
2211 pte_t pte = *ptep;
2212
2213 if (pte_present(pte)) {
2214 struct page *page = pte_page(pte);
2215 if (VALID_PAGE(page) && !PageReserved(page) && ptep_test_and_clear_dirty(ptep)) {
2216 flush_tlb_page(vma, address);
2217 set_page_dirty(page);
2218 }
2219 }
2220 return 0;
2221 }
2222
2223 static inline int filemap_sync_pte_range(pmd_t * pmd,
2224 unsigned long address, unsigned long size,
2225 struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
2226 {
2227 pte_t * pte;
2228 unsigned long end;
2229 int error;
2230
2231 if (pmd_none(*pmd))
2232 return 0;
2233 if (pmd_bad(*pmd)) {
2234 pmd_ERROR(*pmd);
2235 pmd_clear(pmd);
2236 return 0;
2237 }
2238 pte = pte_offset(pmd, address);
2239 offset += address & PMD_MASK;
2240 address &= ~PMD_MASK;
2241 end = address + size;
2242 if (end > PMD_SIZE)
2243 end = PMD_SIZE;
2244 error = 0;
2245 do {
2246 error |= filemap_sync_pte(pte, vma, address + offset, flags);
2247 address += PAGE_SIZE;
2248 pte++;
2249 } while (address && (address < end));
2250 return error;
2251 }
2252
2253 static inline int filemap_sync_pmd_range(pgd_t * pgd,
2254 unsigned long address, unsigned long size,
2255 struct vm_area_struct *vma, unsigned int flags)
2256 {
2257 pmd_t * pmd;
2258 unsigned long offset, end;
2259 int error;
2260
2261 if (pgd_none(*pgd))
2262 return 0;
2263 if (pgd_bad(*pgd)) {
2264 pgd_ERROR(*pgd);
2265 pgd_clear(pgd);
2266 return 0;
2267 }
2268 pmd = pmd_offset(pgd, address);
2269 offset = address & PGDIR_MASK;
2270 address &= ~PGDIR_MASK;
2271 end = address + size;
2272 if (end > PGDIR_SIZE)
2273 end = PGDIR_SIZE;
2274 error = 0;
2275 do {
2276 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
2277 address = (address + PMD_SIZE) & PMD_MASK;
2278 pmd++;
2279 } while (address && (address < end));
2280 return error;
2281 }
2282
2283 int filemap_sync(struct vm_area_struct * vma, unsigned long address,
2284 size_t size, unsigned int flags)
2285 {
2286 pgd_t * dir;
2287 unsigned long end = address + size;
2288 int error = 0;
2289
2290 /* Acquire the lock early; it may be possible to avoid dropping
2291 * and reacquiring it repeatedly.
2292 */
2293 spin_lock(&vma->vm_mm->page_table_lock);
2294
2295 dir = pgd_offset(vma->vm_mm, address);
2296 flush_cache_range(vma->vm_mm, end - size, end);
2297 if (address >= end)
2298 BUG();
2299 do {
2300 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
2301 address = (address + PGDIR_SIZE) & PGDIR_MASK;
2302 dir++;
2303 } while (address && (address < end));
2304 flush_tlb_range(vma->vm_mm, end - size, end);
2305
2306 spin_unlock(&vma->vm_mm->page_table_lock);
2307
2308 return error;
2309 }
2310
2311 static struct vm_operations_struct generic_file_vm_ops = {
2312 nopage: filemap_nopage,
2313 };
2314
2315 /* This is used for a general mmap of a disk file */
2316
2317 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
2318 {
2319 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
2320 struct inode *inode = mapping->host;
2321
2322 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
2323 if (!mapping->a_ops->writepage)
2324 return -EINVAL;
2325 }
2326 if (!mapping->a_ops->readpage)
2327 return -ENOEXEC;
2328 UPDATE_ATIME(inode);
2329 vma->vm_ops = &generic_file_vm_ops;
2330 return 0;
2331 }
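/*
 * Illustrative user-space sketch (not part of this file): a filesystem
 * using generic_file_mmap() ends up serving mappings such as
 *
 *	#include <sys/mman.h>
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_SHARED, fd, 0);
 *
 * A writable MAP_SHARED mapping like this one is refused with -EINVAL
 * above unless the address space provides ->writepage(), and every
 * mapping needs ->readpage() so filemap_nopage() can fault pages in.
 */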
2332
2333 /*
2334 * The msync() system call.
2335 */
2336
2337 /*
2338 * MS_SYNC syncs the entire file - including mappings.
2339 *
2340 * MS_ASYNC initiates writeout of just the dirty mapped data.
2341 * This provides no guarantee of file integrity - things like indirect
2342 * blocks may not have started writeout. MS_ASYNC is primarily useful
2343 * where the application knows that it has finished with the data and
2344 * wishes to intelligently schedule its own I/O traffic.
2345 */
2346 static int msync_interval(struct vm_area_struct * vma,
2347 unsigned long start, unsigned long end, int flags)
2348 {
2349 int ret = 0;
2350 struct file * file = vma->vm_file;
2351
2352 if ( (flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED) )
2353 return -EBUSY;
2354
2355 if (file && (vma->vm_flags & VM_SHARED)) {
2356 ret = filemap_sync(vma, start, end-start, flags);
2357
2358 if (!ret && (flags & (MS_SYNC|MS_ASYNC))) {
2359 struct inode * inode = file->f_dentry->d_inode;
2360
2361 down(&inode->i_sem);
2362 ret = filemap_fdatasync(inode->i_mapping);
2363 if (flags & MS_SYNC) {
2364 int err;
2365
2366 if (file->f_op && file->f_op->fsync) {
2367 err = file->f_op->fsync(file, file->f_dentry, 1);
2368 if (err && !ret)
2369 ret = err;
2370 }
2371 err = filemap_fdatawait(inode->i_mapping);
2372 if (err && !ret)
2373 ret = err;
2374 }
2375 up(&inode->i_sem);
2376 }
2377 }
2378 return ret;
2379 }
2380
2381 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
2382 {
2383 unsigned long end;
2384 struct vm_area_struct * vma;
2385 int unmapped_error, error = -EINVAL;
2386
2387 down_read(&current->mm->mmap_sem);
2388 if (start & ~PAGE_MASK)
2389 goto out;
2390 len = (len + ~PAGE_MASK) & PAGE_MASK;
2391 end = start + len;
2392 if (end < start)
2393 goto out;
2394 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
2395 goto out;
2396 if ((flags & MS_ASYNC) && (flags & MS_SYNC))
2397 goto out;
2398
2399 error = 0;
2400 if (end == start)
2401 goto out;
2402 /*
2403 * If the interval [start,end) covers some unmapped address ranges,
2404 * just ignore them, but return -ENOMEM at the end.
2405 */
2406 vma = find_vma(current->mm, start);
2407 unmapped_error = 0;
2408 for (;;) {
2409 /* Still start < end. */
2410 error = -ENOMEM;
2411 if (!vma)
2412 goto out;
2413 /* Here start < vma->vm_end. */
2414 if (start < vma->vm_start) {
2415 unmapped_error = -ENOMEM;
2416 start = vma->vm_start;
2417 }
2418 /* Here vma->vm_start <= start < vma->vm_end. */
2419 if (end <= vma->vm_end) {
2420 if (start < end) {
2421 error = msync_interval(vma, start, end, flags);
2422 if (error)
2423 goto out;
2424 }
2425 error = unmapped_error;
2426 goto out;
2427 }
2428 /* Here vma->vm_start <= start < vma->vm_end < end. */
2429 error = msync_interval(vma, start, vma->vm_end, flags);
2430 if (error)
2431 goto out;
2432 start = vma->vm_end;
2433 vma = vma->vm_next;
2434 }
2435 out:
2436 up_read(&current->mm->mmap_sem);
2437 return error;
2438 }
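/*
 * Illustrative user-space sketch (not part of this file): typical use
 * of the msync(2) call implemented above.
 *
 *	#include <sys/mman.h>
 *
 *	// flush a shared file mapping and wait for the data to hit disk
 *	if (msync(addr, len, MS_SYNC) < 0)
 *		perror("msync");
 *
 * addr must be page aligned, and MS_SYNC and MS_ASYNC are mutually
 * exclusive, as checked above.
 */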
2439
2440 static inline void setup_read_behavior(struct vm_area_struct * vma,
2441 int behavior)
2442 {
2443 VM_ClearReadHint(vma);
2444 switch(behavior) {
2445 case MADV_SEQUENTIAL:
2446 vma->vm_flags |= VM_SEQ_READ;
2447 break;
2448 case MADV_RANDOM:
2449 vma->vm_flags |= VM_RAND_READ;
2450 break;
2451 default:
2452 break;
2453 }
2454 return;
2455 }
2456
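/*
 * The three fixup helpers below apply a new read-behavior hint to just
 * part of a vma: madvise_fixup_start() applies it to the head
 * ([vm_start, end)), madvise_fixup_end() to the tail ([start, vm_end)),
 * and madvise_fixup_middle() carves the affected range out of the
 * middle, leaving the split-off pieces with their old behavior.
 */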
2457 static long madvise_fixup_start(struct vm_area_struct * vma,
2458 unsigned long end, int behavior)
2459 {
2460 struct vm_area_struct * n;
2461 struct mm_struct * mm = vma->vm_mm;
2462
2463 n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2464 if (!n)
2465 return -EAGAIN;
2466 *n = *vma;
2467 n->vm_end = end;
2468 setup_read_behavior(n, behavior);
2469 n->vm_raend = 0;
2470 if (n->vm_file)
2471 get_file(n->vm_file);
2472 if (n->vm_ops && n->vm_ops->open)
2473 n->vm_ops->open(n);
2474 vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
2475 lock_vma_mappings(vma);
2476 spin_lock(&mm->page_table_lock);
2477 vma->vm_start = end;
2478 __insert_vm_struct(mm, n);
2479 spin_unlock(&mm->page_table_lock);
2480 unlock_vma_mappings(vma);
2481 return 0;
2482 }
2483
2484 static long madvise_fixup_end(struct vm_area_struct * vma,
2485 unsigned long start, int behavior)
2486 {
2487 struct vm_area_struct * n;
2488 struct mm_struct * mm = vma->vm_mm;
2489
2490 n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2491 if (!n)
2492 return -EAGAIN;
2493 *n = *vma;
2494 n->vm_start = start;
2495 n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
2496 setup_read_behavior(n, behavior);
2497 n->vm_raend = 0;
2498 if (n->vm_file)
2499 get_file(n->vm_file);
2500 if (n->vm_ops && n->vm_ops->open)
2501 n->vm_ops->open(n);
2502 lock_vma_mappings(vma);
2503 spin_lock(&mm->page_table_lock);
2504 vma->vm_end = start;
2505 __insert_vm_struct(mm, n);
2506 spin_unlock(&mm->page_table_lock);
2507 unlock_vma_mappings(vma);
2508 return 0;
2509 }
2510
2511 static long madvise_fixup_middle(struct vm_area_struct * vma,
2512 unsigned long start, unsigned long end, int behavior)
2513 {
2514 struct vm_area_struct * left, * right;
2515 struct mm_struct * mm = vma->vm_mm;
2516
2517 left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2518 if (!left)
2519 return -EAGAIN;
2520 right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2521 if (!right) {
2522 kmem_cache_free(vm_area_cachep, left);
2523 return -EAGAIN;
2524 }
2525 *left = *vma;
2526 *right = *vma;
2527 left->vm_end = start;
2528 right->vm_start = end;
2529 right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
2530 left->vm_raend = 0;
2531 right->vm_raend = 0;
2532 if (vma->vm_file)
2533 atomic_add(2, &vma->vm_file->f_count);
2534
2535 if (vma->vm_ops && vma->vm_ops->open) {
2536 vma->vm_ops->open(left);
2537 vma->vm_ops->open(right);
2538 }
2539 vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
2540 vma->vm_raend = 0;
2541 lock_vma_mappings(vma);
2542 spin_lock(&mm->page_table_lock);
2543 vma->vm_start = start;
2544 vma->vm_end = end;
2545 setup_read_behavior(vma, behavior);
2546 __insert_vm_struct(mm, left);
2547 __insert_vm_struct(mm, right);
2548 spin_unlock(&mm->page_table_lock);
2549 unlock_vma_mappings(vma);
2550 return 0;
2551 }
2552
2553 /*
2554 * We can potentially split a vm area into separate
2555 * areas, each area with its own behavior.
2556 */
2557 static long madvise_behavior(struct vm_area_struct * vma,
2558 unsigned long start, unsigned long end, int behavior)
2559 {
2560 int error = 0;
2561
2562 /* This caps the number of vma's this process can own */
2563 if (vma->vm_mm->map_count > max_map_count)
2564 return -ENOMEM;
2565
2566 if (start == vma->vm_start) {
2567 if (end == vma->vm_end) {
2568 setup_read_behavior(vma, behavior);
2569 vma->vm_raend = 0;
2570 } else
2571 error = madvise_fixup_start(vma, end, behavior);
2572 } else {
2573 if (end == vma->vm_end)
2574 error = madvise_fixup_end(vma, start, behavior);
2575 else
2576 error = madvise_fixup_middle(vma, start, end, behavior);
2577 }
2578
2579 return error;
2580 }
2581
2582 /*
2583 * Schedule all required I/O operations, then run the disk queue
2584 * to make sure they are started. Do not wait for completion.
2585 */
2586 static long madvise_willneed(struct vm_area_struct * vma,
2587 unsigned long start, unsigned long end)
2588 {
2589 long error = -EBADF;
2590 struct file * file;
2591 struct inode * inode;
2592 unsigned long size;
2593
2594 /* Doesn't work if there's no mapped file. */
2595 if (!vma->vm_file)
2596 return error;
2597 file = vma->vm_file;
2598 inode = file->f_dentry->d_inode;
2599 if (!inode->i_mapping->a_ops->readpage)
2600 return error;
2601 size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2602
2603 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2604 if (end > vma->vm_end)
2605 end = vma->vm_end;
2606 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2607
2608 error = -EIO;
2609
2610 /* round to cluster boundaries if this isn't a "random" area. */
2611 if (!VM_RandomReadHint(vma)) {
2612 start = CLUSTER_OFFSET(start);
2613 end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
2614
2615 while ((start < end) && (start < size)) {
2616 error = read_cluster_nonblocking(file, start, size);
2617 start += CLUSTER_PAGES;
2618 if (error < 0)
2619 break;
2620 }
2621 } else {
2622 while ((start < end) && (start < size)) {
2623 error = page_cache_read(file, start);
2624 start++;
2625 if (error < 0)
2626 break;
2627 }
2628 }
2629
2630 /* Don't wait for someone else to push these requests. */
2631 run_task_queue(&tq_disk);
2632
2633 return error;
2634 }
2635
2636 /*
2637 * Application no longer needs these pages. If the pages are dirty,
2638 * it's OK to just throw them away. The app will be more careful about
2639 * data it wants to keep. Be sure to free swap resources too. The
2640 * zap_page_range call sets things up for refill_inactive to actually free
2641 * these pages later if no one else has touched them in the meantime,
2642 * although we could add these pages to a global reuse list for
2643 * refill_inactive to pick up before reclaiming other pages.
2644 *
2645 * NB: This interface discards data rather than pushes it out to swap,
2646 * as some implementations do. This has performance implications for
2647 * applications like large transactional databases which want to discard
2648 * pages in anonymous maps after committing to backing store the data
2649 * that was kept in them. There is no reason to write this data out to
2650 * the swap area if the application is discarding it.
2651 *
2652 * An interface that causes the system to free clean pages and flush
2653 * dirty pages is already available as msync(MS_INVALIDATE).
2654 */
2655 static long madvise_dontneed(struct vm_area_struct * vma,
2656 unsigned long start, unsigned long end)
2657 {
2658 if (vma->vm_flags & VM_LOCKED)
2659 return -EINVAL;
2660
2661 zap_page_range(vma->vm_mm, start, end - start);
2662 return 0;
2663 }
2664
2665 static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
2666 unsigned long end, int behavior)
2667 {
2668 long error = -EBADF;
2669
2670 switch (behavior) {
2671 case MADV_NORMAL:
2672 case MADV_SEQUENTIAL:
2673 case MADV_RANDOM:
2674 error = madvise_behavior(vma, start, end, behavior);
2675 break;
2676
2677 case MADV_WILLNEED:
2678 error = madvise_willneed(vma, start, end);
2679 break;
2680
2681 case MADV_DONTNEED:
2682 error = madvise_dontneed(vma, start, end);
2683 break;
2684
2685 default:
2686 error = -EINVAL;
2687 break;
2688 }
2689
2690 return error;
2691 }
2692
2693 /*
2694 * The madvise(2) system call.
2695 *
2696 * Applications can use madvise() to advise the kernel how it should
2697 * handle paging I/O in this VM area. The idea is to help the kernel
2698 * use appropriate read-ahead and caching techniques. The information
2699 * provided is advisory only, and can be safely disregarded by the
2700 * kernel without affecting the correct operation of the application.
2701 *
2702 * behavior values:
2703 * MADV_NORMAL - the default behavior is to read clusters. This
2704 * results in some read-ahead and read-behind.
2705 * MADV_RANDOM - the system should read the minimum amount of data
2706 * on any access, since it is unlikely that the appli-
2707 * cation will need more than what it asks for.
2708 * MADV_SEQUENTIAL - pages in the given range will probably be accessed
2709 * once, so they can be aggressively read ahead, and
2710 * can be freed soon after they are accessed.
2711 * MADV_WILLNEED - the application is notifying the system to read
2712 * some pages ahead.
2713 * MADV_DONTNEED - the application is finished with the given range,
2714 * so the kernel can free resources associated with it.
2715 *
2716 * return values:
2717 * zero - success
2718 * -EINVAL - start + len < 0, start is not page-aligned,
2719 * "behavior" is not a valid value, or application
2720 * is attempting to release locked or shared pages.
2721 * -ENOMEM - addresses in the specified range are not currently
2722 * mapped, or are outside the AS of the process.
2723 * -EIO - an I/O error occurred while paging in data.
2724 * -EBADF - map exists, but area maps something that isn't a file.
2725 * -EAGAIN - a kernel resource was temporarily unavailable.
2726 */
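/*
 * Illustrative user-space sketch (not part of this file): advising the
 * kernel about a mapping before streaming through it.
 *
 *	#include <sys/mman.h>
 *
 *	void *p = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
 *	madvise(p, len, MADV_SEQUENTIAL);	// aggressive read-ahead
 *	...
 *	madvise(p, len, MADV_DONTNEED);		// drop the pages again
 */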
2727 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
2728 {
2729 unsigned long end;
2730 struct vm_area_struct * vma;
2731 int unmapped_error = 0;
2732 int error = -EINVAL;
2733
2734 down_write(&current->mm->mmap_sem);
2735
2736 if (start & ~PAGE_MASK)
2737 goto out;
2738 len = (len + ~PAGE_MASK) & PAGE_MASK;
2739 end = start + len;
2740 if (end < start)
2741 goto out;
2742
2743 error = 0;
2744 if (end == start)
2745 goto out;
2746
2747 /*
2748 * If the interval [start,end) covers some unmapped address
2749 * ranges, just ignore them, but return -ENOMEM at the end.
2750 */
2751 vma = find_vma(current->mm, start);
2752 for (;;) {
2753 /* Still start < end. */
2754 error = -ENOMEM;
2755 if (!vma)
2756 goto out;
2757
2758 /* Here start < vma->vm_end. */
2759 if (start < vma->vm_start) {
2760 unmapped_error = -ENOMEM;
2761 start = vma->vm_start;
2762 }
2763
2764 /* Here vma->vm_start <= start < vma->vm_end. */
2765 if (end <= vma->vm_end) {
2766 if (start < end) {
2767 error = madvise_vma(vma, start, end,
2768 behavior);
2769 if (error)
2770 goto out;
2771 }
2772 error = unmapped_error;
2773 goto out;
2774 }
2775
2776 /* Here vma->vm_start <= start < vma->vm_end < end. */
2777 error = madvise_vma(vma, start, vma->vm_end, behavior);
2778 if (error)
2779 goto out;
2780 start = vma->vm_end;
2781 vma = vma->vm_next;
2782 }
2783
2784 out:
2785 up_write(&current->mm->mmap_sem);
2786 return error;
2787 }
2788
2789 /*
2790 * Later we can get more picky about what "in core" means precisely.
2791 * For now, simply check to see if the page is in the page cache,
2792 * and is up to date; i.e. that no page-in operation would be required
2793 * at this time if an application were to map and access this page.
2794 */
2795 static unsigned char mincore_page(struct vm_area_struct * vma,
2796 unsigned long pgoff)
2797 {
2798 unsigned char present = 0;
2799 struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping;
2800 struct page * page, ** hash = page_hash(as, pgoff);
2801
2802 spin_lock(&pagecache_lock);
2803 page = __find_page_nolock(as, pgoff, *hash);
2804 if ((page) && (Page_Uptodate(page)))
2805 present = 1;
2806 spin_unlock(&pagecache_lock);
2807
2808 return present;
2809 }
2810
2811 /*
2812 * Do a chunk of "sys_mincore()". We've already checked
2813 * all the arguments, we hold the mmap semaphore: we should
2814 * just return the amount of info we're asked for.
2815 */
2816 static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages)
2817 {
2818 unsigned long i, nr, pgoff;
2819 struct vm_area_struct *vma = find_vma(current->mm, addr);
2820
2821 /*
2822 * find_vma() didn't find anything above us, or we're
2823 * in an unmapped hole in the address space: ENOMEM.
2824 */
2825 if (!vma || addr < vma->vm_start)
2826 return -ENOMEM;
2827
2828 /*
2829 * Ok, got it. But check whether it's a segment we support
2830 * mincore() on. Right now, we don't do any anonymous mappings.
2831 *
2832 * FIXME: This is just stupid. And returning ENOMEM is
2833 * stupid too. We should just look at the page tables. But
2834 * this is what we've traditionally done, so we'll just
2835 * continue doing it.
2836 */
2837 if (!vma->vm_file)
2838 return -ENOMEM;
2839
2840 /*
2841 * Calculate how many pages there are left in the vma, and
2842 * what the pgoff is for our address.
2843 */
2844 nr = (vma->vm_end - addr) >> PAGE_SHIFT;
2845 if (nr > pages)
2846 nr = pages;
2847
2848 pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
2849 pgoff += vma->vm_pgoff;
2850
2851 /* And then we just fill the sucker in.. */
2852 for (i = 0 ; i < nr; i++, pgoff++)
2853 vec[i] = mincore_page(vma, pgoff);
2854
2855 return nr;
2856 }
2857
2858 /*
2859 * The mincore(2) system call.
2860 *
2861 * mincore() returns the memory residency status of the pages in the
2862 * current process's address space specified by [addr, addr + len).
2863 * The status is returned in a vector of bytes. The least significant
2864 * bit of each byte is 1 if the referenced page is in memory, otherwise
2865 * it is zero.
2866 *
2867 * Because the status of a page can change after mincore() checks it
2868 * but before it returns to the application, the returned vector may
2869 * contain stale information. Only locked pages are guaranteed to
2870 * remain in memory.
2871 *
2872 * return values:
2873 * zero - success
2874 * -EFAULT - vec points to an illegal address
2875 * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE
2876 * -ENOMEM - Addresses in the range [addr, addr + len] are
2877 * invalid for the address space of this process, or
2878 * specify one or more pages which are not currently
2879 * mapped
2880 * -EAGAIN - A kernel resource was temporarily unavailable.
2881 */
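/*
 * Illustrative user-space sketch (not part of this file): querying
 * residency of a mapping with the mincore(2) call implemented below
 * (page_size here stands for sysconf(_SC_PAGESIZE)).
 *
 *	#include <sys/mman.h>
 *
 *	size_t pages = (len + page_size - 1) / page_size;
 *	unsigned char *vec = malloc(pages);
 *
 *	if (mincore(addr, len, vec) == 0) {
 *		// vec[i] & 1 is set when page i is resident
 *	}
 */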
2882 asmlinkage long sys_mincore(unsigned long start, size_t len, unsigned char *vec)
2883 {
2884 long retval;
2885 unsigned long pages;
2886 unsigned char *tmp;
2887
2888 /* Check the start address: needs to be page-aligned.. */
2889 if (start & ~PAGE_CACHE_MASK)
2890 return -EINVAL;
2891
2892 /* ..and we need to be passed a valid user-space range */
2893 if (!access_ok(VERIFY_READ, (void *) start, len))
2894 return -ENOMEM;
2895
2896 /* This also avoids any overflows on PAGE_CACHE_ALIGN */
2897 pages = len >> PAGE_SHIFT;
2898 pages += (len & ~PAGE_MASK) != 0;
2899
2900 if (!access_ok(VERIFY_WRITE, vec, pages))
2901 return -EFAULT;
2902
2903 tmp = (void *) __get_free_page(GFP_USER);
2904 if (!tmp)
2905 return -EAGAIN;
2906
2907 retval = 0;
2908 while (pages) {
2909 /*
2910 * Do at most PAGE_SIZE entries per iteration, due to
2911 * the temporary buffer size.
2912 */
2913 down_read(&current->mm->mmap_sem);
2914 retval = do_mincore(start, tmp, min(pages, PAGE_SIZE));
2915 up_read(&current->mm->mmap_sem);
2916
2917 if (retval <= 0)
2918 break;
2919 if (copy_to_user(vec, tmp, retval)) {
2920 retval = -EFAULT;
2921 break;
2922 }
2923 pages -= retval;
2924 vec += retval;
2925 start += retval << PAGE_SHIFT;
2926 retval = 0;
2927 }
2928 free_page((unsigned long) tmp);
2929 return retval;
2930 }
2931
2932 static inline
2933 struct page *__read_cache_page(struct address_space *mapping,
2934 unsigned long index,
2935 int (*filler)(void *,struct page*),
2936 void *data)
2937 {
2938 struct page **hash = page_hash(mapping, index);
2939 struct page *page, *cached_page = NULL;
2940 int err;
2941 repeat:
2942 page = __find_get_page(mapping, index, hash);
2943 if (!page) {
2944 if (!cached_page) {
2945 cached_page = page_cache_alloc(mapping);
2946 if (!cached_page)
2947 return ERR_PTR(-ENOMEM);
2948 }
2949 page = cached_page;
2950 if (add_to_page_cache_unique(page, mapping, index, hash))
2951 goto repeat;
2952 cached_page = NULL;
2953 err = filler(data, page);
2954 if (err < 0) {
2955 page_cache_release(page);
2956 page = ERR_PTR(err);
2957 }
2958 }
2959 if (cached_page)
2960 page_cache_release(cached_page);
2961 return page;
2962 }
2963
2964 /*
2965 * Read into the page cache. If a page already exists,
2966 * and Page_Uptodate() is not set, try to fill the page.
2967 */
2968 struct page *read_cache_page(struct address_space *mapping,
2969 unsigned long index,
2970 int (*filler)(void *,struct page*),
2971 void *data)
2972 {
2973 struct page *page;
2974 int err;
2975
2976 retry:
2977 page = __read_cache_page(mapping, index, filler, data);
2978 if (IS_ERR(page))
2979 goto out;
2980 mark_page_accessed(page);
2981 if (Page_Uptodate(page))
2982 goto out;
2983
2984 lock_page(page);
2985 if (!page->mapping) {
2986 UnlockPage(page);
2987 page_cache_release(page);
2988 goto retry;
2989 }
2990 if (Page_Uptodate(page)) {
2991 UnlockPage(page);
2992 goto out;
2993 }
2994 err = filler(data, page);
2995 if (err < 0) {
2996 page_cache_release(page);
2997 page = ERR_PTR(err);
2998 }
2999 out:
3000 return page;
3001 }
3002
3003 static inline struct page * __grab_cache_page(struct address_space *mapping,
3004 unsigned long index, struct page **cached_page)
3005 {
3006 struct page *page, **hash = page_hash(mapping, index);
3007 repeat:
3008 page = __find_lock_page(mapping, index, hash);
3009 if (!page) {
3010 if (!*cached_page) {
3011 *cached_page = page_cache_alloc(mapping);
3012 if (!*cached_page)
3013 return NULL;
3014 }
3015 page = *cached_page;
3016 if (add_to_page_cache_unique(page, mapping, index, hash))
3017 goto repeat;
3018 *cached_page = NULL;
3019 }
3020 return page;
3021 }
3022
3023 inline void remove_suid(struct inode *inode)
3024 {
3025 unsigned int mode;
3026
3027 /* set S_ISGID if S_IXGRP is set, and always set S_ISUID */
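/*
 * S_ISGID/S_IXGRP is the constant 0200, so the multiplication below
 * yields S_ISGID exactly when the group-execute bit is set (setgid
 * without group-execute means mandatory locking, which we leave
 * alone), and S_ISUID is always included.  After masking with the
 * real i_mode, any bits left are cleared unless the writer has
 * CAP_FSETID.
 */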
3028 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
3029
3030 /* was any of the uid bits set? */
3031 mode &= inode->i_mode;
3032 if (mode && !capable(CAP_FSETID)) {
3033 inode->i_mode &= ~mode;
3034 mark_inode_dirty(inode);
3035 }
3036 }
3037
3038 /*
3039 * precheck_file_write():
3040 * Check the conditions on a file descriptor prior to beginning a write
3041 * on it. Contains the common precheck code for both buffered and direct
3042 * IO.
3043 */
3044 int precheck_file_write(struct file *file, struct inode *inode,
3045 size_t *count, loff_t *ppos)
3046 {
3047 ssize_t err;
3048 unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
3049 loff_t pos = *ppos;
3050
3051 err = -EINVAL;
3052 if (pos < 0)
3053 goto out;
3054
3055 err = file->f_error;
3056 if (err) {
3057 file->f_error = 0;
3058 goto out;
3059 }
3060
3061 /* FIXME: this is for backwards compatibility with 2.4 */
3062 if (!S_ISBLK(inode->i_mode) && (file->f_flags & O_APPEND))
3063 *ppos = pos = inode->i_size;
3064
3065 /*
3066 * Check whether we've reached the file size limit.
3067 */
3068 err = -EFBIG;
3069
3070 if (!S_ISBLK(inode->i_mode) && limit != RLIM_INFINITY) {
3071 if (pos >= limit) {
3072 send_sig(SIGXFSZ, current, 0);
3073 goto out;
3074 }
3075 if (pos > 0xFFFFFFFFULL || *count > limit - (u32)pos) {
3076 /* send_sig(SIGXFSZ, current, 0); */
3077 *count = limit - (u32)pos;
3078 }
3079 }
3080
3081 /*
3082 * LFS rule
3083 */
3084 if ( pos + *count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
3085 if (pos >= MAX_NON_LFS) {
3086 send_sig(SIGXFSZ, current, 0);
3087 goto out;
3088 }
3089 if (*count > MAX_NON_LFS - (u32)pos) {
3090 /* send_sig(SIGXFSZ, current, 0); */
3091 *count = MAX_NON_LFS - (u32)pos;
3092 }
3093 }
3094
3095 /*
3096 * Are we about to exceed the fs block limit ?
3097 *
3098 * If we have written data it becomes a short write
3099 * If we have exceeded without writing data we send
3100 * a signal and give them an EFBIG.
3101 *
3102 * Linus' frestrict idea will clean these up nicely..
3103 */
3104
3105 if (!S_ISBLK(inode->i_mode)) {
3106 if (pos >= inode->i_sb->s_maxbytes)
3107 {
3108 if (*count || pos > inode->i_sb->s_maxbytes) {
3109 send_sig(SIGXFSZ, current, 0);
3110 err = -EFBIG;
3111 goto out;
3112 }
3113 /* zero-length writes at ->s_maxbytes are OK */
3114 }
3115
3116 if (pos + *count > inode->i_sb->s_maxbytes)
3117 *count = inode->i_sb->s_maxbytes - pos;
3118 } else {
3119 if (is_read_only(inode->i_rdev)) {
3120 err = -EPERM;
3121 goto out;
3122 }
3123 if (pos >= inode->i_size) {
3124 if (*count || pos > inode->i_size) {
3125 err = -ENOSPC;
3126 goto out;
3127 }
3128 }
3129
3130 if (pos + *count > inode->i_size)
3131 *count = inode->i_size - pos;
3132 }
3133
3134 err = 0;
3135 out:
3136 return err;
3137 }
3138
3139 /*
3140 * Write to a file through the page cache.
3141 *
3142 * We currently put everything into the page cache prior to writing it.
3143 * This is not a problem when writing full pages. With partial pages,
3144 * however, we first have to read the data into the cache, then
3145 * dirty the page, and finally schedule it for writing. Alternatively, we
3146 * could write-through just the portion of data that would go into that
3147 * page, but that would kill performance for applications that write data
3148 * line by line, and it's prone to race conditions.
3149 *
3150 * Note that this routine doesn't try to keep track of dirty pages. Each
3151 * file system has to do this all by itself, unfortunately.
3152 * okir@monad.swb.de
3153 */
3154 ssize_t
3155 do_generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
3156 {
3157 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
3158 struct inode *inode = mapping->host;
3159 loff_t pos;
3160 struct page *page, *cached_page;
3161 ssize_t written;
3162 long status = 0;
3163 ssize_t err;
3164 unsigned bytes;
3165
3166 cached_page = NULL;
3167 pos = *ppos;
3168 written = 0;
3169
3170 err = precheck_file_write(file, inode, &count, &pos);
3171 if (err != 0 || count == 0)
3172 goto out;
3173
3174 remove_suid(inode);
3175 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
3176 mark_inode_dirty_sync(inode);
3177
3178 do {
3179 unsigned long index, offset;
3180 long page_fault;
3181 char *kaddr;
3182
3183 /*
3184 * Try to find the page in the cache. If it isn't there,
3185 * allocate a free page.
3186 */
3187 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
3188 index = pos >> PAGE_CACHE_SHIFT;
3189 bytes = PAGE_CACHE_SIZE - offset;
3190 if (bytes > count)
3191 bytes = count;
3192
3193 /*
3194 * Bring in the user page that we will copy from _first_.
3195 * Otherwise there's a nasty deadlock on copying from the
3196 * same page as we're writing to, without it being marked
3197 * up-to-date.
3198 */
3199 { volatile unsigned char dummy;
3200 __get_user(dummy, buf);
3201 __get_user(dummy, buf+bytes-1);
3202 }
3203
3204 status = -ENOMEM; /* we'll assign it later anyway */
3205 page = __grab_cache_page(mapping, index, &cached_page);
3206 if (!page)
3207 break;
3208
3209 /* We have exclusive IO access to the page.. */
3210 if (!PageLocked(page)) {
3211 PAGE_BUG(page);
3212 }
3213
3214 kaddr = kmap(page);
3215 status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
3216 if (status)
3217 goto sync_failure;
3218 page_fault = __copy_from_user(kaddr+offset, buf, bytes);
3219 flush_dcache_page(page);
3220 status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
3221 if (page_fault)
3222 goto fail_write;
3223 if (!status)
3224 status = bytes;
3225
3226 if (status >= 0) {
3227 written += status;
3228 count -= status;
3229 pos += status;
3230 buf += status;
3231 }
3232 unlock:
3233 kunmap(page);
3234 /* Mark it unlocked again and drop the page.. */
3235 SetPageReferenced(page);
3236 UnlockPage(page);
3237 page_cache_release(page);
3238
3239 if (status < 0)
3240 break;
3241 } while (count);
3242 done:
3243 *ppos = pos;
3244
3245 if (cached_page)
3246 page_cache_release(cached_page);
3247
3248 /* For now, when the user asks for O_SYNC, we'll actually
3249 * provide O_DSYNC. */
3250 if (status >= 0) {
3251 if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
3252 status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA);
3253 }
3254
3255 err = written ? written : status;
3256 out:
3257
3258 return err;
3259 fail_write:
3260 status = -EFAULT;
3261 goto unlock;
3262
3263 sync_failure:
3264 /*
3265 * If blocksize < pagesize, prepare_write() may have instantiated a
3266 * few blocks outside i_size. Trim these off again.
3267 */
3268 kunmap(page);
3269 UnlockPage(page);
3270 page_cache_release(page);
3271 if (pos + bytes > inode->i_size)
3272 vmtruncate(inode, inode->i_size);
3273 goto done;
3274 }
3275
3276 ssize_t
3277 do_generic_direct_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
3278 {
3279 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
3280 struct inode *inode = mapping->host;
3281 loff_t pos;
3282 ssize_t written;
3283 long status = 0;
3284 ssize_t err;
3285
3286 pos = *ppos;
3287 written = 0;
3288
3289 err = precheck_file_write(file, inode, &count, &pos);
3290 if (err != 0 || count == 0)
3291 goto out;
3292
3293 if (!(file->f_flags & O_DIRECT))
3294 BUG();
3295
3296 remove_suid(inode);
3297 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
3298 mark_inode_dirty_sync(inode);
3299
3300 written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
3301 if (written > 0) {
3302 loff_t end = pos + written;
3303 if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
3304 inode->i_size = end;
3305 mark_inode_dirty(inode);
3306 }
3307 *ppos = end;
3308 invalidate_inode_pages2(mapping);
3309 }
3310 /*
3311 * Sync the fs metadata but not the minor inode changes and
3312 * of course not the data as we did direct DMA for the IO.
3313 */
3314 if (written >= 0 && (file->f_flags & O_SYNC))
3315 status = generic_osync_inode(inode, OSYNC_METADATA);
3316
3317 err = written ? written : status;
3318 out:
3319 return err;
3320 }
3321
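/*
 * Fallback used by generic_file_write() when an O_DIRECT write bails
 * out with -ENOTBLK (the filesystem could not do direct I/O for this
 * request): redo the write through the page cache under i_sem and
 * then fdatasync() the result, so O_DIRECT callers still see their
 * data flushed.
 */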
3322 static int do_odirect_fallback(struct file *file, struct inode *inode,
3323 const char *buf, size_t count, loff_t *ppos)
3324 {
3325 ssize_t ret;
3326 int err;
3327
3328 down(&inode->i_sem);
3329 ret = do_generic_file_write(file, buf, count, ppos);
3330 if (ret > 0) {
3331 err = do_fdatasync(file);
3332 if (err)
3333 ret = err;
3334 }
3335 up(&inode->i_sem);
3336 return ret;
3337 }
3338
3339 ssize_t
3340 generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
3341 {
3342 struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
3343 ssize_t err;
3344
3345 if ((ssize_t) count < 0)
3346 return -EINVAL;
3347
3348 if (!access_ok(VERIFY_READ, buf, count))
3349 return -EFAULT;
3350
3351 if (file->f_flags & O_DIRECT) {
3352 /* do_generic_direct_write may drop i_sem during the
3353 actual IO */
3354 down_read(&inode->i_alloc_sem);
3355 down(&inode->i_sem);
3356 err = do_generic_direct_write(file, buf, count, ppos);
3357 up(&inode->i_sem);
3358 up_read(&inode->i_alloc_sem);
3359 if (unlikely(err == -ENOTBLK))
3360 err = do_odirect_fallback(file, inode, buf, count, ppos);
3361 } else {
3362 down(&inode->i_sem);
3363 err = do_generic_file_write(file, buf, count, ppos);
3364 up(&inode->i_sem);
3365 }
3366
3367 return err;
3368 }
3369
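/*
 * Rough sizing example for the hash table allocated below (numbers are
 * only illustrative): on a 32-bit machine with 32768 pages (128MB) and
 * 4-byte pointers, htable_size is 128KB, the loop picks order 5, and
 * (PAGE_SIZE << 5) / sizeof(struct page *) = 32768 entries gives
 * page_hash_bits = 15.  If the allocation fails, the order is stepped
 * down until it succeeds.
 */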
3370 void __init page_cache_init(unsigned long mempages)
3371 {
3372 unsigned long htable_size, order;
3373
3374 htable_size = mempages;
3375 htable_size *= sizeof(struct page *);
3376 for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
3377 ;
3378
3379 do {
3380 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
3381
3382 page_hash_bits = 0;
3383 while((tmp >>= 1UL) != 0UL)
3384 page_hash_bits++;
3385
3386 page_hash_table = (struct page **)
3387 __get_free_pages(GFP_ATOMIC, order);
3388 } while(page_hash_table == NULL && --order > 0);
3389
3390 printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
3391 (1 << page_hash_bits), order, (PAGE_SIZE << order));
3392 if (!page_hash_table)
3393 panic("Failed to allocate page hash table\n");
3394 memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
3395 }
3396