/*
 * linux/mm/vmscan.c
 *
 * The pageout daemon decides which pages to evict (swap out) and
 * does the actual work of freeing them.
 *
 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
 *
 * Swap reorganised 29.12.95, Stephen Tweedie.
 * kswapd added: 7.1.96 sct
 * Removed kswapd_ctl limits, and swap out as many pages as needed
 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 * Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>

#include <asm/pgalloc.h>

/*
 * "vm_passes" is the number of vm passes before failing the
 * memory balancing. Take into account that 3 passes are needed
 * for a flush/wait/free cycle and that we only scan 1/vm_cache_scan_ratio
 * of the inactive list at each pass.
 */
int vm_passes = 60;
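
/*
 * Note: vm_passes bounds the "tries" loop in try_to_free_pages_zone()
 * below; each pass runs shrink_caches(), a VFS-cache shrink and, when
 * __GFP_IO is set and swapout has not already failed, a swap_out() round.
 */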

/*
 * "vm_cache_scan_ratio" is how much of the inactive LRU queue we will scan
 * in one go. A value of 6 for vm_cache_scan_ratio implies that we'll
 * scan 1/6 of the inactive lists during a normal aging round.
 */
int vm_cache_scan_ratio = 6;
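
/*
 * Note: shrink_cache() derives its scan budget from this ratio:
 *
 *	max_scan = (nr_inactive_pages + nr_active_pages) / vm_cache_scan_ratio
 *
 * so one call examines at most that many inactive pages of the classzone.
 */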

/*
 * "vm_mapped_ratio" controls the pageout rate: the smaller it is,
 * the earlier we'll start to page out.
 */
int vm_mapped_ratio = 100;
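
/*
 * Note: shrink_cache() starts with max_mapped = vm_mapped_ratio * nr_pages;
 * once it has met that many pages it cannot drop directly, it falls back
 * to reaping the slab caches, trimming the VFS caches and calling
 * swap_out() to unmap process pages.
 */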

/*
 * "vm_lru_balance_ratio" controls the balance between active and
 * inactive cache. The bigger vm_lru_balance_ratio is, the easier the
 * active cache will grow, because we'll rotate the active list
 * slowly. A value of 2 means we'll go towards a balance of
 * 1/3 of the cache being inactive.
 */
int vm_lru_balance_ratio = 2;
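
/*
 * Note: refill_inactive() deactivates roughly
 *
 *	nr_pages * nr_active_pages / (nr_inactive_pages * vm_lru_balance_ratio + 1)
 *
 * pages per call, so deactivation slows relative to the request as the
 * inactive list approaches nr_active_pages / vm_lru_balance_ratio entries;
 * with the default of 2 that is about 1/3 of the cache being inactive.
 */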

/*
 * "vm_vfs_scan_ratio" is what proportion of the VFS queues we will scan
 * in one go. A value of 6 for vm_vfs_scan_ratio implies that 1/6th of
 * the unused-inode, dentry and dquot caches will be freed during a normal
 * aging round.
 */
int vm_vfs_scan_ratio = 6;
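
/*
 * Note: this ratio is passed down unchanged as the first argument of
 * shrink_dcache_memory(), shrink_icache_memory() and (under CONFIG_QUOTA)
 * shrink_dqcache_memory() by shrink_cache() and try_to_free_pages_zone().
 */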

/*
 * "vm_anon_lru" selects whether to immediately insert anon pages in the
 * lru. Immediately means as soon as they're allocated during the
 * page faults.
 *
 * If this is set to 0, they're inserted only after the first
 * swapout.
 *
 * Having anon pages immediately inserted in the lru allows the
 * VM to know better when it's worthwhile to start swapping
 * anonymous ram, it will start to swap earlier and it should
 * swap smoother and faster, but it will decrease scalability
 * on >16-way machines by an order of magnitude. Big SMP/NUMA
 * definitely can't take a hit on a global spinlock at
 * every anon page allocation. So this is off by default.
 *
 * Low-RAM machines that swap all the time want to turn
 * this on (i.e. set to 1).
 */
int vm_anon_lru = 0;

/*
 * The swap-out function returns 1 if it successfully
 * scanned all the pages it was asked to (`count').
 * It returns zero if it couldn't do anything.
 *
 * rss may decrease because pages are shared, but this
 * doesn't count as having freed a page.
 */

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone)
{
	pte_t pte;
	swp_entry_t entry;

	/* Don't look at this pte if it's been accessed recently. */
	if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) {
		mark_page_accessed(page);
		return 0;
	}

	/* Don't bother unmapping pages that are active */
	if (PageActive(page))
		return 0;

	/* Don't bother replenishing zones not under pressure.. */
	if (!memclass(page_zone(page), classzone))
		return 0;

	if (TryLockPage(page))
		return 0;

	/* From this point on, the odds are that we're going to
	 * nuke this pte, so read and clear the pte. This hook
	 * is needed on CPUs which update the accessed and dirty
	 * bits in hardware.
	 */
	flush_cache_page(vma, address);
	pte = ptep_get_and_clear(page_table);
	flush_tlb_page(vma, address);

	if (pte_dirty(pte))
		set_page_dirty(page);

	/*
	 * Is the page already in the swap cache? If so, then
	 * we can just drop our reference to it without doing
	 * any IO - it's already up-to-date on disk.
	 */
	if (PageSwapCache(page)) {
		entry.val = page->index;
		swap_duplicate(entry);
set_swap_pte:
		set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
		mm->rss--;
		UnlockPage(page);
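		/*
		 * Note: report the page as freeable if, once we drop the
		 * reference held for this pte, only the page/swap-cache
		 * reference (plus one for any attached buffers) remains,
		 * i.e. shrink_cache() has a chance of freeing it.
		 */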
		{
			int freeable = page_count(page) - !!page->buffers <= 2;
			page_cache_release(page);
			return freeable;
		}
	}

	/*
	 * Is it a clean page? Then it must be recoverable
	 * by just paging it in again, and we can just drop
	 * it.. or if it's dirty but has backing store,
	 * just mark the page dirty and drop it.
	 *
	 * However, this won't actually free any real
	 * memory, as the page will just be in the page cache
	 * somewhere, and as such we should just continue
	 * our scan.
	 *
	 * Basically, this just makes it possible for us to do
	 * some real work in the future in "refill_inactive()".
	 */
	if (page->mapping)
		goto drop_pte;
	if (!PageDirty(page))
		goto drop_pte;

	/*
	 * Anonymous buffercache pages can be left behind by
	 * concurrent truncate and pagefault.
	 */
	if (page->buffers)
		goto preserve;

	/*
	 * This is a dirty, swappable page. First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	for (;;) {
		entry = get_swap_page();
		if (!entry.val)
			break;
		/* Add it to the swap cache and mark it dirty
		 * (adding to the page cache will clear the dirty
		 * and uptodate bits, so we need to do it again)
		 */
		if (add_to_swap_cache(page, entry) == 0) {
			SetPageUptodate(page);
			set_page_dirty(page);
			goto set_swap_pte;
		}
		/* Raced with "speculative" read_swap_cache_async */
		swap_free(entry);
	}

	/* No swap space left */
preserve:
	set_pte(page_table, pte);
	UnlockPage(page);
	return 0;
}

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
{
	pte_t * pte;
	unsigned long pmd_end;

	if (pmd_none(*dir))
		return count;
	if (pmd_bad(*dir)) {
		pmd_ERROR(*dir);
		pmd_clear(dir);
		return count;
	}

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;

	do {
		if (pte_present(*pte)) {
			struct page *page = pte_page(*pte);

			if (VALID_PAGE(page) && !PageReserved(page)) {
				count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
				if (!count) {
					address += PAGE_SIZE;
					break;
				}
			}
		}
		address += PAGE_SIZE;
		pte++;
	} while (address && (address < end));
	mm->swap_address = address;
	return count;
}

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
{
	pmd_t * pmd;
	unsigned long pgd_end;

	if (pgd_none(*dir))
		return count;
	if (pgd_bad(*dir)) {
		pgd_ERROR(*dir);
		pgd_clear(dir);
		return count;
	}

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (pgd_end && (end > pgd_end))
		end = pgd_end;

	do {
		count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
		if (!count)
			break;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
	return count;
}

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone)
{
	pgd_t *pgdir;
	unsigned long end;

	/* Don't swap out areas which are reserved */
	if (vma->vm_flags & VM_RESERVED)
		return count;

	pgdir = pgd_offset(mm, address);

	end = vma->vm_end;
	BUG_ON(address >= end);
	do {
		count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
		if (!count)
			break;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	} while (address && (address < end));
	return count;
}

/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
struct mm_struct *swap_mm = &init_mm;

/*
 * Returns remaining count of pages to be swapped out by followup call.
 */
static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
{
	unsigned long address;
	struct vm_area_struct* vma;

	/*
	 * Find the proper vm-area after freezing the vma chain
	 * and ptes.
	 */
	spin_lock(&mm->page_table_lock);
	address = mm->swap_address;
	if (address == TASK_SIZE || swap_mm != mm) {
		/* We raced: don't count this mm but try again */
		++*mmcounter;
		goto out_unlock;
	}
	vma = find_vma(mm, address);
	if (vma) {
		if (address < vma->vm_start)
			address = vma->vm_start;

		for (;;) {
			count = swap_out_vma(mm, vma, address, count, classzone);
			vma = vma->vm_next;
			if (!vma)
				break;
			if (!count)
				goto out_unlock;
			address = vma->vm_start;
		}
	}
	/* Indicate that we reached the end of address space */
	mm->swap_address = TASK_SIZE;

out_unlock:
	spin_unlock(&mm->page_table_lock);
	return count;
}

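/*
 * swap_out() walks the global mmlist round-robin, starting at swap_mm,
 * and unmaps pages from one mm after another until SWAP_CLUSTER_MAX
 * pages have been made freeable or the list has been walked roughly
 * twice (counter starts at mmlist_nr << 1) without managing that.
 */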
static int FASTCALL(swap_out(zone_t * classzone));
static int fastcall swap_out(zone_t * classzone)
{
	int counter, nr_pages = SWAP_CLUSTER_MAX;
	struct mm_struct *mm;

	counter = mmlist_nr << 1;
	do {
		if (unlikely(current->need_resched)) {
			__set_current_state(TASK_RUNNING);
			schedule();
		}

		spin_lock(&mmlist_lock);
		mm = swap_mm;
		while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
			mm->swap_address = 0;
			mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
			if (mm == swap_mm)
				goto empty;
			swap_mm = mm;
		}

		/* Make sure the mm doesn't disappear when we drop the lock.. */
		atomic_inc(&mm->mm_users);
		spin_unlock(&mmlist_lock);

		nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);

		mmput(mm);

		if (!nr_pages)
			return 1;
	} while (--counter >= 0);

	return 0;

empty:
	spin_unlock(&mmlist_lock);
	return 0;
}

static void FASTCALL(refill_inactive(int nr_pages, zone_t * classzone));
static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout));
static int fastcall shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout)
{
	struct list_head * entry;
	int max_scan = (classzone->nr_inactive_pages + classzone->nr_active_pages) / vm_cache_scan_ratio;
	int max_mapped = vm_mapped_ratio * nr_pages;

	while (max_scan && classzone->nr_inactive_pages && (entry = inactive_list.prev) != &inactive_list) {
		struct page * page;

		if (unlikely(current->need_resched)) {
			spin_unlock(&pagemap_lru_lock);
			__set_current_state(TASK_RUNNING);
			schedule();
			spin_lock(&pagemap_lru_lock);
			continue;
		}

		page = list_entry(entry, struct page, lru);

		BUG_ON(!PageLRU(page));
		BUG_ON(PageActive(page));

		list_del(entry);
		list_add(entry, &inactive_list);

		/*
		 * Zero page counts can happen because we unlink the pages
		 * _after_ decrementing the usage count..
		 */
		if (unlikely(!page_count(page)))
			continue;

		if (!memclass(page_zone(page), classzone))
			continue;

		max_scan--;

		/* Racy check to avoid trylocking when not worthwhile */
		if (!page->buffers && (page_count(page) != 1 || !page->mapping))
			goto page_mapped;

		/*
		 * The page is locked. IO in progress?
		 * Move it to the back of the list.
		 */
		if (unlikely(TryLockPage(page))) {
			if (PageLaunder(page) && (gfp_mask & __GFP_FS)) {
				page_cache_get(page);
				spin_unlock(&pagemap_lru_lock);
				wait_on_page(page);
				page_cache_release(page);
				spin_lock(&pagemap_lru_lock);
			}
			continue;
		}

		if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) {
			/*
			 * It is not critical here to write the page only if
			 * it is unmapped, because any direct writer
			 * like O_DIRECT would set the PG_dirty bitflag
			 * on the physical page after having successfully
			 * pinned it and after the I/O to the page is finished,
			 * so the direct writes to the page cannot get lost.
			 */
			int (*writepage)(struct page *);

			writepage = page->mapping->a_ops->writepage;
			if ((gfp_mask & __GFP_FS) && writepage) {
				ClearPageDirty(page);
				SetPageLaunder(page);
				page_cache_get(page);
				spin_unlock(&pagemap_lru_lock);

				writepage(page);
				page_cache_release(page);

				spin_lock(&pagemap_lru_lock);
				continue;
			}
		}

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we try to free
		 * the page as well.
		 */
		if (page->buffers) {
			spin_unlock(&pagemap_lru_lock);

			/* avoid freeing a locked page */
			page_cache_get(page);

			if (try_to_release_page(page, gfp_mask)) {
				if (!page->mapping) {
					/*
					 * We must not allow an anon page
					 * with no buffers to be visible on
					 * the LRU, so we unlock the page after
					 * taking the lru lock
					 */
					spin_lock(&pagemap_lru_lock);
					UnlockPage(page);
					__lru_cache_del(page);

					/* effectively free the page here */
					page_cache_release(page);

					if (--nr_pages)
						continue;
					break;
				} else {
					/*
					 * The page is still in the pagecache, so undo
					 * what we did before try_to_release_page();
					 * we're not finished with it and can now try
					 * the next step.
					 */
					page_cache_release(page);

					spin_lock(&pagemap_lru_lock);
				}
			} else {
				/* failed to drop the buffers so stop here */
				UnlockPage(page);
				page_cache_release(page);

				spin_lock(&pagemap_lru_lock);
				continue;
			}
		}

		spin_lock(&pagecache_lock);

		/*
		 * This is the non-racy check for busy page.
		 * It is critical to check PageDirty _after_ we made sure
		 * the page is freeable so not in use by anybody.
		 * At this point we're guaranteed that page->buffers is NULL,
		 * nobody can refill page->buffers under us because we still
		 * hold the page lock.
		 */
		if (!page->mapping || page_count(page) > 1) {
			spin_unlock(&pagecache_lock);
			UnlockPage(page);
page_mapped:
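			/*
			 * Note: once max_mapped pages that we cannot drop
			 * directly have been seen, give the pagecache a rest:
			 * reap the slab caches, trim the VFS caches, try
			 * swap_out() to unmap process pages, and refill the
			 * inactive list before continuing the scan.
			 */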
			if (--max_mapped < 0) {
				spin_unlock(&pagemap_lru_lock);

				nr_pages -= kmem_cache_reap(gfp_mask);
				if (nr_pages <= 0)
					goto out;

				shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
				shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
#ifdef CONFIG_QUOTA
				shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
#endif

				if (!*failed_swapout)
					*failed_swapout = !swap_out(classzone);

				max_mapped = nr_pages * vm_mapped_ratio;

				spin_lock(&pagemap_lru_lock);
				refill_inactive(nr_pages, classzone);
			}
			continue;

		}
		smp_rmb();
		if (PageDirty(page)) {
			spin_unlock(&pagecache_lock);
			UnlockPage(page);
			continue;
		}

		__lru_cache_del(page);

		/* point of no return */
		if (likely(!PageSwapCache(page))) {
			__remove_inode_page(page);
			spin_unlock(&pagecache_lock);
		} else {
			swp_entry_t swap;
			swap.val = page->index;
			__delete_from_swap_cache(page);
			spin_unlock(&pagecache_lock);
			swap_free(swap);
		}

		UnlockPage(page);

		/* effectively free the page here */
		page_cache_release(page);

		if (--nr_pages)
			continue;
		break;
	}
	spin_unlock(&pagemap_lru_lock);

out:
	return nr_pages;
}

/*
 * This moves pages from the active list to
 * the inactive list.
 *
 * We move them the other way when we see the
 * reference bit on the page.
 */
static void fastcall refill_inactive(int nr_pages, zone_t * classzone)
{
	struct list_head * entry;
	unsigned long ratio;

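	/*
	 * Note: scale the number of pages to deactivate by the current
	 * active/inactive imbalance; the larger vm_lru_balance_ratio is,
	 * the fewer pages we move for a given imbalance (the +1 avoids
	 * a division by zero when the inactive list is empty).
	 */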
	ratio = (unsigned long) nr_pages * classzone->nr_active_pages / (((unsigned long) classzone->nr_inactive_pages * vm_lru_balance_ratio) + 1);

	entry = active_list.prev;
	while (ratio && entry != &active_list) {
		struct page * page;

		page = list_entry(entry, struct page, lru);
		entry = entry->prev;
		if (PageTestandClearReferenced(page)) {
			list_del(&page->lru);
			list_add(&page->lru, &active_list);
			continue;
		}

		ratio--;

		del_page_from_active_list(page);
		add_page_to_inactive_list(page);
		SetPageReferenced(page);
	}

	if (entry != &active_list) {
		list_del(&active_list);
		list_add(&active_list, entry);
	}
}

static int FASTCALL(shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout));
static int fastcall shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout)
{
	nr_pages -= kmem_cache_reap(gfp_mask);
	if (nr_pages <= 0)
		goto out;

	spin_lock(&pagemap_lru_lock);
	refill_inactive(nr_pages, classzone);

	nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, failed_swapout);

out:
	return nr_pages;
}

static int check_classzone_need_balance(zone_t * classzone);

int fastcall try_to_free_pages_zone(zone_t *classzone, unsigned int gfp_mask)
{
	gfp_mask = pf_gfp_mask(gfp_mask);

	for (;;) {
		int tries = vm_passes;
		int failed_swapout = !(gfp_mask & __GFP_IO);
		int nr_pages = SWAP_CLUSTER_MAX;

		do {
			nr_pages = shrink_caches(classzone, gfp_mask, nr_pages, &failed_swapout);
			if (nr_pages <= 0)
				return 1;
			shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
			shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
#ifdef CONFIG_QUOTA
			shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
#endif
			if (!failed_swapout)
				failed_swapout = !swap_out(classzone);
		} while (--tries);

#ifdef CONFIG_OOM_KILLER
		out_of_memory();
#else
		if (likely(current->pid != 1))
			break;
		if (!check_classzone_need_balance(classzone))
			break;

		__set_current_state(TASK_RUNNING);
		yield();
#endif
	}

	return 0;
}

int fastcall try_to_free_pages(unsigned int gfp_mask)
{
	pg_data_t *pgdat;
	zonelist_t *zonelist;
	unsigned long pf_free_pages;
	int error = 0;

	pf_free_pages = current->flags & PF_FREE_PAGES;
	current->flags &= ~PF_FREE_PAGES;

	for_each_pgdat(pgdat) {
		zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK);
		error |= try_to_free_pages_zone(zonelist->zones[0], gfp_mask);
	}

	current->flags |= pf_free_pages;
	return error;
}

DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);

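/*
 * A classzone still needs balancing only if every zone from the classzone
 * down to the lowest zone of its node is at or below the high watermark
 * it keeps for allocations of this class; as soon as one of them has free
 * pages above that watermark, no further balancing is needed.
 */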
static int check_classzone_need_balance(zone_t * classzone)
{
	zone_t * first_zone;
	int class_idx = zone_idx(classzone);

	first_zone = classzone->zone_pgdat->node_zones;
	while (classzone >= first_zone) {
		if (classzone->free_pages > classzone->watermarks[class_idx].high)
			return 0;
		classzone--;
	}
	return 1;
}

static int kswapd_balance_pgdat(pg_data_t * pgdat)
{
	int need_more_balance = 0, i;
	zone_t * zone;

	for (i = pgdat->nr_zones-1; i >= 0; i--) {
		zone = pgdat->node_zones + i;
		if (unlikely(current->need_resched))
			schedule();
		if (!zone->need_balance || !zone->size)
			continue;
		if (!try_to_free_pages_zone(zone, GFP_KSWAPD)) {
			zone->need_balance = 0;
			__set_current_state(TASK_INTERRUPTIBLE);
			schedule_timeout(HZ*5);
			continue;
		}
		if (check_classzone_need_balance(zone))
			need_more_balance = 1;
		else
			zone->need_balance = 0;
	}

	return need_more_balance;
}

static void kswapd_balance(void)
{
	int need_more_balance;
	pg_data_t * pgdat;

	do {
		need_more_balance = 0;

		for_each_pgdat(pgdat)
			need_more_balance |= kswapd_balance_pgdat(pgdat);
	} while (need_more_balance);
}

static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
{
	zone_t * zone;
	int i;

	for (i = pgdat->nr_zones-1; i >= 0; i--) {
		zone = pgdat->node_zones + i;
		if (!zone->need_balance || !zone->size)
			continue;
		return 0;
	}

	return 1;
}

static int kswapd_can_sleep(void)
{
	pg_data_t * pgdat;

	for_each_pgdat(pgdat) {
		if (!kswapd_can_sleep_pgdat(pgdat))
			return 0;
	}

	return 1;
}

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	daemonize();
	strcpy(tsk->comm, "kswapd");
	sigfillset(&tsk->blocked);

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC;

	/*
	 * Kswapd main loop.
	 */
	for (;;) {
		__set_current_state(TASK_INTERRUPTIBLE);
		add_wait_queue(&kswapd_wait, &wait);

		mb();
		if (kswapd_can_sleep())
			schedule();

		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&kswapd_wait, &wait);

		/*
		 * If we actually get into a low-memory situation,
		 * the processes needing more memory will wake us
		 * up on a more timely basis.
		 */
		kswapd_balance();
		run_task_queue(&tq_disk);
	}
}

static int __init kswapd_init(void)
{
	printk("Starting kswapd\n");
	swap_setup();
	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
	return 0;
}

module_init(kswapd_init)
