/*
 *  linux/mm/vmscan.c
 *
 *  The pageout daemon: decides which pages to evict (swap out) and
 *  does the actual work of freeing them.
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */

#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/smp_lock.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/file.h>

#include <asm/pgalloc.h>

/*
 * "vm_passes" is the number of vm passes before failing the
 * memory balancing. Take into account that 3 passes are needed
 * for a flush/wait/free cycle and that we only scan
 * 1/vm_cache_scan_ratio of the inactive list at each pass.
 */
int vm_passes = 60;
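
/*
 * Rough worked example with the defaults above: 60 passes, each
 * scanning 1/6 of the inactive list, amount to about ten full sweeps
 * of the inactive list (ignoring pages added or rotated while we
 * scan) before try_to_free_pages_zone() gives up.
 */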

/*
 * "vm_cache_scan_ratio" is how much of the inactive LRU queue we will scan
 * in one go. A value of 6 for vm_cache_scan_ratio implies that we'll
 * scan 1/6 of the inactive lists during a normal aging round.
 */
int vm_cache_scan_ratio = 6;
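
/*
 * For example (hypothetical numbers): a classzone with 36000 active
 * and 24000 inactive pages and the default ratio of 6 gives
 * shrink_cache() a scanning budget of (36000 + 24000) / 6 = 10000
 * pages per call.
 */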

/*
 * "vm_mapped_ratio" controls the pageout rate: the smaller it is,
 * the earlier we'll start to page out.
 */
int vm_mapped_ratio = 100;
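
/*
 * For example: with the default of 100 and a typical request of
 * SWAP_CLUSTER_MAX pages (32 on most configurations), shrink_cache()
 * tolerates about 100 * 32 = 3200 mapped pages on the inactive list
 * before falling back to reaping the slab/VFS caches and calling
 * swap_out() to unmap process pages.
 */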

/*
 * "vm_lru_balance_ratio" controls the balance between active and
 * inactive cache. The bigger vm_lru_balance_ratio is, the easier the
 * active cache will grow, because we'll rotate the active list
 * slowly. A value of 2 means we'll go towards a balance of
 * 1/3 of the cache being inactive.
 */
int vm_lru_balance_ratio = 2;
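
/*
 * How the 1/3 figure follows: refill_inactive() below deactivates
 * roughly nr_pages * nr_active / (nr_inactive * vm_lru_balance_ratio)
 * pages per call, so the deactivation pressure drops off once
 * nr_active falls towards vm_lru_balance_ratio * nr_inactive.  With
 * the default of 2 the equilibrium is active:inactive = 2:1, i.e.
 * about 1/3 of the cache sitting on the inactive list.
 */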

/*
 * "vm_vfs_scan_ratio" is what proportion of the VFS queues we will scan
 * in one go. A value of 6 for vm_vfs_scan_ratio implies that 1/6th of
 * the unused-inode, dentry and dquot caches will be freed during a normal
 * aging round.
 */
int vm_vfs_scan_ratio = 6;

/*
 * "vm_anon_lru" selects whether to immediately insert anon pages in
 * the lru. Immediately means as soon as they're allocated during the
 * page faults.
 *
 * If this is set to 0, they're inserted only after the first
 * swapout.
 *
 * Having anon pages immediately inserted in the lru allows the
 * VM to know better when it's worthwhile to start swapping
 * anonymous ram: it will start to swap earlier and it should
 * swap more smoothly and faster, but it will decrease scalability
 * on >16-way machines by an order of magnitude. Big SMP/NUMA
 * definitely can't take a hit on a global spinlock at
 * every anon page allocation. So this is off by default.
 *
 * Low-ram machines that swap all the time want to turn
 * this on (i.e. set it to 1).
 */
int vm_anon_lru = 0;
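
/*
 * All of the knobs above are plain integers, so - assuming they are
 * exported as sysctls in kernel/sysctl.c, as is usual for this VM -
 * they can be tuned at runtime, e.g. something like
 *	echo 1 > /proc/sys/vm/vm_anon_lru
 * on a small-memory machine that swaps constantly.
 */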

/*
 * The swap-out function returns 1 if it successfully
 * scanned all the pages it was asked to (`count').
 * It returns zero if it couldn't do anything.
 *
 * rss may decrease because pages are shared, but this
 * doesn't count as having freed a page.
 */
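
/*
 * Note that try_to_swap_out() below returns 1 only when it actually
 * dropped a mapping and the page looks freeable; swap_out_pmd()
 * subtracts that return value from `count', so only really-freed
 * pages make progress towards the goal.
 */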

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct* vma, unsigned long address, pte_t * page_table, struct page *page, zone_t * classzone)
{
	pte_t pte;
	swp_entry_t entry;

	/* Don't look at this pte if it's been accessed recently. */
	if ((vma->vm_flags & VM_LOCKED) || ptep_test_and_clear_young(page_table)) {
		mark_page_accessed(page);
		return 0;
	}

	/* Don't bother unmapping pages that are active */
	if (PageActive(page))
		return 0;

	/* Don't bother replenishing zones not under pressure.. */
	if (!memclass(page_zone(page), classzone))
		return 0;

	if (TryLockPage(page))
		return 0;

	/* From this point on, the odds are that we're going to
	 * nuke this pte, so read and clear the pte.  This hook
	 * is needed on CPUs which update the accessed and dirty
	 * bits in hardware.
	 */
	flush_cache_page(vma, address);
	pte = ptep_get_and_clear(page_table);
	flush_tlb_page(vma, address);

	if (pte_dirty(pte))
		set_page_dirty(page);

	/*
	 * Is the page already in the swap cache? If so, then
	 * we can just drop our reference to it without doing
	 * any IO - it's already up-to-date on disk.
	 */
	if (PageSwapCache(page)) {
		entry.val = page->index;
		swap_duplicate(entry);
set_swap_pte:
		set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
		mm->rss--;
		UnlockPage(page);
		{
			int freeable = page_count(page) - !!page->buffers <= 2;
			page_cache_release(page);
			return freeable;
		}
	}

	/*
	 * Is it a clean page? Then it must be recoverable
	 * by just paging it in again, and we can just drop
	 * it..  or if it's dirty but has backing store,
	 * just mark the page dirty and drop it.
	 *
	 * However, this won't actually free any real
	 * memory, as the page will just be in the page cache
	 * somewhere, and as such we should just continue
	 * our scan.
	 *
	 * Basically, this just makes it possible for us to do
	 * some real work in the future in "refill_inactive()".
	 */
	if (page->mapping)
		goto drop_pte;
	if (!PageDirty(page))
		goto drop_pte;

	/*
	 * Anonymous buffercache pages can be left behind by
	 * concurrent truncate and pagefault.
	 */
	if (page->buffers)
		goto preserve;

	/*
	 * This is a dirty, swappable page.  First of all,
	 * get a suitable swap entry for it, and make sure
	 * we have the swap cache set up to associate the
	 * page with that swap entry.
	 */
	for (;;) {
		entry = get_swap_page();
		if (!entry.val)
			break;
		/* Add it to the swap cache and mark it dirty
		 * (adding to the page cache will clear the dirty
		 * and uptodate bits, so we need to do it again)
		 */
		if (add_to_swap_cache(page, entry) == 0) {
			SetPageUptodate(page);
			set_page_dirty(page);
			goto set_swap_pte;
		}
		/* Raced with "speculative" read_swap_cache_async */
		swap_free(entry);
	}

	/* No swap space left */
preserve:
	set_pte(page_table, pte);
	UnlockPage(page);
	return 0;
}

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_pmd(struct mm_struct * mm, struct vm_area_struct * vma, pmd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
{
	pte_t * pte;
	unsigned long pmd_end;

	if (pmd_none(*dir))
		return count;
	if (pmd_bad(*dir)) {
		pmd_ERROR(*dir);
		pmd_clear(dir);
		return count;
	}

	pte = pte_offset(dir, address);

	pmd_end = (address + PMD_SIZE) & PMD_MASK;
	if (end > pmd_end)
		end = pmd_end;

	do {
		if (pte_present(*pte)) {
			struct page *page = pte_page(*pte);

			if (VALID_PAGE(page) && !PageReserved(page)) {
				count -= try_to_swap_out(mm, vma, address, pte, page, classzone);
				if (!count) {
					address += PAGE_SIZE;
					break;
				}
			}
		}
		address += PAGE_SIZE;
		pte++;
	} while (address && (address < end));
	mm->swap_address = address;
	return count;
}

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_pgd(struct mm_struct * mm, struct vm_area_struct * vma, pgd_t *dir, unsigned long address, unsigned long end, int count, zone_t * classzone)
{
	pmd_t * pmd;
	unsigned long pgd_end;

	if (pgd_none(*dir))
		return count;
	if (pgd_bad(*dir)) {
		pgd_ERROR(*dir);
		pgd_clear(dir);
		return count;
	}

	pmd = pmd_offset(dir, address);

	pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
	if (pgd_end && (end > pgd_end))
		end = pgd_end;

	do {
		count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone);
		if (!count)
			break;
		address = (address + PMD_SIZE) & PMD_MASK;
		pmd++;
	} while (address && (address < end));
	return count;
}

/* mm->page_table_lock is held. mmap_sem is not held */
static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long address, int count, zone_t * classzone)
{
	pgd_t *pgdir;
	unsigned long end;

	/* Don't swap out areas which are reserved */
	if (vma->vm_flags & VM_RESERVED)
		return count;

	pgdir = pgd_offset(mm, address);

	end = vma->vm_end;
	BUG_ON(address >= end);
	do {
		count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone);
		if (!count)
			break;
		address = (address + PGDIR_SIZE) & PGDIR_MASK;
		pgdir++;
	} while (address && (address < end));
	return count;
}

/* Placeholder for swap_out(): may be updated by fork.c:mmput() */
struct mm_struct *swap_mm = &init_mm;

/*
 * Returns remaining count of pages to be swapped out by followup call.
 */
static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
{
	unsigned long address;
	struct vm_area_struct* vma;

	/*
	 * Find the proper vm-area after freezing the vma chain
	 * and ptes.
	 */
	spin_lock(&mm->page_table_lock);
	address = mm->swap_address;
	if (address == TASK_SIZE || swap_mm != mm) {
		/* We raced: don't count this mm but try again */
		++*mmcounter;
		goto out_unlock;
	}
	vma = find_vma(mm, address);
	if (vma) {
		if (address < vma->vm_start)
			address = vma->vm_start;

		for (;;) {
			count = swap_out_vma(mm, vma, address, count, classzone);
			vma = vma->vm_next;
			if (!vma)
				break;
			if (!count)
				goto out_unlock;
			address = vma->vm_start;
		}
	}
	/* Indicate that we reached the end of address space */
	mm->swap_address = TASK_SIZE;

out_unlock:
	spin_unlock(&mm->page_table_lock);
	return count;
}

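/*
 * swap_out() walks the global mmlist round-robin, starting from the
 * swap_mm placeholder above, and keeps unmapping pages (restricted to
 * the classzone) until a whole cluster of pages has been freed or the
 * mm list has been scanned twice over.  It returns 1 on success and 0
 * if no progress could be made.
 */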
static int FASTCALL(swap_out(zone_t * classzone));
static int fastcall swap_out(zone_t * classzone)
{
	int counter, nr_pages = SWAP_CLUSTER_MAX;
	struct mm_struct *mm;

	counter = mmlist_nr << 1;
	do {
		if (unlikely(current->need_resched)) {
			__set_current_state(TASK_RUNNING);
			schedule();
		}

		spin_lock(&mmlist_lock);
		mm = swap_mm;
		while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
			mm->swap_address = 0;
			mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
			if (mm == swap_mm)
				goto empty;
			swap_mm = mm;
		}

		/* Make sure the mm doesn't disappear when we drop the lock.. */
		atomic_inc(&mm->mm_users);
		spin_unlock(&mmlist_lock);

		nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);

		mmput(mm);

		if (!nr_pages)
			return 1;
	} while (--counter >= 0);

	return 0;

empty:
	spin_unlock(&mmlist_lock);
	return 0;
}

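/*
 * shrink_cache() scans up to max_scan pages from the tail of the
 * global inactive list, trying to clean and free pages belonging to
 * the classzone.  It returns the number of pages still to be freed
 * (so 0 means the goal was reached).  When it keeps running into
 * mapped pages it falls back to reaping the slab and VFS caches and
 * to unmapping process pages via swap_out().
 */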
static void FASTCALL(refill_inactive(int nr_pages, zone_t * classzone));
static int FASTCALL(shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout));
static int fastcall shrink_cache(int nr_pages, zone_t * classzone, unsigned int gfp_mask, int * failed_swapout)
{
	struct list_head * entry;
	int max_scan = (classzone->nr_inactive_pages + classzone->nr_active_pages) / vm_cache_scan_ratio;
	int max_mapped = vm_mapped_ratio * nr_pages;

	while (max_scan && classzone->nr_inactive_pages && (entry = inactive_list.prev) != &inactive_list) {
		struct page * page;

		if (unlikely(current->need_resched)) {
			spin_unlock(&pagemap_lru_lock);
			__set_current_state(TASK_RUNNING);
			schedule();
			spin_lock(&pagemap_lru_lock);
			continue;
		}

		page = list_entry(entry, struct page, lru);

		BUG_ON(!PageLRU(page));
		BUG_ON(PageActive(page));

		list_del(entry);
		list_add(entry, &inactive_list);

		/*
		 * Zero page counts can happen because we unlink the pages
		 * _after_ decrementing the usage count..
		 */
		if (unlikely(!page_count(page)))
			continue;

		if (!memclass(page_zone(page), classzone))
			continue;

		max_scan--;

		/* Racy check to avoid trylocking when not worthwhile */
		if (!page->buffers && (page_count(page) != 1 || !page->mapping))
			goto page_mapped;

		/*
		 * The page is locked. IO in progress?
		 * Move it to the back of the list.
		 */
		if (unlikely(TryLockPage(page))) {
			if (PageLaunder(page) && (gfp_mask & __GFP_FS)) {
				page_cache_get(page);
				spin_unlock(&pagemap_lru_lock);
				wait_on_page(page);
				page_cache_release(page);
				spin_lock(&pagemap_lru_lock);
			}
			continue;
		}

		if (PageDirty(page) && is_page_cache_freeable(page) && page->mapping) {
			/*
			 * It is not critical here to write it only if
			 * the page is unmapped because any direct writer
			 * like O_DIRECT would set the PG_dirty bitflag
			 * on the physical page after having successfully
			 * pinned it and after the I/O to the page is finished,
			 * so the direct writes to the page cannot get lost.
			 */
			int (*writepage)(struct page *);

			writepage = page->mapping->a_ops->writepage;
			if ((gfp_mask & __GFP_FS) && writepage) {
				ClearPageDirty(page);
				SetPageLaunder(page);
				page_cache_get(page);
				spin_unlock(&pagemap_lru_lock);

				writepage(page);
				page_cache_release(page);

				spin_lock(&pagemap_lru_lock);
				continue;
			}
		}

		/*
		 * If the page has buffers, try to free the buffer mappings
		 * associated with this page. If we succeed we try to free
		 * the page as well.
		 */
		if (page->buffers) {
			spin_unlock(&pagemap_lru_lock);

			/* avoid freeing a locked page */
			page_cache_get(page);

			if (try_to_release_page(page, gfp_mask)) {
				if (!page->mapping) {
					/*
					 * We must not allow an anon page
					 * with no buffers to be visible on
					 * the LRU, so we unlock the page after
					 * taking the lru lock
					 */
					spin_lock(&pagemap_lru_lock);
					UnlockPage(page);
					__lru_cache_del(page);

					/* effectively free the page here */
					page_cache_release(page);

					if (--nr_pages)
						continue;
					break;
				} else {
					/*
					 * The page is still in the pagecache, so undo
					 * the reference we took before try_to_release_page:
					 * we've not finished with it and can now try
					 * the next step.
					 */
					page_cache_release(page);

					spin_lock(&pagemap_lru_lock);
				}
			} else {
				/* failed to drop the buffers so stop here */
				UnlockPage(page);
				page_cache_release(page);

				spin_lock(&pagemap_lru_lock);
				continue;
			}
		}

		spin_lock(&pagecache_lock);

		/*
		 * This is the non-racy check for busy page.
		 * It is critical to check PageDirty _after_ we made sure
		 * the page is freeable so not in use by anybody.
		 * At this point we're guaranteed that page->buffers is NULL,
		 * nobody can refill page->buffers under us because we still
		 * hold the page lock.
		 */
		if (!page->mapping || page_count(page) > 1) {
			spin_unlock(&pagecache_lock);
			UnlockPage(page);
page_mapped:
			if (--max_mapped < 0) {
				spin_unlock(&pagemap_lru_lock);

				nr_pages -= kmem_cache_reap(gfp_mask);
				if (nr_pages <= 0)
					goto out;

				shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
				shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
#ifdef CONFIG_QUOTA
				shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
#endif

				if (!*failed_swapout)
					*failed_swapout = !swap_out(classzone);

				max_mapped = nr_pages * vm_mapped_ratio;

				spin_lock(&pagemap_lru_lock);
				refill_inactive(nr_pages, classzone);
			}
			continue;

		}
		smp_rmb();
		if (PageDirty(page)) {
			spin_unlock(&pagecache_lock);
			UnlockPage(page);
			continue;
		}

		__lru_cache_del(page);

		/* point of no return */
		if (likely(!PageSwapCache(page))) {
			__remove_inode_page(page);
			spin_unlock(&pagecache_lock);
		} else {
			swp_entry_t swap;
			swap.val = page->index;
			__delete_from_swap_cache(page);
			spin_unlock(&pagecache_lock);
			swap_free(swap);
		}

		UnlockPage(page);

		/* effectively free the page here */
		page_cache_release(page);

		if (--nr_pages)
			continue;
		break;
	}
	spin_unlock(&pagemap_lru_lock);

 out:
	return nr_pages;
}

/*
 * This moves pages from the active list to
 * the inactive list.
 *
 * We move them the other way when we see the
 * reference bit on the page.
 */
static void fastcall refill_inactive(int nr_pages, zone_t * classzone)
{
	struct list_head * entry;
	unsigned long ratio;

	ratio = (unsigned long) nr_pages * classzone->nr_active_pages / (((unsigned long) classzone->nr_inactive_pages * vm_lru_balance_ratio) + 1);

	entry = active_list.prev;
	while (ratio && entry != &active_list) {
		struct page * page;

		page = list_entry(entry, struct page, lru);
		entry = entry->prev;
		if (PageTestandClearReferenced(page)) {
			list_del(&page->lru);
			list_add(&page->lru, &active_list);
			continue;
		}

		ratio--;

		del_page_from_active_list(page);
		add_page_to_inactive_list(page);
		SetPageReferenced(page);
	}

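	/*
	 * Rotate the active list so that the next invocation resumes
	 * scanning where this one stopped, instead of rescanning the
	 * same tail pages: splice the list head back in right before
	 * the last entry we looked at.
	 */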
	if (entry != &active_list) {
		list_del(&active_list);
		list_add(&active_list, entry);
	}
}

static int FASTCALL(shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout));
static int fastcall shrink_caches(zone_t * classzone, unsigned int gfp_mask, int nr_pages, int * failed_swapout)
{
	nr_pages -= kmem_cache_reap(gfp_mask);
	if (nr_pages <= 0)
		goto out;

	spin_lock(&pagemap_lru_lock);
	refill_inactive(nr_pages, classzone);

	nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, failed_swapout);

out:
	return nr_pages;
}

static int check_classzone_need_balance(zone_t * classzone);

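/*
 * Main entry point for direct reclaim against one classzone: run up to
 * vm_passes rounds of shrink_caches() plus VFS cache shrinking and
 * swap_out().  Returns 1 as soon as a round frees enough pages; if
 * every round fails, either the OOM killer is invoked or we give up
 * and return 0 (init alone keeps retrying while the classzone still
 * needs balancing).
 */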
int fastcall try_to_free_pages_zone(zone_t *classzone, unsigned int gfp_mask)
{
	gfp_mask = pf_gfp_mask(gfp_mask);

	for (;;) {
		int tries = vm_passes;
		int failed_swapout = !(gfp_mask & __GFP_IO);
		int nr_pages = SWAP_CLUSTER_MAX;

		do {
			nr_pages = shrink_caches(classzone, gfp_mask, nr_pages, &failed_swapout);
			if (nr_pages <= 0)
				return 1;
			shrink_dcache_memory(vm_vfs_scan_ratio, gfp_mask);
			shrink_icache_memory(vm_vfs_scan_ratio, gfp_mask);
#ifdef CONFIG_QUOTA
			shrink_dqcache_memory(vm_vfs_scan_ratio, gfp_mask);
#endif
			if (!failed_swapout)
				failed_swapout = !swap_out(classzone);
		} while (--tries);

#ifdef	CONFIG_OOM_KILLER
		out_of_memory();
#else
		if (likely(current->pid != 1))
			break;
		if (!check_classzone_need_balance(classzone))
			break;

		__set_current_state(TASK_RUNNING);
		yield();
#endif
	}

	return 0;
}

int fastcall try_to_free_pages(unsigned int gfp_mask)
{
	pg_data_t *pgdat;
	zonelist_t *zonelist;
	unsigned long pf_free_pages;
	int error = 0;

	pf_free_pages = current->flags & PF_FREE_PAGES;
	current->flags &= ~PF_FREE_PAGES;

	for_each_pgdat(pgdat) {
		zonelist = pgdat->node_zonelists + (gfp_mask & GFP_ZONEMASK);
		error |= try_to_free_pages_zone(zonelist->zones[0], gfp_mask);
	}

	current->flags |= pf_free_pages;
	return error;
}

DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);

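/*
 * A classzone still needs balancing when neither it nor any of the
 * lower zones it can fall back to has climbed back above the "high"
 * watermark kept for this allocation class.
 */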
static int check_classzone_need_balance(zone_t * classzone)
{
	zone_t * first_zone;
	int class_idx = zone_idx(classzone);

	first_zone = classzone->zone_pgdat->node_zones;
	while (classzone >= first_zone) {
		if (classzone->free_pages > classzone->watermarks[class_idx].high)
			return 0;
		classzone--;
	}
	return 1;
}

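/*
 * Balance one node: scan its zones from highest to lowest and run
 * reclaim on every zone flagged as needing balance.  If a zone makes
 * no progress at all (e.g. we are out of swap), back off for a few
 * seconds so kswapd doesn't spin.  Returns 1 if more balancing is
 * still needed.
 */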
static int kswapd_balance_pgdat(pg_data_t * pgdat)
{
	int need_more_balance = 0, i;
	zone_t * zone;

	for (i = pgdat->nr_zones-1; i >= 0; i--) {
		zone = pgdat->node_zones + i;
		if (unlikely(current->need_resched))
			schedule();
		if (!zone->need_balance || !zone->size)
			continue;
		if (!try_to_free_pages_zone(zone, GFP_KSWAPD)) {
			zone->need_balance = 0;
			__set_current_state(TASK_INTERRUPTIBLE);
			schedule_timeout(HZ*5);
			continue;
		}
		if (check_classzone_need_balance(zone))
			need_more_balance = 1;
		else
			zone->need_balance = 0;
	}

	return need_more_balance;
}

static void kswapd_balance(void)
{
	int need_more_balance;
	pg_data_t * pgdat;

	do {
		need_more_balance = 0;

		for_each_pgdat(pgdat)
			need_more_balance |= kswapd_balance_pgdat(pgdat);
	} while (need_more_balance);
}

static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
{
	zone_t * zone;
	int i;

	for (i = pgdat->nr_zones-1; i >= 0; i--) {
		zone = pgdat->node_zones + i;
		if (!zone->need_balance || !zone->size)
			continue;
		return 0;
	}

	return 1;
}

static int kswapd_can_sleep(void)
{
	pg_data_t * pgdat;

	for_each_pgdat(pgdat) {
		if (!kswapd_can_sleep_pgdat(pgdat))
			return 0;
	}

	return 1;
}

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *unused)
{
	struct task_struct *tsk = current;
	DECLARE_WAITQUEUE(wait, tsk);

	daemonize();
	strcpy(tsk->comm, "kswapd");
	sigfillset(&tsk->blocked);

	/*
	 * Tell the memory management that we're a "memory allocator",
	 * and that if we need more memory we should get access to it
	 * regardless (see "__alloc_pages()"). "kswapd" should
	 * never get caught in the normal page freeing logic.
	 *
	 * (Kswapd normally doesn't need memory anyway, but sometimes
	 * you need a small amount of memory in order to be able to
	 * page out something else, and this flag essentially protects
	 * us from recursively trying to free more memory as we're
	 * trying to free the first piece of memory in the first place).
	 */
	tsk->flags |= PF_MEMALLOC;

	/*
	 * Kswapd main loop.
	 */
	for (;;) {
		__set_current_state(TASK_INTERRUPTIBLE);
		add_wait_queue(&kswapd_wait, &wait);

		mb();
		if (kswapd_can_sleep())
			schedule();

		__set_current_state(TASK_RUNNING);
		remove_wait_queue(&kswapd_wait, &wait);

		/*
		 * If we actually get into a low-memory situation,
		 * the processes needing more memory will wake us
		 * up on a more timely basis.
		 */
		kswapd_balance();
		run_task_queue(&tq_disk);
	}
}

static int __init kswapd_init(void)
{
	printk("Starting kswapd\n");
	swap_setup();
	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
	return 0;
}

module_init(kswapd_init)