/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 */

#include <linux/config.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/slab.h>
#include <linux/module.h>

int nr_swap_pages;
int nr_active_pages;
int nr_inactive_pages;
LIST_HEAD(inactive_list);
LIST_HEAD(active_list);
pg_data_t *pgdat_list;

/*
 * The zone_table array is used to look up the address of the
 * struct zone corresponding to a given zone number (ZONE_DMA,
 * ZONE_NORMAL, or ZONE_HIGHMEM).
 */
zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
EXPORT_SYMBOL(zone_table);

static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20, 20, 20, };
static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255, 255, 255, };
static int lower_zone_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };

int vm_gfp_debug = 0;

static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order));

/*
 * Pages that are still on the LRU cannot be handed to lru_cache_del()
 * from interrupt context, so __free_pages_ok() chains them on this list
 * via page->next_hash, stashes the order in page->index, and defers the
 * actual free to the task queue below.
 */
static spinlock_t free_pages_ok_no_irq_lock = SPIN_LOCK_UNLOCKED;
struct page * free_pages_ok_no_irq_head;

static void do_free_pages_ok_no_irq(void * arg)
{
	struct page * page, * __page;

	spin_lock_irq(&free_pages_ok_no_irq_lock);

	page = free_pages_ok_no_irq_head;
	free_pages_ok_no_irq_head = NULL;

	spin_unlock_irq(&free_pages_ok_no_irq_lock);

	while (page) {
		__page = page;
		page = page->next_hash;
		__free_pages_ok(__page, __page->index);
	}
}

static struct tq_struct free_pages_ok_no_irq_task = {
	.routine	= do_free_pages_ok_no_irq,
};


/*
 * Temporary debugging check.
 */
#define BAD_RANGE(zone, page)						\
(									\
	(((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size))	\
	|| (((page) - mem_map) < (zone)->zone_start_mapnr)		\
	|| ((zone) != page_zone(page))					\
)

/*
 * Freeing function for a buddy system allocator.
 * Contrary to prior comments, this is *NOT* hairy, and there
 * is no reason for anyone not to understand it.
 *
 * The concept of a buddy system is to maintain direct-mapped tables
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep one bit for each pair of blocks, which
 * is set to 1 iff only one of the pair is allocated.  So when we
 * are allocating or freeing one, we can derive the state of the
 * other.  That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- wli
 */
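
/*
 * Worked example (illustrative only, assuming page_idx counts from the
 * start of the zone): freeing an order-0 page at page_idx 12 gives
 * mask = ~0UL, so -mask = 1 and the buddy is page 12 ^ 1 = 13.  If that
 * pair coalesces, the next pass has mask <<= 1 and page_idx &= mask = 12,
 * so the order-1 buddy is 12 ^ 2 = 14; one level further up, the order-2
 * buddy of block 12-15 is 12 ^ 4 = 8.  Coalescing stops at the first
 * level where __test_and_change_bit() finds the pair bit clear, i.e.
 * where the buddy is still allocated.
 */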

static void fastcall __free_pages_ok (struct page *page, unsigned int order)
{
	unsigned long index, page_idx, mask, flags;
	free_area_t *area;
	struct page *base;
	zone_t *zone;

	/*
	 * Yes, think what happens when other parts of the kernel take
	 * a reference to a page in order to pin it for io. -ben
	 */
	if (PageLRU(page)) {
		if (unlikely(in_interrupt())) {
			unsigned long flags;

			spin_lock_irqsave(&free_pages_ok_no_irq_lock, flags);
			page->next_hash = free_pages_ok_no_irq_head;
			free_pages_ok_no_irq_head = page;
			page->index = order;

			spin_unlock_irqrestore(&free_pages_ok_no_irq_lock, flags);

			schedule_task(&free_pages_ok_no_irq_task);
			return;
		}

		lru_cache_del(page);
	}

	if (page->buffers)
		BUG();
	if (page->mapping)
		BUG();
	if (!VALID_PAGE(page))
		BUG();
	if (PageLocked(page))
		BUG();
	if (PageActive(page))
		BUG();
	ClearPageReferenced(page);
	ClearPageDirty(page);

	if (current->flags & PF_FREE_PAGES)
		goto local_freelist;
 back_local_freelist:

	zone = page_zone(page);

	mask = (~0UL) << order;
	base = zone->zone_mem_map;
	page_idx = page - base;
	if (page_idx & ~mask)
		BUG();
	index = page_idx >> (1 + order);

	area = zone->free_area + order;

	spin_lock_irqsave(&zone->lock, flags);

	zone->free_pages -= mask;

	while (mask + (1 << (MAX_ORDER-1))) {
		struct page *buddy1, *buddy2;

		if (area >= zone->free_area + MAX_ORDER)
			BUG();
		if (!__test_and_change_bit(index, area->map))
			/*
			 * the buddy page is still allocated.
			 */
			break;
		/*
		 * Move the buddy up one level.
		 * This code is taking advantage of the identity:
		 * 	-mask = 1+~mask
		 */
		buddy1 = base + (page_idx ^ -mask);
		buddy2 = base + page_idx;
		if (BAD_RANGE(zone,buddy1))
			BUG();
		if (BAD_RANGE(zone,buddy2))
			BUG();

		list_del(&buddy1->list);
		mask <<= 1;
		area++;
		index >>= 1;
		page_idx &= mask;
	}
	list_add(&(base + page_idx)->list, &area->free_list);

	spin_unlock_irqrestore(&zone->lock, flags);
	return;

 local_freelist:
	if (current->nr_local_pages)
		goto back_local_freelist;
	if (in_interrupt())
		goto back_local_freelist;

	list_add(&page->list, &current->local_pages);
	page->index = order;
	current->nr_local_pages++;
}

#define MARK_USED(index, order, area) \
	__change_bit((index) >> (1+(order)), (area)->map)

static inline struct page * expand (zone_t *zone, struct page *page,
	 unsigned long index, int low, int high, free_area_t * area)
{
	unsigned long size = 1 << high;

	while (high > low) {
		if (BAD_RANGE(zone,page))
			BUG();
		area--;
		high--;
		size >>= 1;
		list_add(&(page)->list, &(area)->free_list);
		MARK_USED(index, high, area);
		index += size;
		page += size;
	}
	if (BAD_RANGE(zone,page))
		BUG();
	return page;
}
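
/*
 * Worked example (hypothetical layout): if the only free block in a zone
 * is an order-2 block covering page indices 8-11 and an order-0 page is
 * requested, rmqueue() (below) pulls that block off the order-2 list and
 * expand() (above) splits it on the way down: the order-1 half (pages
 * 8-9) and then the order-0 page 10 go back onto their free lists,
 * toggling the corresponding pair bits via MARK_USED, and page 11 is
 * returned to the caller with its count set to 1.
 */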

static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
static struct page * fastcall rmqueue(zone_t *zone, unsigned int order)
{
	free_area_t * area = zone->free_area + order;
	unsigned int curr_order = order;
	struct list_head *head, *curr;
	unsigned long flags;
	struct page *page;

	spin_lock_irqsave(&zone->lock, flags);
	do {
		head = &area->free_list;
		curr = head->next;

		if (curr != head) {
			unsigned int index;

			page = list_entry(curr, struct page, list);
			if (BAD_RANGE(zone,page))
				BUG();
			list_del(curr);
			index = page - zone->zone_mem_map;
			if (curr_order != MAX_ORDER-1)
				MARK_USED(index, curr_order, area);
			zone->free_pages -= 1UL << order;

			page = expand(zone, page, index, order, curr_order, area);
			spin_unlock_irqrestore(&zone->lock, flags);

			set_page_count(page, 1);
			if (BAD_RANGE(zone,page))
				BUG();
			if (PageLRU(page))
				BUG();
			if (PageActive(page))
				BUG();
			return page;
		}
		curr_order++;
		area++;
	} while (curr_order < MAX_ORDER);
	spin_unlock_irqrestore(&zone->lock, flags);

	return NULL;
}

#ifndef CONFIG_DISCONTIGMEM
struct page * fastcall _alloc_pages(unsigned int gfp_mask, unsigned int order)
{
	return __alloc_pages(gfp_mask, order,
		contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
}
#endif

static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
static struct page * fastcall balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
{
	struct page * page = NULL;
	int __freed;

	if (in_interrupt())
		BUG();

	current->allocation_order = order;
	current->flags |= PF_MEMALLOC | PF_FREE_PAGES;

	__freed = try_to_free_pages_zone(classzone, gfp_mask);

	current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);

	if (current->nr_local_pages) {
		struct list_head * entry, * local_pages;
		struct page * tmp;
		int nr_pages;

		local_pages = &current->local_pages;

		if (likely(__freed)) {
			/* pick from the last inserted so we're LIFO */
			entry = local_pages->next;
			do {
				tmp = list_entry(entry, struct page, list);
				if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
					list_del(entry);
					current->nr_local_pages--;
					set_page_count(tmp, 1);
					page = tmp;

					if (page->buffers)
						BUG();
					if (page->mapping)
						BUG();
					if (!VALID_PAGE(page))
						BUG();
					if (PageLocked(page))
						BUG();
					if (PageLRU(page))
						BUG();
					if (PageActive(page))
						BUG();
					if (PageDirty(page))
						BUG();

					break;
				}
			} while ((entry = entry->next) != local_pages);
		}

		nr_pages = current->nr_local_pages;
		/* free in reverse order so that the global order will be LIFO */
		while ((entry = local_pages->prev) != local_pages) {
			list_del(entry);
			tmp = list_entry(entry, struct page, list);
			__free_pages_ok(tmp, tmp->index);
			if (!nr_pages--)
				BUG();
		}
		current->nr_local_pages = 0;
	}

	*freed = __freed;
	return page;
}

static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order)
{
	long free = zone->free_pages - (1UL << order);
	return free >= 0 ? free : 0;
}

/*
 * This is the 'heart' of the zoned buddy allocator:
 */
struct page * fastcall __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
{
	zone_t **zone, * classzone;
	struct page * page;
	int freed, class_idx;

	zone = zonelist->zones;
	classzone = *zone;
	class_idx = zone_idx(classzone);

	for (;;) {
		zone_t *z = *(zone++);
		if (!z)
			break;

		if (zone_free_pages(z, order) > z->watermarks[class_idx].low) {
			page = rmqueue(z, order);
			if (page)
				return page;
		}
	}

	classzone->need_balance = 1;
	mb();
	if (waitqueue_active(&kswapd_wait))
		wake_up_interruptible(&kswapd_wait);

	zone = zonelist->zones;
	for (;;) {
		unsigned long min;
		zone_t *z = *(zone++);
		if (!z)
			break;

		min = z->watermarks[class_idx].min;
		if (!(gfp_mask & __GFP_WAIT))
			min >>= 2;
		if (zone_free_pages(z, order) > min) {
			page = rmqueue(z, order);
			if (page)
				return page;
		}
	}

	/* here we're in the low-on-memory slow path */

	if ((current->flags & PF_MEMALLOC) &&
			(!in_interrupt() || (current->flags & PF_MEMDIE))) {
		zone = zonelist->zones;
		for (;;) {
			zone_t *z = *(zone++);
			if (!z)
				break;

			page = rmqueue(z, order);
			if (page)
				return page;
		}
		return NULL;
	}

	/* Atomic allocations - we can't balance anything */
	if (!(gfp_mask & __GFP_WAIT))
		goto out;

 rebalance:
	page = balance_classzone(classzone, gfp_mask, order, &freed);
	if (page)
		return page;

	zone = zonelist->zones;
	if (likely(freed)) {
		for (;;) {
			zone_t *z = *(zone++);
			if (!z)
				break;

			if (zone_free_pages(z, order) > z->watermarks[class_idx].min) {
				page = rmqueue(z, order);
				if (page)
					return page;
			}
		}
		goto rebalance;
	} else {
		/*
		 * Check whether another task has been killed in the
		 * meantime; in that case the allocation can succeed.
		 */
		for (;;) {
			zone_t *z = *(zone++);
			if (!z)
				break;

			if (zone_free_pages(z, order) > z->watermarks[class_idx].high) {
				page = rmqueue(z, order);
				if (page)
					return page;
			}
		}
	}

 out:
	printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n",
	       order, gfp_mask, !!(current->flags & PF_MEMALLOC));
	if (unlikely(vm_gfp_debug))
		dump_stack();
	return NULL;
}

/*
 * Common helper functions.
 */
fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
{
	struct page * page;

	page = alloc_pages(gfp_mask, order);
	if (!page)
		return 0;
	return (unsigned long) page_address(page);
}

fastcall unsigned long get_zeroed_page(unsigned int gfp_mask)
{
	struct page * page;

	page = alloc_pages(gfp_mask, 0);
	if (page) {
		void *address = page_address(page);
		clear_page(address);
		return (unsigned long) address;
	}
	return 0;
}

fastcall void __free_pages(struct page *page, unsigned int order)
{
	if (!PageReserved(page) && put_page_testzero(page))
		__free_pages_ok(page, order);
}

fastcall void free_pages(unsigned long addr, unsigned int order)
{
	if (addr != 0)
		__free_pages(virt_to_page(addr), order);
}
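
/*
 * Illustrative usage sketch (hypothetical caller, kept out of the build
 * with #if 0): how a driver might obtain and later release a physically
 * contiguous two-page buffer through the helpers above.  The names
 * example_buf/example_alloc/example_release are invented for this sketch.
 */
#if 0
static void *example_buf;

static int example_alloc(void)
{
	/* GFP_KERNEL may sleep and falls back ZONE_NORMAL -> ZONE_DMA */
	example_buf = (void *) __get_free_pages(GFP_KERNEL, 1);
	if (!example_buf)
		return -ENOMEM;
	return 0;
}

static void example_release(void)
{
	free_pages((unsigned long) example_buf, 1);
	example_buf = NULL;
}
#endif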

/*
 * Total amount of free (allocatable) RAM:
 */
unsigned int nr_free_pages (void)
{
	unsigned int sum = 0;
	zone_t *zone;

	for_each_zone(zone)
		sum += zone->free_pages;

	return sum;
}

/*
 * Amount of free RAM allocatable as buffer memory:
 */
unsigned int nr_free_buffer_pages (void)
{
	pg_data_t *pgdat;
	unsigned int sum = 0;
	zonelist_t *zonelist;
	zone_t **zonep, *zone;

	for_each_pgdat(pgdat) {
		int class_idx;
		zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
		zonep = zonelist->zones;
		zone = *zonep;
		class_idx = zone_idx(zone);

		sum += zone->nr_cache_pages;
		for (; zone; zone = *zonep++) {
			int free = zone->free_pages - zone->watermarks[class_idx].high;
			if (free <= 0)
				continue;
			sum += free;
		}
	}

	return sum;
}

#if CONFIG_HIGHMEM
unsigned int nr_free_highpages (void)
{
	pg_data_t *pgdat;
	unsigned int pages = 0;

	for_each_pgdat(pgdat)
		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;

	return pages;
}

unsigned int freeable_lowmem(void)
{
	unsigned int pages = 0;
	pg_data_t *pgdat;

	for_each_pgdat(pgdat) {
		pages += pgdat->node_zones[ZONE_DMA].free_pages;
		pages += pgdat->node_zones[ZONE_DMA].nr_active_pages;
		pages += pgdat->node_zones[ZONE_DMA].nr_inactive_pages;
		pages += pgdat->node_zones[ZONE_NORMAL].free_pages;
		pages += pgdat->node_zones[ZONE_NORMAL].nr_active_pages;
		pages += pgdat->node_zones[ZONE_NORMAL].nr_inactive_pages;
	}

	return pages;
}
#endif

#define K(x) ((x) << (PAGE_SHIFT-10))

/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas_core(pg_data_t *pgdat)
{
	unsigned int order;
	unsigned type;
	pg_data_t *tmpdat = pgdat;

	printk("Free pages:      %6dkB (%6dkB HighMem)\n",
		K(nr_free_pages()),
		K(nr_free_highpages()));

	while (tmpdat) {
		zone_t *zone;
		for (zone = tmpdat->node_zones;
				zone < tmpdat->node_zones + MAX_NR_ZONES; zone++)
			printk("Zone:%s freepages:%6lukB\n",
					zone->name,
					K(zone->free_pages));

		tmpdat = tmpdat->node_next;
	}

	printk("( Active: %d, inactive: %d, free: %d )\n",
	       nr_active_pages,
	       nr_inactive_pages,
	       nr_free_pages());

	for (type = 0; type < MAX_NR_ZONES; type++) {
		struct list_head *head, *curr;
		zone_t *zone = pgdat->node_zones + type;
		unsigned long nr, total, flags;

		total = 0;
		if (zone->size) {
			spin_lock_irqsave(&zone->lock, flags);
			for (order = 0; order < MAX_ORDER; order++) {
				head = &(zone->free_area + order)->free_list;
				curr = head;
				nr = 0;
				for (;;) {
					if ((curr = curr->next) == head)
						break;
					nr++;
				}
				total += nr * (1 << order);
				printk("%lu*%lukB ", nr, K(1UL) << order);
			}
			spin_unlock_irqrestore(&zone->lock, flags);
		}
		printk("= %lukB)\n", K(total));
	}

#ifdef SWAP_CACHE_INFO
	show_swap_cache_info();
#endif
}

void show_free_areas(void)
{
	show_free_areas_core(pgdat_list);
}

/*
 * Builds allocation fallback zone lists.
 */
static inline void build_zonelists(pg_data_t *pgdat)
{
	int i, j, k;

	for (i = 0; i <= GFP_ZONEMASK; i++) {
		zonelist_t *zonelist;
		zone_t *zone;

		zonelist = pgdat->node_zonelists + i;
		memset(zonelist, 0, sizeof(*zonelist));

		j = 0;
		k = ZONE_NORMAL;
		if (i & __GFP_HIGHMEM)
			k = ZONE_HIGHMEM;
		if (i & __GFP_DMA)
			k = ZONE_DMA;

		switch (k) {
			default:
				BUG();
			/*
			 * fallthrough:
			 */
			case ZONE_HIGHMEM:
				zone = pgdat->node_zones + ZONE_HIGHMEM;
				if (zone->size) {
#ifndef CONFIG_HIGHMEM
					BUG();
#endif
					zonelist->zones[j++] = zone;
				}
			case ZONE_NORMAL:
				zone = pgdat->node_zones + ZONE_NORMAL;
				if (zone->size)
					zonelist->zones[j++] = zone;
			case ZONE_DMA:
				zone = pgdat->node_zones + ZONE_DMA;
				if (zone->size)
					zonelist->zones[j++] = zone;
		}
		zonelist->zones[j++] = NULL;
	}
}
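
/*
 * Illustrative result (node with all three zones populated): the
 * zonelist for a mask with no zone bits set (GFP_KERNEL-style) becomes
 * { Normal, DMA, NULL }, a __GFP_HIGHMEM mask gets
 * { HighMem, Normal, DMA, NULL }, and a __GFP_DMA mask gets
 * { DMA, NULL }; if both bits are set, __GFP_DMA wins because it is
 * tested last above.
 */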

/*
 * Helper functions to size the waitqueue hash table.
 * Essentially these want to choose hash table sizes sufficiently
 * large so that collisions trying to wait on pages are rare.
 * But in fact, the number of active page waitqueues on typical
 * systems is ridiculously low, less than 200. So this is even
 * conservative, even though it seems large.
 *
 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
 * waitqueues, i.e. the size of the waitq table given the number of pages.
 */
#define PAGES_PER_WAITQUEUE	256

static inline unsigned long wait_table_size(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;

	while (size < pages)
		size <<= 1;

	/*
	 * Once we have dozens or even hundreds of threads sleeping
	 * on IO we've got bigger problems than wait queue collision.
	 * Limit the size of the wait table to a reasonable size.
	 */
	size = min(size, 4096UL);

	return size;
}
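
/*
 * Worked example (illustrative numbers only): a 512 MB zone with 4 KB
 * pages holds 131072 pages; 131072 / PAGES_PER_WAITQUEUE = 512, and the
 * loop above rounds up to a power of two, so the zone gets a table of
 * 512 wait queue heads.  A zone would have to exceed roughly 4 GB before
 * the min(size, 4096UL) cap kicks in.
 */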

/*
 * This is an integer logarithm so that shifts can be used later
 * to extract the more random high bits from the multiplicative
 * hash function before the remainder is taken.
 */
static inline unsigned long wait_table_bits(unsigned long size)
{
	return ffz(~size);
}
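
/*
 * Illustrative note: for a power-of-two table size, ffz(~size) is the
 * position of the lone set bit, i.e. log2(size).  With the 512-entry
 * table from the example above this returns 9, and free_area_init_core()
 * stores BITS_PER_LONG - 9 as wait_table_shift for the hash function.
 */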

#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))

/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
	unsigned long *zones_size, unsigned long zone_start_paddr,
	unsigned long *zholes_size, struct page *lmem_map)
{
	unsigned long i, j;
	unsigned long map_size;
	unsigned long totalpages, offset, realtotalpages;
	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);

	if (zone_start_paddr & ~PAGE_MASK)
		BUG();

	totalpages = 0;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		unsigned long size = zones_size[i];
		totalpages += size;
	}
	realtotalpages = totalpages;
	if (zholes_size)
		for (i = 0; i < MAX_NR_ZONES; i++)
			realtotalpages -= zholes_size[i];

	printk("On node %d totalpages: %lu\n", nid, realtotalpages);

	/*
	 * Some architectures (with lots of mem and discontiguous memory
	 * maps) have to search for a good mem_map area:
	 * For discontigmem, the conceptual mem map array starts from
	 * PAGE_OFFSET, we need to align the actual array onto a mem map
	 * boundary, so that MAP_NR works.
	 */
	map_size = (totalpages + 1)*sizeof(struct page);
	if (lmem_map == (struct page *)0) {
		lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
		lmem_map = (struct page *)(PAGE_OFFSET +
			MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
	}
	*gmap = pgdat->node_mem_map = lmem_map;
	pgdat->node_size = totalpages;
	pgdat->node_start_paddr = zone_start_paddr;
	pgdat->node_start_mapnr = (lmem_map - mem_map);
	pgdat->nr_zones = 0;

	offset = lmem_map - mem_map;
	for (j = 0; j < MAX_NR_ZONES; j++) {
		zone_t *zone = pgdat->node_zones + j;
		unsigned long mask;
		unsigned long size, realsize;
		int idx;

		zone_table[nid * MAX_NR_ZONES + j] = zone;
		realsize = size = zones_size[j];
		if (zholes_size)
			realsize -= zholes_size[j];

		printk("zone(%lu): %lu pages.\n", j, size);
		zone->size = size;
		zone->realsize = realsize;
		zone->name = zone_names[j];
		zone->lock = SPIN_LOCK_UNLOCKED;
		zone->zone_pgdat = pgdat;
		zone->free_pages = 0;
		zone->need_balance = 0;
		zone->nr_active_pages = zone->nr_inactive_pages = 0;

		if (!size)
			continue;

		/*
		 * The per-page waitqueue mechanism uses hashed waitqueues
		 * per zone.
		 */
		zone->wait_table_size = wait_table_size(size);
		zone->wait_table_shift =
			BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
		zone->wait_table = (wait_queue_head_t *)
			alloc_bootmem_node(pgdat, zone->wait_table_size
						* sizeof(wait_queue_head_t));

		for(i = 0; i < zone->wait_table_size; ++i)
			init_waitqueue_head(zone->wait_table + i);

		pgdat->nr_zones = j+1;

		mask = (realsize / zone_balance_ratio[j]);
		if (mask < zone_balance_min[j])
			mask = zone_balance_min[j];
		else if (mask > zone_balance_max[j])
			mask = zone_balance_max[j];
		zone->watermarks[j].min = mask;
		zone->watermarks[j].low = mask*2;
		zone->watermarks[j].high = mask*3;
		/* now set the watermarks of the lower zones in the "j" classzone */
		for (idx = j-1; idx >= 0; idx--) {
			zone_t * lower_zone = pgdat->node_zones + idx;
			unsigned long lower_zone_reserve;
			if (!lower_zone->size)
				continue;

			mask = lower_zone->watermarks[idx].min;
			lower_zone->watermarks[j].min = mask;
			lower_zone->watermarks[j].low = mask*2;
			lower_zone->watermarks[j].high = mask*3;

			/* now the harder part: the lowmem reserve for this classzone */
			lower_zone_reserve = realsize / lower_zone_reserve_ratio[idx];
			lower_zone->watermarks[j].min += lower_zone_reserve;
			lower_zone->watermarks[j].low += lower_zone_reserve;
			lower_zone->watermarks[j].high += lower_zone_reserve;

			realsize += lower_zone->realsize;
		}
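
		/*
		 * Worked example (hypothetical sizes): a ZONE_NORMAL of
		 * 16384 pages gives mask = 16384/128 = 128, inside the
		 * [20, 255] clamp, so its own-classzone watermarks are
		 * min = 128, low = 256, high = 384 pages.  If a
		 * 262144-page ZONE_HIGHMEM sits above it, the Normal
		 * zone's watermarks for the HighMem classzone start from
		 * that same 128 and are raised by
		 * 262144 / lower_zone_reserve_ratio[ZONE_NORMAL] =
		 * 262144/32 = 8192 pages, so highmem-capable allocations
		 * back off from lowmem long before lowmem-only ones do.
		 */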

		zone->zone_mem_map = mem_map + offset;
		zone->zone_start_mapnr = offset;
		zone->zone_start_paddr = zone_start_paddr;

		if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1))
			printk("BUG: wrong zone alignment, it will crash\n");

		/*
		 * Initially all pages are reserved - free ones are freed
		 * up by free_all_bootmem() once the early boot process is
		 * done. Non-atomic initialization, single-pass.
		 */
		for (i = 0; i < size; i++) {
			struct page *page = mem_map + offset + i;
			set_page_zone(page, nid * MAX_NR_ZONES + j);
			set_page_count(page, 0);
			SetPageReserved(page);
			INIT_LIST_HEAD(&page->list);
			if (j != ZONE_HIGHMEM)
				set_page_address(page, __va(zone_start_paddr));
			zone_start_paddr += PAGE_SIZE;
		}

		offset += size;
		for (i = 0; ; i++) {
			unsigned long bitmap_size;

			INIT_LIST_HEAD(&zone->free_area[i].free_list);
			if (i == MAX_ORDER-1) {
				zone->free_area[i].map = NULL;
				break;
			}

			/*
			 * Page buddy system uses "index >> (i+1)",
			 * where "index" is at most "size-1".
			 *
			 * The extra "+3" is to round down to byte
			 * size (8 bits per byte assumption). Thus
			 * we get "(size-1) >> (i+4)" as the last byte
			 * we can access.
			 *
			 * The "+1" is because we want to round the
			 * byte allocation up rather than down. So
			 * we should have had a "+7" before we shifted
			 * down by three. Also, we have to add one as
			 * we actually _use_ the last bit (it's [0,n]
			 * inclusive, not [0,n[).
			 *
			 * So we actually had +7+1 before we shift
			 * down by 3. But (n+8) >> 3 == (n >> 3) + 1
			 * (modulo overflows, which we do not have).
			 *
			 * Finally, we LONG_ALIGN because all bitmap
			 * operations are on longs.
			 */
			bitmap_size = (size-1) >> (i+4);
			bitmap_size = LONG_ALIGN(bitmap_size+1);
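			/*
			 * e.g. for a hypothetical 1048576-page zone at
			 * order 0: (1048576-1) >> 4 = 65535, and the +1
			 * plus LONG_ALIGN give 65536 bytes, i.e. one bit
			 * for each of the 524288 order-0 buddy pairs.
			 */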
			zone->free_area[i].map =
			  (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
		}
	}
	build_zonelists(pgdat);
}

void __init free_area_init(unsigned long *zones_size)
{
	free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
}

static int __init setup_mem_frac(char *str)
{
	int j = 0;

	while (get_option(&str, &zone_balance_ratio[j++]) == 2);
	printk("setup_mem_frac: ");
	for (j = 0; j < MAX_NR_ZONES; j++) printk("%d  ", zone_balance_ratio[j]);
	printk("\n");
	return 1;
}

__setup("memfrac=", setup_mem_frac);

static int __init setup_lower_zone_reserve(char *str)
{
	int j = 0;

	while (get_option(&str, &lower_zone_reserve_ratio[j++]) == 2);
	printk("setup_lower_zone_reserve: ");
	for (j = 0; j < MAX_NR_ZONES-1; j++) printk("%d  ", lower_zone_reserve_ratio[j]);
	printk("\n");
	return 1;
}

__setup("lower_zone_reserve=", setup_lower_zone_reserve);