1 #ifndef _LINUX_MMZONE_H
2 #define _LINUX_MMZONE_H
3 
4 #ifdef __KERNEL__
5 #ifndef __ASSEMBLY__
6 
7 #include <linux/config.h>
8 #include <linux/spinlock.h>
9 #include <linux/list.h>
10 #include <linux/wait.h>
11 
12 /*
13  * Free memory management - zoned buddy allocator.
14  */
15 
/*
 * MAX_ORDER is the number of entries in each zone's free_area[] array.
 * It defaults to 10 unless the build configuration supplies
 * CONFIG_FORCE_MAX_ZONEORDER.
 */
#ifdef CONFIG_FORCE_MAX_ZONEORDER
#define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
#else
#define MAX_ORDER 10
#endif
21 
/*
 * Zone type indices.  MAX_NR_ZONES sizes pg_data_t::node_zones[] and
 * zone_t::watermarks[], so it must stay one greater than the highest
 * zone index defined here.
 */
#define ZONE_DMA		0
#define ZONE_NORMAL		1
#define ZONE_HIGHMEM		2
#define MAX_NR_ZONES		3
26 
27 typedef struct free_area_struct {
28 	struct list_head	free_list;
29 	unsigned long		*map;
30 } free_area_t;
31 
32 struct pglist_data;
33 
/*
 * A triple of thresholds.  Each zone carries an array of these
 * (zone_t::watermarks[], MAX_NR_ZONES entries).
 * NOTE(review): the values are presumably page counts - confirm against
 * the code that fills them in.
 */
typedef struct zone_watermarks_s {
	unsigned long min;
	unsigned long low;
	unsigned long high;
} zone_watermarks_t;
37 
38 
39 /*
40  * On machines where it is needed (eg PCs) we divide physical memory
41  * into multiple physical zones. On a PC we have 3 zones:
42  *
43  * ZONE_DMA	  < 16 MB	ISA DMA capable memory
44  * ZONE_NORMAL	16-896 MB	direct mapped by the kernel
45  * ZONE_HIGHMEM	 > 896 MB	only page cache and user processes
46  */
47 typedef struct zone_struct {
48 	/*
49 	 * Commonly accessed fields:
50 	 */
51 	spinlock_t		lock;
52 	unsigned long		free_pages;
53 	/*
54 	 * We don't know if the memory that we're going to allocate will be freeable
55 	 * or/and it will be released eventually, so to avoid totally wasting several
56 	 * GB of ram we must reserve some of the lower zone memory (otherwise we risk
57 	 * to run OOM on the lower zones despite there's tons of freeable ram
58 	 * on the higher zones).
59 	 */
60 	zone_watermarks_t       watermarks[MAX_NR_ZONES];
61 
62 	/*
63 	 * The below fields are protected by different locks (or by
64 	 * no lock at all like need_balance), so they're longs to
65 	 * provide an atomic granularity against each other on
66 	 * all architectures.
67 	 */
68 	unsigned long           need_balance;
69 	/* protected by the pagemap_lru_lock */
70 	unsigned long           nr_active_pages, nr_inactive_pages;
71 	/* protected by the pagecache_lock */
72 	unsigned long           nr_cache_pages;
73 
74 
75 	/*
76 	 * free areas of different sizes
77 	 */
78 	free_area_t		free_area[MAX_ORDER];
79 
80 	/*
81 	 * wait_table		-- the array holding the hash table
82 	 * wait_table_size	-- the size of the hash table array
83 	 * wait_table_shift	-- wait_table_size
84 	 * 				== BITS_PER_LONG (1 << wait_table_bits)
85 	 *
86 	 * The purpose of all these is to keep track of the people
87 	 * waiting for a page to become available and make them
88 	 * runnable again when possible. The trouble is that this
89 	 * consumes a lot of space, especially when so few things
90 	 * wait on pages at a given time. So instead of using
91 	 * per-page waitqueues, we use a waitqueue hash table.
92 	 *
93 	 * The bucket discipline is to sleep on the same queue when
94 	 * colliding and wake all in that wait queue when removing.
95 	 * When something wakes, it must check to be sure its page is
96 	 * truly available, a la thundering herd. The cost of a
97 	 * collision is great, but given the expected load of the
98 	 * table, they should be so rare as to be outweighed by the
99 	 * benefits from the saved space.
100 	 *
101 	 * __wait_on_page() and unlock_page() in mm/filemap.c, are the
102 	 * primary users of these fields, and in mm/page_alloc.c
103 	 * free_area_init_core() performs the initialization of them.
104 	 */
105 	wait_queue_head_t	* wait_table;
106 	unsigned long		wait_table_size;
107 	unsigned long		wait_table_shift;
108 
109 	/*
110 	 * Discontig memory support fields.
111 	 */
112 	struct pglist_data	*zone_pgdat;
113 	struct page		*zone_mem_map;
114 	unsigned long		zone_start_paddr;
115 	unsigned long		zone_start_mapnr;
116 
117 	/*
118 	 * rarely used fields:
119 	 */
120 	char			*name;
121 	unsigned long		size;
122 	unsigned long		realsize;
123 } zone_t;
124 
125 /*
126  * One allocation request operates on a zonelist. A zonelist
127  * is a list of zones, the first one is the 'goal' of the
128  * allocation, the other zones are fallback zones, in decreasing
129  * priority.
130  *
131  * Right now a zonelist takes up less than a cacheline. We never
132  * modify it apart from boot-up, and only a few indices are used,
133  * so despite the zonelist table being relatively big, the cache
134  * footprint of this construct is very small.
135  */
136 typedef struct zonelist_struct {
137 	zone_t * zones [MAX_NR_ZONES+1]; // NULL delimited
138 } zonelist_t;
139 
/*
 * Each node keeps GFP_ZONEMASK + 1 zonelists (pg_data_t::node_zonelists[]).
 * NOTE(review): presumably this masks the zone-selecting bits of a gfp
 * mask, which then index that array - confirm against the allocator.
 */
#define GFP_ZONEMASK	0x0f
141 
142 /*
143  * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
144  * (mostly NUMA machines?) to denote a higher-level memory zone than the
145  * zone_struct denotes.
146  *
147  * On NUMA machines, each NUMA node would have a pg_data_t to describe
148  * it's memory layout.
149  *
150  * XXX: we need to move the global memory statistics (active_list, ...)
151  *      into the pg_data_t to properly support NUMA.
152  */
153 struct bootmem_data;
154 typedef struct pglist_data {
155 	zone_t node_zones[MAX_NR_ZONES];
156 	zonelist_t node_zonelists[GFP_ZONEMASK+1];
157 	int nr_zones;
158 	struct page *node_mem_map;
159 	unsigned long *valid_addr_bitmap;
160 	struct bootmem_data *bdata;
161 	unsigned long node_start_paddr;
162 	unsigned long node_start_mapnr;
163 	unsigned long node_size;
164 	int node_id;
165 	struct pglist_data *node_next;
166 } pg_data_t;
167 
168 extern int numnodes;
169 extern pg_data_t *pgdat_list;
170 
/*
 * zone_idx(zone)  - position of @zone inside its node's node_zones[]
 *                   array, computed by pointer subtraction.
 * memclass(pgzone, classzone)
 *                 - true when @pgzone's index does not exceed
 *                   @classzone's index.
 *
 * Both macros evaluate their arguments more than once, so do not pass
 * expressions with side effects.
 */
#define zone_idx(zone)			((zone) - (zone)->zone_pgdat->node_zones)
#define memclass(pgzone, classzone)	(zone_idx(pgzone) <= zone_idx(classzone))
173 
174 /*
175  * The following two are not meant for general usage. They are here as
176  * prototypes for the discontig memory code.
177  */
178 struct page;
179 extern void show_free_areas_core(pg_data_t *pgdat);
180 extern void free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
181   unsigned long *zones_size, unsigned long paddr, unsigned long *zholes_size,
182   struct page *pmap);
183 
184 extern pg_data_t contig_page_data;
185 
/**
 * for_each_pgdat - helper macro to iterate over all nodes
 * @pgdat - pg_data_t * variable
 *
 * Meant to help with common loops of the form
 * pgdat = pgdat_list;
 * while(pgdat) {
 * 	...
 * 	pgdat = pgdat->node_next;
 * }
 *
 * The argument is expanded several times, so it must be a plain
 * variable without side effects.
 */
#define for_each_pgdat(pgdat) \
	for ((pgdat) = pgdat_list; (pgdat); (pgdat) = (pgdat)->node_next)
199 
200 
201 /*
202  * next_zone - helper magic for for_each_zone()
203  * Thanks to William Lee Irwin III for this piece of ingenuity.
204  */
next_zone(zone_t * zone)205 static inline zone_t *next_zone(zone_t *zone)
206 {
207 	pg_data_t *pgdat = zone->zone_pgdat;
208 
209 	if (zone - pgdat->node_zones < MAX_NR_ZONES - 1)
210 		zone++;
211 
212 	else if (pgdat->node_next) {
213 		pgdat = pgdat->node_next;
214 		zone = pgdat->node_zones;
215 	} else
216 		zone = NULL;
217 
218 	return zone;
219 }
220 
/**
 * for_each_zone - helper macro to iterate over all memory zones
 * @zone - zone_t * variable
 *
 * The user only needs to declare the zone variable, for_each_zone
 * fills it in. This basically means for_each_zone() is an
 * easier to read version of this piece of code:
 *
 * for(pgdat = pgdat_list; pgdat; pgdat = pgdat->node_next) {
 * 	for(i = 0; i < MAX_NR_ZONES; ++i) {
 * 		zone_t * z = pgdat->node_zones + i;
 * 		...
 * 	}
 * }
 *
 * pgdat_list is dereferenced unconditionally, so it must be non-NULL
 * when the loop starts.
 */
#define for_each_zone(zone) \
	for ((zone) = pgdat_list->node_zones; (zone); (zone) = next_zone(zone))
238 
239 
#ifndef CONFIG_DISCONTIGMEM

/*
 * Contiguous memory: there is exactly one node, so the node id argument
 * is ignored and everything resolves to contig_page_data / mem_map.
 */
#define NODE_DATA(nid)		(&contig_page_data)
#define NODE_MEM_MAP(nid)	mem_map
#define MAX_NR_NODES		1

#else /* CONFIG_DISCONTIGMEM */

/* The architecture header is included here and may define MAX_NR_NODES itself. */
#include <asm/mmzone.h>

/* page->zone is currently 8 bits ... */
#ifndef MAX_NR_NODES
#define MAX_NR_NODES		(255 / MAX_NR_ZONES)
#endif

#endif /* CONFIG_DISCONTIGMEM */
256 
/*
 * Round x up to the next multiple of sizeof(mem_map_t); a value that is
 * already a multiple is returned unchanged.  x is evaluated more than
 * once, so it must not have side effects.
 */
#define MAP_ALIGN(x)						\
	((((x) % sizeof(mem_map_t)) == 0)			\
		? (x)						\
		: ((x) + sizeof(mem_map_t) - ((x) % sizeof(mem_map_t))))
259 
260 #endif /* !__ASSEMBLY__ */
261 #endif /* __KERNEL__ */
262 #endif /* _LINUX_MMZONE_H */
263