/*
 * High memory handling common code and variables.
 *
 * (C) 1999 Andrea Arcangeli, SuSE GmbH, andrea@suse.de
 *          Gerhard Wichert, Siemens AG, Gerhard.Wichert@pdb.siemens.de
 *
 *
 * Redesigned the x86 32-bit VM architecture to deal with
 * 64-bit physical space. With current x86 CPUs this
 * means up to 64 Gigabytes physical RAM.
 *
 * Rewrote high memory support to move the page cache into
 * high memory. Implemented permanent (schedulable) kmaps
 * based on Linus' idea.
 *
 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
 */

#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/slab.h>

/*
 * pkmap_count is not a pure "count":
 *  0 means that the entry is not mapped, and has not been mapped
 *    since the last TLB flush - it is usable.
 *  1 means that there are no users, but it has been mapped
 *    since the last TLB flush - so we can't use it.
 *  n means that there are (n-1) current users of it.
 */
static int pkmap_count[LAST_PKMAP];
static unsigned int last_pkmap_nr;
static spinlock_cacheline_t kmap_lock_cacheline = {SPIN_LOCK_UNLOCKED};
#define kmap_lock  kmap_lock_cacheline.lock

pte_t * pkmap_page_table;

static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
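
/*
 * Illustrative sketch of the pkmap_count[] life cycle described above,
 * for a single entry, assuming one highmem page and no contention:
 *
 *   kmap_high()              0 -> 2   slot mapped, one user
 *   kmap_high(), same page   2 -> 3   second user, existing PTE reused
 *   kunmap_high()            3 -> 2
 *   kunmap_high()            2 -> 1   no users left, but stale TLB
 *                                     entries may still exist
 *   flush_all_zero_pkmaps()  1 -> 0   PTE cleared, TLB flushed,
 *                                     slot reusable again
 */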

static void flush_all_zero_pkmaps(void)
{
	int i;

	flush_cache_all();

	for (i = 0; i < LAST_PKMAP; i++) {
		struct page *page;

		/*
		 * zero means we don't have anything to do,
		 * >1 means that it is still in use. Only
		 * a count of 1 means that it is free but
		 * needs to be unmapped
		 */
		if (pkmap_count[i] != 1)
			continue;
		pkmap_count[i] = 0;

		/* sanity check */
		if (pte_none(pkmap_page_table[i]))
			BUG();

		/*
		 * Don't need an atomic fetch-and-clear op here;
		 * no-one has the page mapped, and cannot get at
		 * its virtual address (and hence PTE) without first
		 * getting the kmap_lock (which is held here).
		 * So no dangers, even with speculative execution.
		 */
		page = pte_page(pkmap_page_table[i]);
		pte_clear(&pkmap_page_table[i]);

		page->virtual = NULL;
	}
	flush_tlb_all();
}

static inline unsigned long map_new_virtual(struct page *page, int nonblocking)
{
	unsigned long vaddr;
	int count;

start:
	count = LAST_PKMAP;
	/* Find an empty entry */
	for (;;) {
		last_pkmap_nr = (last_pkmap_nr + 1) & LAST_PKMAP_MASK;
		if (!last_pkmap_nr) {
			flush_all_zero_pkmaps();
			count = LAST_PKMAP;
		}
		if (!pkmap_count[last_pkmap_nr])
			break;	/* Found a usable entry */
		if (--count)
			continue;

		if (nonblocking)
			return 0;

		/*
		 * Sleep until somebody else unmaps one of their entries
		 */
		{
			DECLARE_WAITQUEUE(wait, current);

			current->state = TASK_UNINTERRUPTIBLE;
			add_wait_queue(&pkmap_map_wait, &wait);
			spin_unlock(&kmap_lock);
			schedule();
			remove_wait_queue(&pkmap_map_wait, &wait);
			spin_lock(&kmap_lock);

			/* Somebody else might have mapped it while we slept */
			if (page->virtual)
				return (unsigned long) page->virtual;

			/* Re-start */
			goto start;
		}
	}
	vaddr = PKMAP_ADDR(last_pkmap_nr);
	set_pte(&(pkmap_page_table[last_pkmap_nr]), mk_pte(page, kmap_prot));

	pkmap_count[last_pkmap_nr] = 1;
	page->virtual = (void *) vaddr;

	return vaddr;
}

void fastcall *kmap_high(struct page *page, int nonblocking)
{
	unsigned long vaddr;

	/*
	 * For highmem pages, we can't trust "virtual" until
	 * after we have the lock.
	 *
	 * We cannot call this from interrupts, as it may block
	 */
	spin_lock(&kmap_lock);
	vaddr = (unsigned long) page->virtual;
	if (!vaddr) {
		vaddr = map_new_virtual(page, nonblocking);
		if (!vaddr)
			goto out;
	}
	pkmap_count[PKMAP_NR(vaddr)]++;
	if (pkmap_count[PKMAP_NR(vaddr)] < 2)
		BUG();
 out:
	spin_unlock(&kmap_lock);
	return (void*) vaddr;
}

void fastcall kunmap_high(struct page *page)
{
	unsigned long vaddr;
	unsigned long nr;
	int need_wakeup;

	spin_lock(&kmap_lock);
	vaddr = (unsigned long) page->virtual;
	if (!vaddr)
		BUG();
	nr = PKMAP_NR(vaddr);

	/*
	 * A count must never go down to zero
	 * without a TLB flush!
	 */
	need_wakeup = 0;
	switch (--pkmap_count[nr]) {
	case 0:
		BUG();
	case 1:
		/*
		 * Avoid an unnecessary wake_up() function call.
		 * The common case is pkmap_count[] == 1, but
		 * no waiters.
		 * The tasks queued in the wait-queue are guarded
		 * by both the lock in the wait-queue-head and by
		 * the kmap_lock.  As the kmap_lock is held here,
		 * no need for the wait-queue-head's lock.  Simply
		 * test if the queue is empty.
		 */
		need_wakeup = waitqueue_active(&pkmap_map_wait);
	}
	spin_unlock(&kmap_lock);

	/* do wake-up, if needed, race-free outside of the spin lock */
	if (need_wakeup)
		wake_up(&pkmap_map_wait);
}
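
/*
 * Illustrative sketch only (never called here; the function name is
 * hypothetical): callers normally reach kmap_high()/kunmap_high()
 * through the kmap()/kunmap() wrappers in <linux/highmem.h>, which
 * short-circuit to page_address() for lowmem pages.
 */
#if 0	/* example only */
static void zero_page_example(struct page *page)
{
	char *vaddr;

	vaddr = kmap(page);		/* may sleep waiting for a free pkmap slot */
	memset(vaddr, 0, PAGE_SIZE);	/* page is now kernel-addressable */
	kunmap(page);			/* drop the reference; slot reclaimed lazily */
}
#endif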

#define POOL_SIZE 32

/*
 * This lock gets no contention at all, normally.
 */
static spinlock_t emergency_lock = SPIN_LOCK_UNLOCKED;

int nr_emergency_pages;
static LIST_HEAD(emergency_pages);

int nr_emergency_bhs;
static LIST_HEAD(emergency_bhs);

/*
 * Simple bounce buffer support for highmem pages.
 * This will be moved to the block layer in 2.5.
 */

static inline void copy_from_high_bh (struct buffer_head *to,
			 struct buffer_head *from)
{
	struct page *p_from;
	char *vfrom;

	p_from = from->b_page;

	vfrom = kmap_atomic(p_from, KM_USER0);
	memcpy(to->b_data, vfrom + bh_offset(from), to->b_size);
	kunmap_atomic(vfrom, KM_USER0);
}

static inline void copy_to_high_bh_irq (struct buffer_head *to,
			 struct buffer_head *from)
{
	struct page *p_to;
	char *vto;
	unsigned long flags;

	p_to = to->b_page;
	__save_flags(flags);
	__cli();
	vto = kmap_atomic(p_to, KM_BOUNCE_READ);
	memcpy(vto + bh_offset(to), from->b_data, to->b_size);
	kunmap_atomic(vto, KM_BOUNCE_READ);
	__restore_flags(flags);
}

static inline void bounce_end_io (struct buffer_head *bh, int uptodate)
{
	struct page *page;
	struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_private);
	unsigned long flags;

	bh_orig->b_end_io(bh_orig, uptodate);

	page = bh->b_page;

	spin_lock_irqsave(&emergency_lock, flags);
	if (nr_emergency_pages >= POOL_SIZE)
		__free_page(page);
	else {
		/*
		 * We are abusing page->list to manage
		 * the highmem emergency pool:
		 */
		list_add(&page->list, &emergency_pages);
		nr_emergency_pages++;
	}

	if (nr_emergency_bhs >= POOL_SIZE) {
#ifdef HIGHMEM_DEBUG
		/* Don't clobber the constructed slab cache */
		init_waitqueue_head(&bh->b_wait);
#endif
		kmem_cache_free(bh_cachep, bh);
	} else {
		/*
		 * Ditto in the bh case, here we abuse b_inode_buffers:
		 */
		list_add(&bh->b_inode_buffers, &emergency_bhs);
		nr_emergency_bhs++;
	}
	spin_unlock_irqrestore(&emergency_lock, flags);
}

static __init int init_emergency_pool(void)
{
	struct sysinfo i;

	si_meminfo(&i);
	si_swapinfo(&i);

	if (!i.totalhigh)
		return 0;

	spin_lock_irq(&emergency_lock);
	while (nr_emergency_pages < POOL_SIZE) {
		struct page * page = alloc_page(GFP_ATOMIC);
		if (!page) {
			printk("couldn't refill highmem emergency pages");
			break;
		}
		list_add(&page->list, &emergency_pages);
		nr_emergency_pages++;
	}
	while (nr_emergency_bhs < POOL_SIZE) {
		struct buffer_head * bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
		if (!bh) {
			printk("couldn't refill highmem emergency bhs");
			break;
		}
		list_add(&bh->b_inode_buffers, &emergency_bhs);
		nr_emergency_bhs++;
	}
	spin_unlock_irq(&emergency_lock);
	printk("allocated %d pages and %d bhs reserved for the highmem bounces\n",
	       nr_emergency_pages, nr_emergency_bhs);

	return 0;
}

__initcall(init_emergency_pool);

static void bounce_end_io_write (struct buffer_head *bh, int uptodate)
{
	bounce_end_io(bh, uptodate);
}

static void bounce_end_io_read (struct buffer_head *bh, int uptodate)
{
	struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_private);

	if (uptodate)
		copy_to_high_bh_irq(bh_orig, bh);
	bounce_end_io(bh, uptodate);
}

struct page *alloc_bounce_page (void)
{
	struct list_head *tmp;
	struct page *page;

	page = alloc_page(GFP_NOHIGHIO);
	if (page)
		return page;
	/*
	 * No luck. First, kick the VM so it doesn't idle around while
	 * we are using up our emergency rations.
	 */
	wakeup_bdflush();

repeat_alloc:
	/*
	 * Try to allocate from the emergency pool.
	 */
	tmp = &emergency_pages;
	spin_lock_irq(&emergency_lock);
	if (!list_empty(tmp)) {
		page = list_entry(tmp->next, struct page, list);
		list_del(tmp->next);
		nr_emergency_pages--;
	}
	spin_unlock_irq(&emergency_lock);
	if (page)
		return page;

	/* we need to wait for I/O completion */
	run_task_queue(&tq_disk);

	yield();
	goto repeat_alloc;
}

struct buffer_head *alloc_bounce_bh (void)
{
	struct list_head *tmp;
	struct buffer_head *bh;

	bh = kmem_cache_alloc(bh_cachep, SLAB_NOHIGHIO);
	if (bh)
		return bh;
	/*
	 * No luck. First, kick the VM so it doesn't idle around while
	 * we are using up our emergency rations.
	 */
	wakeup_bdflush();

repeat_alloc:
	/*
	 * Try to allocate from the emergency pool.
	 */
	tmp = &emergency_bhs;
	spin_lock_irq(&emergency_lock);
	if (!list_empty(tmp)) {
		bh = list_entry(tmp->next, struct buffer_head, b_inode_buffers);
		list_del(tmp->next);
		nr_emergency_bhs--;
	}
	spin_unlock_irq(&emergency_lock);
	if (bh)
		return bh;

	/* we need to wait for I/O completion */
	run_task_queue(&tq_disk);

	yield();
	goto repeat_alloc;
}

struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig)
{
	struct page *page;
	struct buffer_head *bh;

	if (!PageHighMem(bh_orig->b_page))
		return bh_orig;

	bh = alloc_bounce_bh();
	/*
	 * This is wasteful for 1k buffers, but this is a stopgap measure
	 * and we are being ineffective anyway. This approach simplifies
	 * things immensely. On boxes with more than 4GB RAM this should
	 * not be an issue anyway.
	 */
	page = alloc_bounce_page();

	set_bh_page(bh, page, 0);

	bh->b_next = NULL;
	bh->b_blocknr = bh_orig->b_blocknr;
	bh->b_size = bh_orig->b_size;
	bh->b_list = -1;
	bh->b_dev = bh_orig->b_dev;
	bh->b_count = bh_orig->b_count;
	bh->b_rdev = bh_orig->b_rdev;
	bh->b_state = bh_orig->b_state;
#ifdef HIGHMEM_DEBUG
	bh->b_flushtime = jiffies;
	bh->b_next_free = NULL;
	bh->b_prev_free = NULL;
	/* bh->b_this_page */
	bh->b_reqnext = NULL;
	bh->b_pprev = NULL;
#endif
	/* bh->b_page */
	if (rw == WRITE) {
		bh->b_end_io = bounce_end_io_write;
		copy_from_high_bh(bh, bh_orig);
	} else
		bh->b_end_io = bounce_end_io_read;
	bh->b_private = (void *)bh_orig;
	bh->b_rsector = bh_orig->b_rsector;
#ifdef HIGHMEM_DEBUG
	memset(&bh->b_wait, -1, sizeof(bh->b_wait));
#endif

	return bh;
}
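
/*
 * Illustrative sketch only (never called here; the function name is
 * hypothetical): roughly how a block I/O submission path is expected to
 * use create_bounce().  For WRITE the data is copied into the lowmem
 * bounce page up front; for READ, bounce_end_io_read() copies it back
 * into the highmem page at completion time and then completes the
 * original bh.
 */
#if 0	/* example only */
static void submit_bh_bounced_example(int rw, struct buffer_head *bh)
{
	/* Substitute a lowmem bounce bh if the buffer's page is in highmem. */
	struct buffer_head *out = create_bounce(rw, bh);

	/* Hand the (possibly bounced) buffer_head to the block layer. */
	generic_make_request(rw, out);
}
#endif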