1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  Page table allocation functions
4  *
5  *    Copyright IBM Corp. 2016
6  *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
7  */
8 
9 #include <linux/sysctl.h>
10 #include <linux/slab.h>
11 #include <linux/mm.h>
12 #include <asm/mmu_context.h>
13 #include <asm/pgalloc.h>
14 #include <asm/gmap.h>
15 #include <asm/tlb.h>
16 #include <asm/tlbflush.h>
17 
18 #ifdef CONFIG_PGSTE
19 
/*
 * Backing storage for the "allocate_pgste" sysctl defined below.
 * Zero-initialized like every object with static storage duration.
 */
int page_table_allocate_pgste;
EXPORT_SYMBOL(page_table_allocate_pgste);
22 
23 static struct ctl_table page_table_sysctl[] = {
24 	{
25 		.procname	= "allocate_pgste",
26 		.data		= &page_table_allocate_pgste,
27 		.maxlen		= sizeof(int),
28 		.mode		= S_IRUGO | S_IWUSR,
29 		.proc_handler	= proc_dointvec_minmax,
30 		.extra1		= SYSCTL_ZERO,
31 		.extra2		= SYSCTL_ONE,
32 	},
33 	{ }
34 };
35 
36 static struct ctl_table page_table_sysctl_dir[] = {
37 	{
38 		.procname	= "vm",
39 		.maxlen		= 0,
40 		.mode		= 0555,
41 		.child		= page_table_sysctl,
42 	},
43 	{ }
44 };
45 
page_table_register_sysctl(void)46 static int __init page_table_register_sysctl(void)
47 {
48 	return register_sysctl_table(page_table_sysctl_dir) ? 0 : -ENOMEM;
49 }
50 __initcall(page_table_register_sysctl);
51 
52 #endif /* CONFIG_PGSTE */
53 
crst_table_alloc(struct mm_struct * mm)54 unsigned long *crst_table_alloc(struct mm_struct *mm)
55 {
56 	struct page *page = alloc_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
57 
58 	if (!page)
59 		return NULL;
60 	arch_set_page_dat(page, CRST_ALLOC_ORDER);
61 	return (unsigned long *) page_to_virt(page);
62 }
63 
/*
 * Free a region/segment (crst) table of order CRST_ALLOC_ORDER.
 * @mm is not used here; @table is the table's kernel virtual address.
 */
void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	free_pages((unsigned long)table, CRST_ALLOC_ORDER);
}
68 
/*
 * Per-CPU callback of crst_table_upgrade(), invoked through on_each_cpu().
 * @arg: the mm_struct whose ASCE has just been replaced.
 *
 * If this CPU currently runs on that address space, the new ASCE is copied
 * into the lowcore and loaded into control register 7. The local TLB is
 * flushed unconditionally.
 */
static void __crst_table_upgrade(void *arg)
{
	struct mm_struct *mm = arg;

	/* change all active ASCEs to avoid the creation of new TLBs */
	if (current->active_mm == mm) {
		S390_lowcore.user_asce = mm->context.asce;
		__ctl_load(S390_lowcore.user_asce, 7, 7);
	}
	__tlb_flush_local();
}
80 
/*
 * crst_table_upgrade - add page table levels so that @end is covered
 * @mm: address space to upgrade
 * @end: end of the address range that must be addressable afterwards
 *
 * Nothing happens if @end already lies within mm->context.asce_limit.
 * Otherwise up to two new top-level tables are allocated before taking
 * page_table_lock. Under the lock the current top-level table is linked
 * below the new one(s) and mm->pgd, asce_limit and asce are updated.
 * Finally every CPU runs __crst_table_upgrade().
 *
 * Returns 0 on success or when no upgrade was needed, -ENOMEM if a table
 * allocation failed.
 */
int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
{
	unsigned long *pgd = NULL, *p4d = NULL, *__pgd;
	unsigned long asce_limit = mm->context.asce_limit;

	/* upgrade should only happen from 3 to 4, 3 to 5, or 4 to 5 levels */
	VM_BUG_ON(asce_limit < _REGION2_SIZE);

	if (end <= asce_limit)
		return 0;

	/* Currently 3 levels: a region-second table is needed in any case */
	if (asce_limit == _REGION2_SIZE) {
		p4d = crst_table_alloc(mm);
		if (unlikely(!p4d))
			goto err_p4d;
		crst_table_init(p4d, _REGION2_ENTRY_EMPTY);
	}
	/* Target exceeds 4 levels: a region-first table is needed as well */
	if (end > _REGION1_SIZE) {
		pgd = crst_table_alloc(mm);
		if (unlikely(!pgd))
			goto err_pgd;
		crst_table_init(pgd, _REGION1_ENTRY_EMPTY);
	}

	spin_lock_bh(&mm->page_table_lock);

	/*
	 * This routine gets called with mmap_lock lock held and there is
	 * no reason to optimize for the case of otherwise. However, if
	 * that would ever change, the below check will let us know.
	 */
	VM_BUG_ON(asce_limit != mm->context.asce_limit);

	if (p4d) {
		/* Old top-level table becomes the first entry of the new p4d */
		__pgd = (unsigned long *) mm->pgd;
		p4d_populate(mm, (p4d_t *) p4d, (pud_t *) __pgd);
		mm->pgd = (pgd_t *) p4d;
		mm->context.asce_limit = _REGION1_SIZE;
		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			_ASCE_USER_BITS | _ASCE_TYPE_REGION2;
		mm_inc_nr_puds(mm);
	}
	if (pgd) {
		/* mm->pgd may already be the p4d installed just above */
		__pgd = (unsigned long *) mm->pgd;
		pgd_populate(mm, (pgd_t *) pgd, (p4d_t *) __pgd);
		mm->pgd = (pgd_t *) pgd;
		mm->context.asce_limit = TASK_SIZE_MAX;
		mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			_ASCE_USER_BITS | _ASCE_TYPE_REGION1;
	}

	spin_unlock_bh(&mm->page_table_lock);

	/* Last argument 0: do not wait for the other CPUs to finish */
	on_each_cpu(__crst_table_upgrade, mm, 0);

	return 0;

err_pgd:
	/*
	 * NOTE(review): p4d is NULL here when only the pgd was requested;
	 * this relies on free_pages() ignoring a zero address - confirm.
	 */
	crst_table_free(mm, p4d);
err_p4d:
	return -ENOMEM;
}
143 
/*
 * Atomically toggle @bits in @v with a compare-and-exchange retry loop.
 * Returns the value that was stored, i.e. the previous value XOR @bits.
 */
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
	unsigned int cur, updated;

	for (;;) {
		cur = atomic_read(v);
		updated = cur ^ bits;
		/* Succeeds only if nobody changed *v since it was read */
		if (atomic_cmpxchg(v, cur, updated) == cur)
			return updated;
	}
}
154 
155 #ifdef CONFIG_PGSTE
156 
page_table_alloc_pgste(struct mm_struct * mm)157 struct page *page_table_alloc_pgste(struct mm_struct *mm)
158 {
159 	struct page *page;
160 	u64 *table;
161 
162 	page = alloc_page(GFP_KERNEL);
163 	if (page) {
164 		table = (u64 *)page_to_virt(page);
165 		memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
166 		memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
167 	}
168 	return page;
169 }
170 
/* Release a page obtained from page_table_alloc_pgste(). */
void page_table_free_pgste(struct page *page)
{
	__free_page(page);
}
175 
176 #endif /* CONFIG_PGSTE */
177 
178 /*
179  * A 2KB-pgtable is either upper or lower half of a normal page.
180  * The second half of the page may be unused or used as another
181  * 2KB-pgtable.
182  *
183  * Whenever possible the parent page for a new 2KB-pgtable is picked
184  * from the list of partially allocated pages mm_context_t::pgtable_list.
185  * In case the list is empty a new parent page is allocated and added to
186  * the list.
187  *
188  * When a parent page gets fully allocated it contains 2KB-pgtables in both
189  * upper and lower halves and is removed from mm_context_t::pgtable_list.
190  *
 * When a 2KB-pgtable is freed from a fully allocated parent page, that
 * page turns partially allocated and is added to mm_context_t::pgtable_list.
193  *
194  * If 2KB-pgtable is freed from the partially allocated parent page that
195  * page turns unused and gets removed from mm_context_t::pgtable_list.
196  * Furthermore, the unused parent page is released.
197  *
198  * As follows from the above, no unallocated or fully allocated parent
199  * pages are contained in mm_context_t::pgtable_list.
200  *
201  * The upper byte (bits 24-31) of the parent page _refcount is used
202  * for tracking contained 2KB-pgtables and has the following format:
203  *
204  *   PP  AA
205  * 01234567    upper byte (bits 24-31) of struct page::_refcount
206  *   ||  ||
207  *   ||  |+--- upper 2KB-pgtable is allocated
208  *   ||  +---- lower 2KB-pgtable is allocated
209  *   |+------- upper 2KB-pgtable is pending for removal
210  *   +-------- lower 2KB-pgtable is pending for removal
211  *
212  * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
213  * using _refcount is possible).
214  *
215  * When 2KB-pgtable is allocated the corresponding AA bit is set to 1.
216  * The parent page is either:
217  *   - added to mm_context_t::pgtable_list in case the second half of the
218  *     parent page is still unallocated;
 *   - removed from mm_context_t::pgtable_list in case both halves of the
220  *     parent page are allocated;
221  * These operations are protected with mm_context_t::lock.
222  *
223  * When 2KB-pgtable is deallocated the corresponding AA bit is set to 0
224  * and the corresponding PP bit is set to 1 in a single atomic operation.
225  * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
226  * exclusive and may never be both set to 1!
227  * The parent page is either:
228  *   - added to mm_context_t::pgtable_list in case the second half of the
229  *     parent page is still allocated;
230  *   - removed from mm_context_t::pgtable_list in case the second half of
231  *     the parent page is unallocated;
232  * These operations are protected with mm_context_t::lock.
233  *
234  * It is important to understand that mm_context_t::lock only protects
235  * mm_context_t::pgtable_list and AA bits, but not the parent page itself
236  * and PP bits.
237  *
238  * Releasing the parent page happens whenever the PP bit turns from 1 to 0,
239  * while both AA bits and the second PP bit are already unset. Then the
240  * parent page does not contain any 2KB-pgtable fragment anymore, and it has
241  * also been removed from mm_context_t::pgtable_list. It is safe to release
242  * the page therefore.
243  *
244  * PGSTE memory spaces use full 4KB-pgtables and do not need most of the
245  * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
246  * while the PP bits are never used, nor such a page is added to or removed
247  * from mm_context_t::pgtable_list.
248  */
/*
 * page_table_alloc - allocate a page table for @mm
 *
 * Without PGSTEs, only the first page on mm->context.pgtable_list is
 * examined; if it has a half that is neither allocated nor pending removal,
 * that 2KB half is handed out. Otherwise, and always with PGSTEs, a fresh
 * page is allocated.
 *
 * Returns the kernel virtual address of the table, or NULL if the page
 * allocation or pgtable_pte_page_ctor() failed.
 */
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	unsigned long *table;
	struct page *page;
	unsigned int mask, bit;

	/* Try to get a fragment of a 4K page as a 2K page table */
	if (!mm_alloc_pgste(mm)) {
		table = NULL;
		spin_lock_bh(&mm->context.lock);
		if (!list_empty(&mm->context.pgtable_list)) {
			page = list_first_entry(&mm->context.pgtable_list,
						struct page, lru);
			/* Upper byte of _refcount holds the PP/AA tracking bits */
			mask = atomic_read(&page->_refcount) >> 24;
			/*
			 * The pending removal bits must also be checked.
			 * Failure to do so might lead to an impossible
			 * value (e.g. 0x13 or 0x23) being written to _refcount.
			 * Such values violate the assumption that pending and
			 * allocation bits are mutually exclusive, and the rest
			 * of the code derails as a result. That could lead to
			 * a whole bunch of races and corruptions.
			 */
			/* Fold PP bits onto AA bits: a set bit means "half busy" */
			mask = (mask | (mask >> 4)) & 0x03U;
			if (mask != 0x03U) {
				table = (unsigned long *) page_to_virt(page);
				bit = mask & 1;		/* =1 -> second 2K */
				if (bit)
					table += PTRS_PER_PTE;
				/* Set the AA bit of the half being handed out */
				atomic_xor_bits(&page->_refcount,
							0x01U << (bit + 24));
				list_del(&page->lru);
			}
		}
		spin_unlock_bh(&mm->context.lock);
		if (table)
			return table;
	}
	/* Allocate a fresh page */
	page = alloc_page(GFP_KERNEL);
	if (!page)
		return NULL;
	if (!pgtable_pte_page_ctor(page)) {
		__free_page(page);
		return NULL;
	}
	arch_set_page_dat(page, 0);
	/* Initialize page table */
	table = (unsigned long *) page_to_virt(page);
	if (mm_alloc_pgste(mm)) {
		/* Return 4K page table with PGSTEs */
		atomic_xor_bits(&page->_refcount, 0x03U << 24);
		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
		memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
	} else {
		/* Return the first 2K fragment of the page */
		atomic_xor_bits(&page->_refcount, 0x01U << 24);
		/* Both halves are initialized, the upper one is still free */
		memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
		spin_lock_bh(&mm->context.lock);
		list_add(&page->lru, &mm->context.pgtable_list);
		spin_unlock_bh(&mm->context.lock);
	}
	return table;
}
313 
page_table_release_check(struct page * page,void * table,unsigned int half,unsigned int mask)314 static void page_table_release_check(struct page *page, void *table,
315 				     unsigned int half, unsigned int mask)
316 {
317 	char msg[128];
318 
319 	if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask)
320 		return;
321 	snprintf(msg, sizeof(msg),
322 		 "Invalid pgtable %p release half 0x%02x mask 0x%02x",
323 		 table, half, mask);
324 	dump_page(page, msg);
325 }
326 
/*
 * page_table_free - free a page table of @mm immediately
 * @mm: owning address space
 * @table: kernel virtual address of the 2KB fragment or 4KB table
 *
 * Without PGSTEs the fragment's AA bit is cleared and its PP bit set under
 * mm->context.lock, the parent page is put on or taken off the pgtable_list,
 * and the PP bit is cleared again outside the lock. The parent page is
 * released only if the whole tracking byte is zero afterwards. With PGSTEs
 * both AA bits are cleared and the page is released.
 */
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	unsigned int mask, bit, half;
	struct page *page;

	page = virt_to_page(table);
	if (!mm_alloc_pgste(mm)) {
		/* Free 2K page table fragment of a 4K page */
		/* bit = 0 for the lower half of the page, 1 for the upper half */
		bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
		spin_lock_bh(&mm->context.lock);
		/*
		 * Mark the page for delayed release. The actual release
		 * will happen outside of the critical section from this
		 * function or from __tlb_remove_table()
		 */
		mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
		mask >>= 24;
		/* Other half still allocated: page is now partially allocated */
		if (mask & 0x03U)
			list_add(&page->lru, &mm->context.pgtable_list);
		else
			list_del(&page->lru);
		spin_unlock_bh(&mm->context.lock);
		/* Drop the pending bit that was set above */
		mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
		mask >>= 24;
		/* Some AA or PP bit is still set: the page must stay */
		if (mask != 0x00U)
			return;
		half = 0x01U << bit;
	} else {
		half = 0x03U;
		mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
		mask >>= 24;
	}

	page_table_release_check(page, table, half, mask);
	pgtable_pte_page_dtor(page);
	__free_page(page);
}
364 
/*
 * page_table_free_rcu - queue a page table for deferred freeing
 * @tlb: mmu_gather the table is handed to; its mm owns the table
 * @table: kernel virtual address of the 2KB fragment or 4KB table
 * @vmaddr: passed to gmap_unlink() in the PGSTE case
 *
 * The kind of table is encoded in the two low bits of the pointer given to
 * tlb_remove_table() (0x01 lower 2KB, 0x02 upper 2KB, 0x03 4KB with PGSTEs)
 * and decoded again in __tlb_remove_table().
 */
void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
			 unsigned long vmaddr)
{
	struct mm_struct *mm;
	struct page *page;
	unsigned int bit, mask;

	mm = tlb->mm;
	page = virt_to_page(table);
	if (mm_alloc_pgste(mm)) {
		gmap_unlink(mm, table, vmaddr);
		table = (unsigned long *) ((unsigned long)table | 0x03U);
		tlb_remove_table(tlb, table);
		return;
	}
	/* bit = 0 for the lower half of the page, 1 for the upper half */
	bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
	spin_lock_bh(&mm->context.lock);
	/*
	 * Mark the page for delayed release. The actual release will happen
	 * outside of the critical section from __tlb_remove_table() or from
	 * page_table_free()
	 */
	mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
	mask >>= 24;
	/* Other half still allocated: page goes to the tail of the list */
	if (mask & 0x03U)
		list_add_tail(&page->lru, &mm->context.pgtable_list);
	else
		list_del(&page->lru);
	spin_unlock_bh(&mm->context.lock);
	/* The PP bit stays set until __tlb_remove_table() clears it */
	table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
	tlb_remove_table(tlb, table);
}
397 
/*
 * __tlb_remove_table - free a table that was queued via tlb_remove_table()
 * @_table: table address with the table kind encoded in its two low bits
 *
 * Kind 0x00 is a crst table and is freed directly. For 0x01/0x02 the
 * matching pending (PP) bit is cleared and the parent page is released only
 * if the whole tracking byte became zero. For 0x03 both AA bits are cleared
 * and the page is released.
 */
void __tlb_remove_table(void *_table)
{
	unsigned int mask = (unsigned long) _table & 0x03U, half = mask;
	/* Strip the kind bits to recover the real table address */
	void *table = (void *)((unsigned long) _table ^ mask);
	struct page *page = virt_to_page(table);

	switch (half) {
	case 0x00U:	/* pmd, pud, or p4d */
		free_pages((unsigned long)table, CRST_ALLOC_ORDER);
		return;
	case 0x01U:	/* lower 2K of a 4K page table */
	case 0x02U:	/* higher 2K of a 4K page table */
		/* PP bits sit four positions above the AA bits */
		mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
		mask >>= 24;
		if (mask != 0x00U)
			return;
		break;
	case 0x03U:	/* 4K page table with pgstes */
		mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
		mask >>= 24;
		break;
	}

	page_table_release_check(page, table, half, mask);
	pgtable_pte_page_dtor(page);
	__free_page(page);
}
425 
426 /*
427  * Base infrastructure required to generate basic asces, region, segment,
428  * and page tables that do not make use of enhanced features like EDAT1.
429  */
430 
/* Slab cache for base page tables; created on demand by base_pgt_cache_init() */
static struct kmem_cache *base_pgt_cache;
432 
base_pgt_alloc(void)433 static unsigned long *base_pgt_alloc(void)
434 {
435 	unsigned long *table;
436 
437 	table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
438 	if (table)
439 		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
440 	return table;
441 }
442 
/* Return a page table obtained from base_pgt_alloc() to base_pgt_cache. */
static void base_pgt_free(unsigned long *table)
{
	kmem_cache_free(base_pgt_cache, table);
}
447 
base_crst_alloc(unsigned long val)448 static unsigned long *base_crst_alloc(unsigned long val)
449 {
450 	unsigned long *table;
451 
452 	table =	(unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
453 	if (table)
454 		crst_table_init(table, val);
455 	return table;
456 }
457 
/* Free a region/segment (crst) table obtained from base_crst_alloc(). */
static void base_crst_free(unsigned long *table)
{
	free_pages((unsigned long)table, CRST_ALLOC_ORDER);
}
462 
463 #define BASE_ADDR_END_FUNC(NAME, SIZE)					\
464 static inline unsigned long base_##NAME##_addr_end(unsigned long addr,	\
465 						   unsigned long end)	\
466 {									\
467 	unsigned long next = (addr + (SIZE)) & ~((SIZE) - 1);		\
468 									\
469 	return (next - 1) < (end - 1) ? next : end;			\
470 }
471 
BASE_ADDR_END_FUNC(page,_PAGE_SIZE)472 BASE_ADDR_END_FUNC(page,    _PAGE_SIZE)
473 BASE_ADDR_END_FUNC(segment, _SEGMENT_SIZE)
474 BASE_ADDR_END_FUNC(region3, _REGION3_SIZE)
475 BASE_ADDR_END_FUNC(region2, _REGION2_SIZE)
476 BASE_ADDR_END_FUNC(region1, _REGION1_SIZE)
477 
/*
 * Execute the "lra" instruction for @address and return the value it
 * leaves in the output register. The condition code is clobbered and not
 * evaluated here.
 * NOTE(review): presumably the result is the real address backing @address
 * in the current kernel mapping - confirm against the architecture manual.
 */
static inline unsigned long base_lra(unsigned long address)
{
	unsigned long real;

	asm volatile(
		"	lra	%0,0(%1)\n"
		: "=d" (real) : "a" (address) : "cc");
	return real;
}
487 
base_page_walk(unsigned long * origin,unsigned long addr,unsigned long end,int alloc)488 static int base_page_walk(unsigned long *origin, unsigned long addr,
489 			  unsigned long end, int alloc)
490 {
491 	unsigned long *pte, next;
492 
493 	if (!alloc)
494 		return 0;
495 	pte = origin;
496 	pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
497 	do {
498 		next = base_page_addr_end(addr, end);
499 		*pte = base_lra(addr);
500 	} while (pte++, addr = next, addr < end);
501 	return 0;
502 }
503 
base_segment_walk(unsigned long * origin,unsigned long addr,unsigned long end,int alloc)504 static int base_segment_walk(unsigned long *origin, unsigned long addr,
505 			     unsigned long end, int alloc)
506 {
507 	unsigned long *ste, next, *table;
508 	int rc;
509 
510 	ste = origin;
511 	ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
512 	do {
513 		next = base_segment_addr_end(addr, end);
514 		if (*ste & _SEGMENT_ENTRY_INVALID) {
515 			if (!alloc)
516 				continue;
517 			table = base_pgt_alloc();
518 			if (!table)
519 				return -ENOMEM;
520 			*ste = __pa(table) | _SEGMENT_ENTRY;
521 		}
522 		table = __va(*ste & _SEGMENT_ENTRY_ORIGIN);
523 		rc = base_page_walk(table, addr, next, alloc);
524 		if (rc)
525 			return rc;
526 		if (!alloc)
527 			base_pgt_free(table);
528 		cond_resched();
529 	} while (ste++, addr = next, addr < end);
530 	return 0;
531 }
532 
base_region3_walk(unsigned long * origin,unsigned long addr,unsigned long end,int alloc)533 static int base_region3_walk(unsigned long *origin, unsigned long addr,
534 			     unsigned long end, int alloc)
535 {
536 	unsigned long *rtte, next, *table;
537 	int rc;
538 
539 	rtte = origin;
540 	rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
541 	do {
542 		next = base_region3_addr_end(addr, end);
543 		if (*rtte & _REGION_ENTRY_INVALID) {
544 			if (!alloc)
545 				continue;
546 			table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
547 			if (!table)
548 				return -ENOMEM;
549 			*rtte = __pa(table) | _REGION3_ENTRY;
550 		}
551 		table = __va(*rtte & _REGION_ENTRY_ORIGIN);
552 		rc = base_segment_walk(table, addr, next, alloc);
553 		if (rc)
554 			return rc;
555 		if (!alloc)
556 			base_crst_free(table);
557 	} while (rtte++, addr = next, addr < end);
558 	return 0;
559 }
560 
base_region2_walk(unsigned long * origin,unsigned long addr,unsigned long end,int alloc)561 static int base_region2_walk(unsigned long *origin, unsigned long addr,
562 			     unsigned long end, int alloc)
563 {
564 	unsigned long *rste, next, *table;
565 	int rc;
566 
567 	rste = origin;
568 	rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
569 	do {
570 		next = base_region2_addr_end(addr, end);
571 		if (*rste & _REGION_ENTRY_INVALID) {
572 			if (!alloc)
573 				continue;
574 			table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
575 			if (!table)
576 				return -ENOMEM;
577 			*rste = __pa(table) | _REGION2_ENTRY;
578 		}
579 		table = __va(*rste & _REGION_ENTRY_ORIGIN);
580 		rc = base_region3_walk(table, addr, next, alloc);
581 		if (rc)
582 			return rc;
583 		if (!alloc)
584 			base_crst_free(table);
585 	} while (rste++, addr = next, addr < end);
586 	return 0;
587 }
588 
base_region1_walk(unsigned long * origin,unsigned long addr,unsigned long end,int alloc)589 static int base_region1_walk(unsigned long *origin, unsigned long addr,
590 			     unsigned long end, int alloc)
591 {
592 	unsigned long *rfte, next, *table;
593 	int rc;
594 
595 	rfte = origin;
596 	rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
597 	do {
598 		next = base_region1_addr_end(addr, end);
599 		if (*rfte & _REGION_ENTRY_INVALID) {
600 			if (!alloc)
601 				continue;
602 			table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
603 			if (!table)
604 				return -ENOMEM;
605 			*rfte = __pa(table) | _REGION1_ENTRY;
606 		}
607 		table = __va(*rfte & _REGION_ENTRY_ORIGIN);
608 		rc = base_region2_walk(table, addr, next, alloc);
609 		if (rc)
610 			return rc;
611 		if (!alloc)
612 			base_crst_free(table);
613 	} while (rfte++, addr = next, addr < end);
614 	return 0;
615 }
616 
617 /**
618  * base_asce_free - free asce and tables returned from base_asce_alloc()
619  * @asce: asce to be freed
620  *
621  * Frees all region, segment, and page tables that were allocated with a
622  * corresponding base_asce_alloc() call.
623  */
base_asce_free(unsigned long asce)624 void base_asce_free(unsigned long asce)
625 {
626 	unsigned long *table = __va(asce & _ASCE_ORIGIN);
627 
628 	if (!asce)
629 		return;
630 	switch (asce & _ASCE_TYPE_MASK) {
631 	case _ASCE_TYPE_SEGMENT:
632 		base_segment_walk(table, 0, _REGION3_SIZE, 0);
633 		break;
634 	case _ASCE_TYPE_REGION3:
635 		base_region3_walk(table, 0, _REGION2_SIZE, 0);
636 		break;
637 	case _ASCE_TYPE_REGION2:
638 		base_region2_walk(table, 0, _REGION1_SIZE, 0);
639 		break;
640 	case _ASCE_TYPE_REGION1:
641 		base_region1_walk(table, 0, TASK_SIZE_MAX, 0);
642 		break;
643 	}
644 	base_crst_free(table);
645 }
646 
/*
 * Create base_pgt_cache on first use.
 *
 * The cache pointer is checked once without the mutex as a fast path and
 * again with the mutex held, so that concurrent first callers create the
 * cache only once. Object size and alignment are both _PAGE_TABLE_SIZE.
 *
 * Returns 0 if the cache exists afterwards, -ENOMEM otherwise.
 */
static int base_pgt_cache_init(void)
{
	static DEFINE_MUTEX(base_pgt_cache_mutex);
	unsigned long sz = _PAGE_TABLE_SIZE;

	if (base_pgt_cache)
		return 0;
	mutex_lock(&base_pgt_cache_mutex);
	if (!base_pgt_cache)
		base_pgt_cache = kmem_cache_create("base_pgt", sz, sz, 0, NULL);
	mutex_unlock(&base_pgt_cache_mutex);
	return base_pgt_cache ? 0 : -ENOMEM;
}
660 
661 /**
662  * base_asce_alloc - create kernel mapping without enhanced DAT features
663  * @addr: virtual start address of kernel mapping
664  * @num_pages: number of consecutive pages
665  *
666  * Generate an asce, including all required region, segment and page tables,
667  * that can be used to access the virtual kernel mapping. The difference is
668  * that the returned asce does not make use of any enhanced DAT features like
669  * e.g. large pages. This is required for some I/O functions that pass an
670  * asce, like e.g. some service call requests.
671  *
672  * Note: the returned asce may NEVER be attached to any cpu. It may only be
673  *	 used for I/O requests. tlb entries that might result because the
674  *	 asce was attached to a cpu won't be cleared.
675  */
base_asce_alloc(unsigned long addr,unsigned long num_pages)676 unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
677 {
678 	unsigned long asce, *table, end;
679 	int rc;
680 
681 	if (base_pgt_cache_init())
682 		return 0;
683 	end = addr + num_pages * PAGE_SIZE;
684 	if (end <= _REGION3_SIZE) {
685 		table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
686 		if (!table)
687 			return 0;
688 		rc = base_segment_walk(table, addr, end, 1);
689 		asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
690 	} else if (end <= _REGION2_SIZE) {
691 		table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
692 		if (!table)
693 			return 0;
694 		rc = base_region3_walk(table, addr, end, 1);
695 		asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
696 	} else if (end <= _REGION1_SIZE) {
697 		table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
698 		if (!table)
699 			return 0;
700 		rc = base_region2_walk(table, addr, end, 1);
701 		asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
702 	} else {
703 		table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
704 		if (!table)
705 			return 0;
706 		rc = base_region1_walk(table, addr, end, 1);
707 		asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
708 	}
709 	if (rc) {
710 		base_asce_free(asce);
711 		asce = 0;
712 	}
713 	return asce;
714 }
715