/*
 *    Copyright IBM Corp. 2007,2009
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>
#include <linux/rcupdate.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

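/*
 * A batch of page tables queued for RCU-delayed freeing. Page table
 * fragments fill table[] from the front (pgt_index counts up), crst
 * tables fill it from the back (crst_index counts down); the batch is
 * full when the two indices meet.
 */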
struct rcu_table_freelist {
	struct rcu_head rcu;
	struct mm_struct *mm;
	unsigned int pgt_index;
	unsigned int crst_index;
	unsigned long *table[0];
};

#define RCU_FREELIST_SIZE \
	((PAGE_SIZE - sizeof(struct rcu_table_freelist)) \
	  / sizeof(unsigned long))

DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
static DEFINE_PER_CPU(struct rcu_table_freelist *, rcu_table_freelist);

static void __page_table_free(struct mm_struct *mm, unsigned long *table);
static void __crst_table_free(struct mm_struct *mm, unsigned long *table);

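/*
 * Return the pending rcu freelist batch for this cpu, allocating a
 * fresh one with GFP_ATOMIC if none exists. Returns NULL if the
 * allocation fails; callers then fall back to synchronous freeing.
 */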
static struct rcu_table_freelist *rcu_table_freelist_get(struct mm_struct *mm)
{
	struct rcu_table_freelist **batchp = &__get_cpu_var(rcu_table_freelist);
	struct rcu_table_freelist *batch = *batchp;

	if (batch)
		return batch;
	batch = (struct rcu_table_freelist *) __get_free_page(GFP_ATOMIC);
	if (batch) {
		batch->mm = mm;
		batch->pgt_index = 0;
		batch->crst_index = RCU_FREELIST_SIZE;
		*batchp = batch;
	}
	return batch;
}

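/*
 * RCU callback: free all page table fragments and crst tables queued
 * in the batch, then release the batch page itself.
 */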
static void rcu_table_freelist_callback(struct rcu_head *head)
{
	struct rcu_table_freelist *batch =
		container_of(head, struct rcu_table_freelist, rcu);

	while (batch->pgt_index > 0)
		__page_table_free(batch->mm, batch->table[--batch->pgt_index]);
	while (batch->crst_index < RCU_FREELIST_SIZE)
		__crst_table_free(batch->mm, batch->table[batch->crst_index++]);
	free_page((unsigned long) batch);
}

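/*
 * Hand the current cpu's pending batch over to RCU so that the queued
 * tables are freed after a grace period has elapsed.
 */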
void rcu_table_freelist_finish(void)
{
	struct rcu_table_freelist *batch = __get_cpu_var(rcu_table_freelist);

	if (!batch)
		return;
	call_rcu(&batch->rcu, rcu_table_freelist_callback);
	__get_cpu_var(rcu_table_freelist) = NULL;
}

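/*
 * Deliberately empty. Used with smp_call_function() as a synchronization
 * point when no rcu batch could be allocated: once every other cpu has
 * taken the IPI, none of them can still be walking the table that is
 * about to be freed directly.
 */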
static void smp_sync(void *arg)
{
}

#ifndef CONFIG_64BIT
#define ALLOC_ORDER	1
#define TABLES_PER_PAGE	4
#define FRAG_MASK	15UL
#define SECOND_HALVES	10UL

void clear_table_pgstes(unsigned long *table)
{
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
	memset(table + 256, 0, PAGE_SIZE/4);
	clear_table(table + 512, _PAGE_TYPE_EMPTY, PAGE_SIZE/4);
	memset(table + 768, 0, PAGE_SIZE/4);
}

#else
#define ALLOC_ORDER	2
#define TABLES_PER_PAGE	2
#define FRAG_MASK	3UL
#define SECOND_HALVES	2UL

void clear_table_pgstes(unsigned long *table)
{
	clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
	memset(table + 256, 0, PAGE_SIZE/2);
}

#endif

unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
EXPORT_SYMBOL(VMALLOC_START);

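/*
 * Handle the "vmalloc=<size>" early parameter: move VMALLOC_START down
 * so that the vmalloc area spans the requested size below VMALLOC_END.
 */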
static int __init parse_vmalloc(char *arg)
{
	if (!arg)
		return -EINVAL;
	VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK;
	return 0;
}
early_param("vmalloc", parse_vmalloc);

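/*
 * Allocate a crst (region or segment) table for @mm and add its page
 * to the mm's crst_list. With the noexec feature a shadow table is
 * allocated as well; its physical address is kept in page->index.
 */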
unsigned long *crst_table_alloc(struct mm_struct *mm, int noexec)
{
	struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);

	if (!page)
		return NULL;
	page->index = 0;
	if (noexec) {
		struct page *shadow = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
		if (!shadow) {
			__free_pages(page, ALLOC_ORDER);
			return NULL;
		}
		page->index = page_to_phys(shadow);
	}
	spin_lock_bh(&mm->context.list_lock);
	list_add(&page->lru, &mm->context.crst_list);
	spin_unlock_bh(&mm->context.list_lock);
	return (unsigned long *) page_to_phys(page);
}

static void __crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	unsigned long *shadow = get_shadow_table(table);

	if (shadow)
		free_pages((unsigned long) shadow, ALLOC_ORDER);
	free_pages((unsigned long) table, ALLOC_ORDER);
}

void crst_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page = virt_to_page(table);

	spin_lock_bh(&mm->context.list_lock);
	list_del(&page->lru);
	spin_unlock_bh(&mm->context.list_lock);
	__crst_table_free(mm, table);
}

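/*
 * Free a crst table. If the mm might still be in use on another cpu,
 * the table is queued on the per-cpu rcu batch and only released after
 * a grace period; otherwise it is freed immediately.
 */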
void crst_table_free_rcu(struct mm_struct *mm, unsigned long *table)
{
	struct rcu_table_freelist *batch;
	struct page *page = virt_to_page(table);

	spin_lock_bh(&mm->context.list_lock);
	list_del(&page->lru);
	spin_unlock_bh(&mm->context.list_lock);
	if (atomic_read(&mm->mm_users) < 2 &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
		__crst_table_free(mm, table);
		return;
	}
	batch = rcu_table_freelist_get(mm);
	if (!batch) {
		smp_call_function(smp_sync, NULL, 1);
		__crst_table_free(mm, table);
		return;
	}
	batch->table[--batch->crst_index] = table;
	if (batch->pgt_index >= batch->crst_index)
		rcu_table_freelist_finish();
}

#ifdef CONFIG_64BIT
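/*
 * Grow the address space of @mm to at least @limit by adding region
 * table levels above the current top-level table: first a region-third
 * table (limit 1UL << 42), then a region-second table (limit 1UL << 53).
 */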
int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
{
	unsigned long *table, *pgd;
	unsigned long entry;

	BUG_ON(limit > (1UL << 53));
repeat:
	table = crst_table_alloc(mm, mm->context.noexec);
	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	if (mm->context.asce_limit < limit) {
		pgd = (unsigned long *) mm->pgd;
		if (mm->context.asce_limit <= (1UL << 31)) {
			entry = _REGION3_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
		} else {
			entry = _REGION2_ENTRY_EMPTY;
			mm->context.asce_limit = 1UL << 53;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION2;
		}
		crst_table_init(table, entry);
		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
		mm->pgd = (pgd_t *) table;
		mm->task_size = mm->context.asce_limit;
		table = NULL;
	}
	spin_unlock_bh(&mm->page_table_lock);
	if (table)
		crst_table_free(mm, table);
	if (mm->context.asce_limit < limit)
		goto repeat;
	update_mm(mm, current);
	return 0;
}

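/*
 * Shrink the address space of @mm back to @limit by removing the
 * topmost region tables again and freeing them.
 */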
void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
{
	pgd_t *pgd;

	if (mm->context.asce_limit <= limit)
		return;
	__tlb_flush_mm(mm);
	while (mm->context.asce_limit > limit) {
		pgd = mm->pgd;
		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
		case _REGION_ENTRY_TYPE_R2:
			mm->context.asce_limit = 1UL << 42;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_REGION3;
			break;
		case _REGION_ENTRY_TYPE_R3:
			mm->context.asce_limit = 1UL << 31;
			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
						_ASCE_USER_BITS |
						_ASCE_TYPE_SEGMENT;
			break;
		default:
			BUG();
		}
		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
		mm->task_size = mm->context.asce_limit;
		crst_table_free(mm, (unsigned long *) pgd);
	}
	update_mm(mm, current);
}
#endif

/*
 * page table entry allocation/free routines.
 */
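/*
 * Each page table occupies only part of a 4K page: a 2K fragment on
 * 64-bit kernels (TABLES_PER_PAGE == 2), a 1K fragment on 31-bit
 * kernels (TABLES_PER_PAGE == 4). When pgstes (for KVM) or the noexec
 * shadow are needed, two adjacent fragments are handed out together.
 * The low bits of page->flags (FRAG_MASK) track which fragments of a
 * page are in use; pages with free fragments are kept at the front of
 * mm->context.pgtable_list.
 *
 * page_table_alloc() returns the address of the fragment; the
 * pte_alloc_one()/pte_alloc_one_kernel() helpers in <asm/pgalloc.h>
 * presumably map onto these routines on this kernel version.
 */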
unsigned long *page_table_alloc(struct mm_struct *mm)
{
	struct page *page;
	unsigned long *table;
	unsigned long bits;

	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
	spin_lock_bh(&mm->context.list_lock);
	page = NULL;
	if (!list_empty(&mm->context.pgtable_list)) {
		page = list_first_entry(&mm->context.pgtable_list,
					struct page, lru);
		if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
			page = NULL;
	}
	if (!page) {
		spin_unlock_bh(&mm->context.list_lock);
		page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
		if (!page)
			return NULL;
		pgtable_page_ctor(page);
		page->flags &= ~FRAG_MASK;
		table = (unsigned long *) page_to_phys(page);
		if (mm->context.has_pgste)
			clear_table_pgstes(table);
		else
			clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
		spin_lock_bh(&mm->context.list_lock);
		list_add(&page->lru, &mm->context.pgtable_list);
	}
	table = (unsigned long *) page_to_phys(page);
	while (page->flags & bits) {
		table += 256;
		bits <<= 1;
	}
	page->flags |= bits;
	if ((page->flags & FRAG_MASK) == ((1UL << TABLES_PER_PAGE) - 1))
		list_move_tail(&page->lru, &mm->context.pgtable_list);
	spin_unlock_bh(&mm->context.list_lock);
	return table;
}

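/*
 * Release a page table fragment whose fragment bits are encoded in the
 * low bits of the table address (as set up by page_table_free_rcu).
 * The containing page is freed once its last fragment is gone.
 */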
static void __page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned long bits;

	bits = ((unsigned long) table) & 15;
	table = (unsigned long *)(((unsigned long) table) ^ bits);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	page->flags ^= bits;
	if (!(page->flags & FRAG_MASK)) {
		pgtable_page_dtor(page);
		__free_page(page);
	}
}

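/*
 * Immediately free a page table fragment: clear its bits in page->flags
 * and give the 4K page back to the buddy allocator once all fragments
 * of the page are free.
 */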
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
	struct page *page;
	unsigned long bits;

	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	page->flags ^= bits;
	if (page->flags & FRAG_MASK) {
		/* Page now has some free pgtable fragments. */
		if (!list_empty(&page->lru))
			list_move(&page->lru, &mm->context.pgtable_list);
		page = NULL;
	} else
		/* All fragments of the 4K page have been freed. */
		list_del(&page->lru);
	spin_unlock_bh(&mm->context.list_lock);
	if (page) {
		pgtable_page_dtor(page);
		__free_page(page);
	}
}

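/*
 * Free a page table fragment via the rcu batch if the mm might still
 * be in use on another cpu; otherwise free it directly. The fragment
 * bits are encoded in the low bits of the queued pointer so that the
 * rcu callback can pass them on to __page_table_free().
 */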
void page_table_free_rcu(struct mm_struct *mm, unsigned long *table)
{
	struct rcu_table_freelist *batch;
	struct page *page;
	unsigned long bits;

	if (atomic_read(&mm->mm_users) < 2 &&
	    cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) {
		page_table_free(mm, table);
		return;
	}
	batch = rcu_table_freelist_get(mm);
	if (!batch) {
		smp_call_function(smp_sync, NULL, 1);
		page_table_free(mm, table);
		return;
	}
	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
	spin_lock_bh(&mm->context.list_lock);
	/* Delayed freeing with rcu prevents reuse of pgtable fragments */
	list_del_init(&page->lru);
	spin_unlock_bh(&mm->context.list_lock);
	table = (unsigned long *)(((unsigned long) table) | bits);
	batch->table[batch->pgt_index++] = table;
	if (batch->pgt_index >= batch->crst_index)
		rcu_table_freelist_finish();
}

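/*
 * Switch off the noexec feature for @mm: release all shadow crst
 * tables and mark the second halves of the page table pages as free
 * again.
 */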
void disable_noexec(struct mm_struct *mm, struct task_struct *tsk)
{
	struct page *page;

	spin_lock_bh(&mm->context.list_lock);
	/* Free shadow region and segment tables. */
	list_for_each_entry(page, &mm->context.crst_list, lru)
		if (page->index) {
			free_pages((unsigned long) page->index, ALLOC_ORDER);
			page->index = 0;
		}
	/* "Free" second halves of page tables. */
	list_for_each_entry(page, &mm->context.pgtable_list, lru)
		page->flags &= ~SECOND_HALVES;
	spin_unlock_bh(&mm->context.list_lock);
	mm->context.noexec = 0;
	update_mm(mm, tsk);
}

/*
 * Switch on pgstes for the userspace process, as needed to run KVM guests.
 */
int s390_enable_sie(void)
{
	struct task_struct *tsk = current;
	struct mm_struct *mm, *old_mm;

	/* Do we have switched amode? If not, we cannot do sie. */
	if (user_mode == HOME_SPACE_MODE)
		return -EINVAL;

	/* Do we have pgstes? If yes, we are done. */
	if (tsk->mm->context.has_pgste)
		return 0;

	/* Let's check whether we are allowed to replace the mm. */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		task_unlock(tsk);
		return -EINVAL;
	}
	task_unlock(tsk);

	/* We copy the mm and let dup_mm create the page tables with pgstes. */
	tsk->mm->context.alloc_pgste = 1;
	mm = dup_mm(tsk);
	tsk->mm->context.alloc_pgste = 0;
	if (!mm)
		return -ENOMEM;

	/* Now let's check again if something happened in the meantime. */
	task_lock(tsk);
	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
#ifdef CONFIG_AIO
	    !hlist_empty(&tsk->mm->ioctx_list) ||
#endif
	    tsk->mm != tsk->active_mm) {
		mmput(mm);
		task_unlock(tsk);
		return -EINVAL;
	}

	/* OK, we are alone. No ptrace, no threads, etc. */
	old_mm = tsk->mm;
	tsk->mm = tsk->active_mm = mm;
	preempt_disable();
	update_mm(mm, tsk);
	atomic_inc(&mm->context.attach_count);
	atomic_dec(&old_mm->context.attach_count);
	cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
	preempt_enable();
	task_unlock(tsk);
	mmput(old_mm);
	return 0;
}
EXPORT_SYMBOL_GPL(s390_enable_sie);

#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
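/*
 * Check whether a page is currently mapped in the kernel address space
 * by doing a load-real-address (lra) on it; condition code 0 means the
 * translation succeeded.
 */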
bool kernel_page_present(struct page *page)
{
	unsigned long addr;
	int cc;

	addr = page_to_phys(page);
	asm volatile(
		"	lra	%1,0(%1)\n"
		"	ipm	%0\n"
		"	srl	%0,28"
		: "=d" (cc), "+a" (addr) : : "cc");
	return cc == 0;
}
#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */