/*
 *  linux/arch/i386/mm/fault.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/tty.h>
#include <linux/vt_kern.h>		/* For unblank_screen() */

#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgalloc.h>
#include <asm/hardirq.h>

extern void die(const char *,struct pt_regs *,long);

/*
 * Ugly, ugly, but the goto's result in better assembly..
 */
int __verify_write(const void * addr, unsigned long size)
{
	struct vm_area_struct * vma;
	unsigned long start = (unsigned long) addr;

	if (!size)
		return 1;

	vma = find_vma(current->mm, start);
	if (!vma)
		goto bad_area;
	if (vma->vm_start > start)
		goto check_stack;

good_area:
	if (!(vma->vm_flags & VM_WRITE))
		goto bad_area;
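	/*
	 * Convert the byte range into a page count: afterwards "size"
	 * holds the number of pages beyond the first that the range
	 * touches ((offset_in_page + len - 1) >> PAGE_SHIFT), and
	 * "start" is rounded down to a page boundary.
	 */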
	size--;
	size += start & ~PAGE_MASK;
	size >>= PAGE_SHIFT;
	start &= PAGE_MASK;

	for (;;) {
	survive:
		{
			int fault = handle_mm_fault(current->mm, vma, start, 1);
			if (!fault)
				goto bad_area;
			if (fault < 0)
				goto out_of_memory;
		}
		if (!size)
			break;
		size--;
		start += PAGE_SIZE;
		if (start < vma->vm_end)
			continue;
		vma = vma->vm_next;
		if (!vma || vma->vm_start != start)
			goto bad_area;
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
	}
	return 1;

check_stack:
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (expand_stack(vma, start) == 0)
		goto good_area;

bad_area:
	return 0;

out_of_memory:
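	/*
	 * init must not fail here: yield so something else can free
	 * memory, then retry the fault.
	 */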
	if (current->pid == 1) {
		yield();
		goto survive;
	}
	goto bad_area;
}

extern spinlock_t timerlist_lock;

/*
 * Unlock any spinlocks which will prevent us from getting the
 * message out (timerlist_lock is acquired through the
 * console unblank code)
 */
void bust_spinlocks(int yes)
{
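	/*
	 * Re-initialise timerlist_lock unconditionally: the oopsing
	 * context may hold it, and we would deadlock in the console
	 * unblank path below.
	 */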
	spin_lock_init(&timerlist_lock);
	if (yes) {
		oops_in_progress = 1;
#ifdef CONFIG_SMP
		global_irq_lock = 0;	/* Many serial drivers do __global_cli() */
#endif
	} else {
		int loglevel_save = console_loglevel;
#ifdef CONFIG_VT
		unblank_screen();
#endif
		oops_in_progress = 0;
		/*
		 * OK, the message is on the console.  Now we call printk()
		 * without oops_in_progress set so that printk will give klogd
		 * a poke.  Hold onto your hats...
		 */
		console_loglevel = 15;		/* NMI oopser may have shut the console up */
		printk(" ");
		console_loglevel = loglevel_save;
	}
}

asmlinkage void do_invalid_op(struct pt_regs *, unsigned long);
extern unsigned long idt;

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * error_code:
 *	bit 0 == 0 means no page found, 1 means protection fault
 *	bit 1 == 0 means read, 1 means write
 *	bit 2 == 0 means kernel, 1 means user-mode
 */
asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct * vma;
	unsigned long address;
	unsigned long page;
	unsigned long fixup;
	int write;
	siginfo_t info;

	/* get the address */
	__asm__("movl %%cr2,%0":"=r" (address));

	/* It's safe to allow irq's after cr2 has been saved */
	if (regs->eflags & X86_EFLAGS_IF)
		local_irq_enable();

	tsk = current;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 1) == 0.
	 */
	if (address >= TASK_SIZE && !(error_code & 5))
		goto vmalloc_fault;

	mm = tsk->mm;
	info.si_code = SEGV_MAPERR;

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (in_interrupt() || !mm)
		goto no_context;

	down_read(&mm->mmap_sem);

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & 4) {
		/*
		 * accessing the stack below %esp is always a bug.
		 * The "+ 32" is there due to some instructions (like
		 * pusha) doing post-decrement on the stack and that
		 * doesn't show up until later..
		 */
		if (address + 32 < regs->esp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	info.si_code = SEGV_ACCERR;
	write = 0;
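	/*
	 * Bits 0 (present) and 1 (write) of error_code pick the case:
	 * write faults need VM_WRITE, a read of a present page is a
	 * protection error, and a read of a missing page needs VM_READ
	 * or VM_EXEC.
	 */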
	switch (error_code & 3) {
		default:	/* 3: write, present */
#ifdef TEST_VERIFY_AREA
			if (regs->cs == KERNEL_CS)
				printk("WP fault at %08lx\n", regs->eip);
#endif
			/* fall through */
		case 2:		/* write, not present */
			if (!(vma->vm_flags & VM_WRITE))
				goto bad_area;
			write++;
			break;
		case 1:		/* read, present */
			goto bad_area;
		case 0:		/* read, not present */
			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
				goto bad_area;
	}

 survive:
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
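	/*
	 * handle_mm_fault() returns 1 for a minor fault (no I/O), 2 for
	 * a major fault (the page had to be read in), 0 when the access
	 * should raise SIGBUS and a negative value when out of memory.
	 */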
	switch (handle_mm_fault(mm, vma, address, write)) {
	case 1:
		tsk->min_flt++;
		break;
	case 2:
		tsk->maj_flt++;
		break;
	case 0:
		goto do_sigbus;
	default:
		goto out_of_memory;
	}

	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
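	/* (the 32 bitmap bits cover the 128K VGA window at 0xA0000) */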
	if (regs->eflags & VM_MASK) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

	/* User mode accesses just cause a SIGSEGV */
	if (error_code & 4) {
		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void *)address;
		force_sig_info(SIGSEGV, &info, tsk);
		return;
	}

	/*
	 * Pentium F0 0F C7 C8 bug workaround.
	 */
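	/*
	 * On affected CPUs the IDT is remapped read-only, so the
	 * erratum shows up here as a page fault on the exception-6
	 * descriptor (descriptors are 8 bytes, hence the >> 3).
	 * Deliver the invalid opcode trap by hand instead of letting
	 * the CPU lock up.
	 */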
	if (boot_cpu_data.f00f_bug) {
		unsigned long nr;

		nr = (address - idt) >> 3;

		if (nr == 6) {
			do_invalid_op(regs, 0);
			return;
		}
	}

no_context:
	/* Are we prepared to handle this kernel fault?  */
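	/*
	 * Fixup addresses come from the __ex_table entries emitted
	 * alongside each explicit user-space access (get_user(),
	 * copy_to_user() and friends).
	 */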
	if ((fixup = search_exception_table(regs->eip)) != 0) {
		regs->eip = fixup;
		return;
	}

/*
 * Oops. The kernel tried to access some bad page. We'll have to
 * terminate things with extreme prejudice.
 */

	bust_spinlocks(1);

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at virtual address %08lx\n",address);
	printk(" printing eip:\n");
	printk("%08lx\n", regs->eip);
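	/*
	 * Dump the page-table entries for the faulting address, assuming
	 * the two-level (non-PAE) layout: %cr3 holds the physical base of
	 * the page directory, the top 10 address bits index the pde, the
	 * next 10 the pte; bit 0 of an entry is the Present bit.
	 */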
	asm("movl %%cr3,%0":"=r" (page));
	page = ((unsigned long *) __va(page))[address >> 22];
	printk(KERN_ALERT "*pde = %08lx\n", page);
	if (page & 1) {
		page &= PAGE_MASK;
		address &= 0x003ff000;
		page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
		printk(KERN_ALERT "*pte = %08lx\n", page);
	}
	die("Oops", regs, error_code);
	bust_spinlocks(0);
	do_exit(SIGKILL);

/*
 * We ran out of memory, or some other thing happened to us that made
 * us unable to handle the page fault gracefully.
 */
out_of_memory:
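	/*
	 * Never kill init over memory pressure: give the system a
	 * chance to free something up and retry the fault.
	 */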
	if (tsk->pid == 1) {
		yield();
		goto survive;
	}
	up_read(&mm->mmap_sem);
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & 4)
		do_exit(SIGKILL);
	goto no_context;

do_sigbus:
	up_read(&mm->mmap_sem);

	/*
	 * Send a sigbus, regardless of whether we were in kernel
	 * or user mode.
	 */
	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void *)address;
	force_sig_info(SIGBUS, &info, tsk);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & 4))
		goto no_context;
	return;

vmalloc_fault:
	{
		/*
		 * Synchronize this task's top level page-table
		 * with the 'reference' page table.
		 *
		 * Do _not_ use "tsk" here. We might be inside
		 * an interrupt in the middle of a task switch..
		 */
		int offset = __pgd_offset(address);
		pgd_t *pgd, *pgd_k;
		pmd_t *pmd, *pmd_k;
		pte_t *pte_k;

		asm("movl %%cr3,%0":"=r" (pgd));
		pgd = offset + (pgd_t *)__va(pgd);
		pgd_k = init_mm.pgd + offset;

		if (!pgd_present(*pgd_k))
			goto no_context;
		set_pgd(pgd, *pgd_k);

		pmd = pmd_offset(pgd, address);
		pmd_k = pmd_offset(pgd_k, address);
		if (!pmd_present(*pmd_k))
			goto no_context;
		set_pmd(pmd, *pmd_k);

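		/*
		 * The pmd now points at the shared kernel page table,
		 * so there is nothing to copy at the pte level: if the
		 * pte isn't present, the access was genuinely bad.
		 */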
		pte_k = pte_offset(pmd_k, address);
		if (!pte_present(*pte_k))
			goto no_context;
		return;
	}
}