1 /*
2 * linux/arch/i386/mm/fault.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 */
6
7 #include <linux/signal.h>
8 #include <linux/sched.h>
9 #include <linux/kernel.h>
10 #include <linux/errno.h>
11 #include <linux/string.h>
12 #include <linux/types.h>
13 #include <linux/ptrace.h>
14 #include <linux/mman.h>
15 #include <linux/mm.h>
16 #include <linux/smp.h>
17 #include <linux/smp_lock.h>
18 #include <linux/interrupt.h>
19 #include <linux/init.h>
20 #include <linux/tty.h>
21 #include <linux/vt_kern.h> /* For unblank_screen() */
22
23 #include <asm/system.h>
24 #include <asm/uaccess.h>
25 #include <asm/pgalloc.h>
26 #include <asm/hardirq.h>
27
28 extern void die(const char *,struct pt_regs *,long);
29
30 /*
31 * Ugly, ugly, but the goto's result in better assembly..
32 */
/*
 * Pre-fault a user-space range for writing: walk the vmas covering
 * [addr, addr+size) and call handle_mm_fault() once per page with
 * write_access = 1.  Returns 1 if the whole range is writable, 0 if not.
 * Must be called with current->mm valid (it is read via find_vma()).
 */
int __verify_write(const void * addr, unsigned long size)
{
	struct vm_area_struct * vma;
	unsigned long start = (unsigned long) addr;

	/* An empty range is trivially OK. */
	if (!size)
		return 1;

	vma = find_vma(current->mm, start);
	if (!vma)
		goto bad_area;
	if (vma->vm_start > start)
		goto check_stack;	/* start is below the vma: maybe stack growth */

good_area:
	if (!(vma->vm_flags & VM_WRITE))
		goto bad_area;
	/*
	 * Turn (start, size) into a page-aligned start plus a count of
	 * *additional* pages after the first: size becomes (npages - 1).
	 */
	size--;
	size += start & ~PAGE_MASK;
	size >>= PAGE_SHIFT;
	start &= PAGE_MASK;

	for (;;) {
	survive:
		{
			/*
			 * Fault in one page for writing.  Return code:
			 * 0 => bad access, negative => out of memory,
			 * positive => success.
			 */
			int fault = handle_mm_fault(current->mm, vma, start, 1);
			if (!fault)
				goto bad_area;
			if (fault < 0)
				goto out_of_memory;
		}
		if (!size)
			break;
		size--;
		start += PAGE_SIZE;
		if (start < vma->vm_end)
			continue;
		/*
		 * Ran off the end of this vma: the next one must start
		 * exactly here (no hole) and also be writable.
		 */
		vma = vma->vm_next;
		if (!vma || vma->vm_start != start)
			goto bad_area;
		if (!(vma->vm_flags & VM_WRITE))
			goto bad_area;
	}
	return 1;

check_stack:
	/*
	 * Address is in the gap below the vma; only legal if this is a
	 * grow-down stack vma we can expand to cover it.
	 */
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (expand_stack(vma, start) == 0)
		goto good_area;

bad_area:
	return 0;

out_of_memory:
	/* Never fail init (pid 1): reschedule and retry the same page. */
	if (current->pid == 1) {
		yield();
		goto survive;
	}
	goto bad_area;
}
94
95 extern spinlock_t timerlist_lock;
96
97 /*
98 * Unlock any spinlocks which will prevent us from getting the
99 * message out (timerlist_lock is acquired through the
100 * console unblank code)
101 */
bust_spinlocks(int yes)102 void bust_spinlocks(int yes)
103 {
104 spin_lock_init(&timerlist_lock);
105 if (yes) {
106 oops_in_progress = 1;
107 #ifdef CONFIG_SMP
108 global_irq_lock = 0; /* Many serial drivers do __global_cli() */
109 #endif
110 } else {
111 int loglevel_save = console_loglevel;
112 #ifdef CONFIG_VT
113 unblank_screen();
114 #endif
115 oops_in_progress = 0;
116 /*
117 * OK, the message is on the console. Now we call printk()
118 * without oops_in_progress set so that printk will give klogd
119 * a poke. Hold onto your hats...
120 */
121 console_loglevel = 15; /* NMI oopser may have shut the console up */
122 printk(" ");
123 console_loglevel = loglevel_save;
124 }
125 }
126
127 asmlinkage void do_invalid_op(struct pt_regs *, unsigned long);
128 extern unsigned long idt;
129
130 /*
131 * This routine handles page faults. It determines the address,
132 * and the problem, and then passes it off to one of the appropriate
133 * routines.
134 *
135 * error_code:
136 * bit 0 == 0 means no page found, 1 means protection fault
137 * bit 1 == 0 means read, 1 means write
138 * bit 2 == 0 means kernel, 1 means user-mode
139 */
/*
 * Entry point from the page-fault trap gate: %cr2 holds the faulting
 * linear address, error_code is decoded per the bit layout documented
 * above.  Depending on the fault this either services it through
 * handle_mm_fault(), syncs a vmalloc-area page table, applies a kernel
 * exception-table fixup, delivers SIGSEGV/SIGBUS, or oopses.
 */
asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long error_code)
{
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct * vma;
	unsigned long address;
	unsigned long page;
	unsigned long fixup;
	int write;
	siginfo_t info;

	/* get the address */
	__asm__("movl %%cr2,%0":"=r" (address));

	/* It's safe to allow irq's after cr2 has been saved */
	if (regs->eflags & X86_EFLAGS_IF)
		local_irq_enable();

	tsk = current;

	/*
	 * We fault-in kernel-space virtual memory on-demand. The
	 * 'reference' page table is init_mm.pgd.
	 *
	 * NOTE! We MUST NOT take any locks for this case. We may
	 * be in an interrupt or a critical region, and should
	 * only copy the information from the master page table,
	 * nothing more.
	 *
	 * This verifies that the fault happens in kernel space
	 * (error_code & 4) == 0, and that the fault was not a
	 * protection error (error_code & 1) == 0.
	 */
	if (address >= TASK_SIZE && !(error_code & 5))
		goto vmalloc_fault;

	mm = tsk->mm;
	info.si_code = SEGV_MAPERR;	/* default: no mapping at the address */

	/*
	 * If we're in an interrupt or have no user
	 * context, we must not take the fault..
	 */
	if (in_interrupt() || !mm)
		goto no_context;

	down_read(&mm->mmap_sem);

	vma = find_vma(mm, address);
	if (!vma)
		goto bad_area;
	if (vma->vm_start <= address)
		goto good_area;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		goto bad_area;
	if (error_code & 4) {	/* user-mode fault: sanity-check vs. %esp */
		/*
		 * accessing the stack below %esp is always a bug.
		 * The "+ 32" is there due to some instructions (like
		 * pusha) doing post-decrement on the stack and that
		 * doesn't show up until later..
		 */
		if (address + 32 < regs->esp)
			goto bad_area;
	}
	if (expand_stack(vma, address))
		goto bad_area;
/*
 * Ok, we have a good vm_area for this memory access, so
 * we can handle it..
 */
good_area:
	info.si_code = SEGV_ACCERR;	/* mapping exists, access was denied */
	write = 0;
	switch (error_code & 3) {	/* bit0 = present, bit1 = write */
		default:	/* 3: write, present */
#ifdef TEST_VERIFY_AREA
			if (regs->cs == KERNEL_CS)
				printk("WP fault at %08lx\n", regs->eip);
#endif
			/* fall through */
		case 2:		/* write, not present */
			if (!(vma->vm_flags & VM_WRITE))
				goto bad_area;
			write++;
			break;
		case 1:		/* read, present */
			goto bad_area;
		case 0:		/* read, not present */
			if (!(vma->vm_flags & (VM_READ | VM_EXEC)))
				goto bad_area;
	}

 survive:
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	switch (handle_mm_fault(mm, vma, address, write)) {
	case 1:				/* minor fault */
		tsk->min_flt++;
		break;
	case 2:				/* major fault */
		tsk->maj_flt++;
		break;
	case 0:				/* could not service the fault */
		goto do_sigbus;
	default:			/* negative: allocation failed */
		goto out_of_memory;
	}

	/*
	 * Did it hit the DOS screen memory VA from vm86 mode?
	 */
	if (regs->eflags & VM_MASK) {
		unsigned long bit = (address - 0xA0000) >> PAGE_SHIFT;
		if (bit < 32)
			tsk->thread.screen_bitmap |= 1 << bit;
	}
	up_read(&mm->mmap_sem);
	return;

/*
 * Something tried to access memory that isn't in our memory map..
 * Fix it, but check if it's kernel or user first..
 */
bad_area:
	up_read(&mm->mmap_sem);

	/* User mode accesses just cause a SIGSEGV */
	if (error_code & 4) {
		tsk->thread.cr2 = address;
		/* Kernel addresses are always protection faults */
		tsk->thread.error_code = error_code | (address >= TASK_SIZE);
		tsk->thread.trap_no = 14;
		info.si_signo = SIGSEGV;
		info.si_errno = 0;
		/* info.si_code has been set above */
		info.si_addr = (void *)address;
		force_sig_info(SIGSEGV, &info, tsk);
		return;
	}

	/*
	 * Pentium F0 0F C7 C8 bug workaround.
	 */
	if (boot_cpu_data.f00f_bug) {
		unsigned long nr;

		nr = (address - idt) >> 3;	/* 8-byte IDT descriptors */

		if (nr == 6) {
			/* Fault landed on IDT entry 6: report it as an
			 * invalid-opcode exception instead. */
			do_invalid_op(regs, 0);
			return;
		}
	}

no_context:
	/* Are we prepared to handle this kernel fault? */
	if ((fixup = search_exception_table(regs->eip)) != 0) {
		regs->eip = fixup;
		return;
	}

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */

	bust_spinlocks(1);

	if (address < PAGE_SIZE)
		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request");
	printk(" at virtual address %08lx\n",address);
	printk(" printing eip:\n");
	printk("%08lx\n", regs->eip);
	/* Walk and dump the hardware page-table entries for the address. */
	asm("movl %%cr3,%0":"=r" (page));
	page = ((unsigned long *) __va(page))[address >> 22];	/* pgd entry */
	printk(KERN_ALERT "*pde = %08lx\n", page);
	if (page & 1) {			/* pde present bit */
		page &= PAGE_MASK;
		address &= 0x003ff000;	/* keep only the pte index bits */
		page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT];
		printk(KERN_ALERT "*pte = %08lx\n", page);
	}
	die("Oops", regs, error_code);
	bust_spinlocks(0);
	do_exit(SIGKILL);

	/*
	 * We ran out of memory, or some other thing happened to us that made
	 * us unable to handle the page fault gracefully.
	 */
out_of_memory:
	/* Never OOM-fail init (pid 1): reschedule and retry the fault. */
	if (tsk->pid == 1) {
		yield();
		goto survive;
	}
	up_read(&mm->mmap_sem);
	printk("VM: killing process %s\n", tsk->comm);
	if (error_code & 4)
		do_exit(SIGKILL);
	goto no_context;	/* kernel-mode fault: try exception fixup */

do_sigbus:
	up_read(&mm->mmap_sem);

	/*
	 * Send a sigbus, regardless of whether we were in kernel
	 * or user mode.
	 */
	tsk->thread.cr2 = address;
	tsk->thread.error_code = error_code;
	tsk->thread.trap_no = 14;
	info.si_signo = SIGBUS;
	info.si_errno = 0;
	info.si_code = BUS_ADRERR;
	info.si_addr = (void *)address;
	force_sig_info(SIGBUS, &info, tsk);

	/* Kernel mode? Handle exceptions or die */
	if (!(error_code & 4))
		goto no_context;
	return;

vmalloc_fault:
	{
		/*
		 * Synchronize this task's top level page-table
		 * with the 'reference' page table.
		 *
		 * Do _not_ use "tsk" here. We might be inside
		 * an interrupt in the middle of a task switch..
		 */
		int offset = __pgd_offset(address);
		pgd_t *pgd, *pgd_k;
		pmd_t *pmd, *pmd_k;
		pte_t *pte_k;

		/* Use the pgd actually loaded in %cr3, not tsk->mm's. */
		asm("movl %%cr3,%0":"=r" (pgd));
		pgd = offset + (pgd_t *)__va(pgd);
		pgd_k = init_mm.pgd + offset;

		if (!pgd_present(*pgd_k))
			goto no_context;
		set_pgd(pgd, *pgd_k);

		pmd = pmd_offset(pgd, address);
		pmd_k = pmd_offset(pgd_k, address);
		if (!pmd_present(*pmd_k))
			goto no_context;
		set_pmd(pmd, *pmd_k);

		/* The pte must already exist in the reference table;
		 * if not, this is a genuine bad kernel access. */
		pte_k = pte_offset(pmd_k, address);
		if (!pte_present(*pte_k))
			goto no_context;
		return;
	}
}
402