/* * linux/arch/x86-64/kernel/process.c * * Copyright (C) 1995 Linus Torvalds * * Pentium III FXSR, SSE support * Gareth Hughes , May 2000 * * X86-64 port * Andi Kleen. * * $Id: process.c,v 1.77 2004/03/22 00:37:29 ak Exp $ */ /* * This file handles the architecture-dependent parts of process handling.. */ #define __KERNEL_SYSCALLS__ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include asmlinkage extern void ret_from_fork(void); int hlt_counter; /* * Powermanagement idle function, if any.. */ void (*pm_idle)(void); /* * Power off function, if any */ void (*pm_power_off)(void); void disable_hlt(void) { hlt_counter++; } void enable_hlt(void) { hlt_counter--; } /* * We use this if we don't have any better * idle routine.. */ static void default_idle(void) { if (!hlt_counter) { __cli(); if (!current->need_resched) safe_halt(); else __sti(); } } /* * On SMP it's slightly faster (but much more power-consuming!) * to poll the ->need_resched flag instead of waiting for the * cross-CPU IPI to arrive. Use this option with caution. */ static void poll_idle (void) { int oldval; __sti(); /* * Deal with another CPU just having chosen a thread to * run here: */ oldval = xchg(¤t->need_resched, -1); if (!oldval) asm volatile( "2:" "cmpl $-1, %0;" "rep; nop;" "je 2b;" : :"m" (current->need_resched)); } /* * The idle thread. There's no useful work to be * done, so just try to conserve power and have a * low exit latency (ie sit in a loop waiting for * somebody to say that they'd like to reschedule) */ void cpu_idle (void) { /* endless idle loop with no priority at all */ init_idle(); current->nice = 20; current->counter = -100; while (1) { void (*idle)(void) = pm_idle; if (!idle) idle = default_idle; while (!current->need_resched) idle(); schedule(); check_pgt_cache(); } } /* * This is a kind of hybrid between poll and halt idle routines. This uses new * Monitor/Mwait instructions on P4 processors with PNI. We Monitor * need_resched and go to optimized wait state through Mwait. * Whenever someone changes need_resched, we would be woken up from Mwait * (without an IPI). */ static void mwait_idle (void) { int oldval; __sti(); /* Setting need_resched to -1 skips sending IPI during idle resched */ oldval = xchg(¤t->need_resched, -1); if (!oldval) { do { __monitor((void *)¤t->need_resched, 0, 0); if (current->need_resched != -1) break; __mwait(0, 0); } while (current->need_resched == -1); } } int __init select_idle_routine(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_MWAIT)) { printk("Monitor/Mwait feature present.\n"); /* * Take care of system with asymmetric CPUs. * Use, mwait_idle only if all cpus support it. * If not, we fallback to default_idle() */ if (!pm_idle) { pm_idle = mwait_idle; } return 1; } return 1; } static int __init idle_setup (char *str) { if (!strncmp(str, "poll", 4)) { printk("using polling idle threads.\n"); pm_idle = poll_idle; } else if (!strncmp(str, "halt", 4)) { printk("using halt in idle threads.\n"); pm_idle = default_idle; } return 1; } __setup("idle=", idle_setup); static struct { long x; } no_idt[3]; static enum { BOOT_BIOS = 'b', BOOT_TRIPLE = 't', BOOT_KBD = 'k', } reboot_type = BOOT_KBD; static int reboot_mode = 0; /* reboot=b[ios] | t[riple] | k[bd] [, [w]arm | [c]old] bios Use the CPU reboot vector for warm reset warm Don't set the cold reboot flag cold Set the cold reboto flag triple Force a triple fault (init) kbd Use the keyboard controller. cold reset (default) */ static int __init reboot_setup(char *str) { for (;;) { switch (*str) { case 'w': reboot_mode = 0x1234; break; case 'c': reboot_mode = 0; break; case 't': case 'b': case 'k': reboot_type = *str; break; } if((str = strchr(str,',')) != NULL) str++; else break; } return 1; } __setup("reboot=", reboot_setup); /* overwrites random kernel memory. Should not be kernel .text */ #define WARMBOOT_TRAMP 0x1000UL static void reboot_warm(void) { extern unsigned char warm_reboot[], warm_reboot_end[]; printk("warm reboot\n"); __cli(); /* restore identity mapping */ init_level4_pgt[0] = __pml4(__pa(level3_ident_pgt) | 7); __flush_tlb_all(); memcpy(__va(WARMBOOT_TRAMP), warm_reboot, warm_reboot_end - warm_reboot); asm volatile( " pushq $0\n" /* ss */ " pushq $0x2000\n" /* rsp */ " pushfq\n" /* eflags */ " pushq %[cs]\n" " pushq %[target]\n" " iretq" :: [cs] "i" (__KERNEL_COMPAT32_CS), [target] "b" (WARMBOOT_TRAMP)); } static void kb_wait(void) { int i; for (i=0; i<0x10000; i++) if ((inb_p(0x64) & 0x02) == 0) break; } #ifdef CONFIG_SMP static void smp_halt(void) { int cpuid = safe_smp_processor_id(); static int first_entry = 1; if (first_entry) { first_entry = 0; smp_call_function((void *)machine_restart, NULL, 1, 0); } smp_stop_cpu(); /* AP calling this. Just halt */ if (cpuid != boot_cpu_id) { printk("CPU %d SMP halt\n", cpuid); for (;;) asm("hlt"); } /* Wait for all other CPUs to have run smp_stop_cpu */ while (cpu_online_map) rep_nop(); } #endif void machine_restart(char * __unused) { int i; #if CONFIG_SMP smp_halt(); #endif __cli(); #ifndef CONFIG_SMP disable_local_APIC(); #endif disable_IO_APIC(); __sti(); /* Tell the BIOS if we want cold or warm reboot */ *((unsigned short *)__va(0x472)) = reboot_mode; for (;;) { /* Could also try the reset bit in the Hammer NB */ switch (reboot_type) { case BOOT_BIOS: reboot_warm(); case BOOT_KBD: /* force cold reboot to reinit all hardware*/ for (i=0; i<100; i++) { kb_wait(); udelay(50); outb(0xfe,0x64); /* pulse reset low */ udelay(50); } case BOOT_TRIPLE: /* force cold reboot to reinit all hardware*/ *((unsigned short *)__va(0x472)) = 0; __asm__ __volatile__("lidt (%0)": :"r" (no_idt)); __asm__ __volatile__("int3"); reboot_type = BOOT_KBD; break; } } } void machine_halt(void) { } void machine_power_off(void) { if (pm_power_off) pm_power_off(); } extern int printk_address(unsigned long); /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned int fsindex,gsindex; unsigned int ds,cs,es; printk("\n"); printk("Pid: %d, comm: %.20s %s\n", current->pid, current->comm, print_tainted()); printk("RIP: %04lx:", regs->cs & 0xffff); printk_address(regs->rip); printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp, regs->eflags); printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->rax, regs->rbx, regs->rcx); printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", regs->rdx, regs->rsi, regs->rdi); printk("RBP: %016lx R08: %016lx R09: %016lx\n", regs->rbp, regs->r8, regs->r9); printk("R10: %016lx R11: %016lx R12: %016lx\n", regs->r10, regs->r11, regs->r12); printk("R13: %016lx R14: %016lx R15: %016lx\n", regs->r13, regs->r14, regs->r15); asm("movl %%ds,%0" : "=r" (ds)); asm("movl %%cs,%0" : "=r" (cs)); asm("movl %%es,%0" : "=r" (es)); asm("movl %%fs,%0" : "=r" (fsindex)); asm("movl %%gs,%0" : "=r" (gsindex)); rdmsrl(MSR_FS_BASE, fs); rdmsrl(MSR_GS_BASE, gs); rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); asm("movq %%cr0, %0": "=r" (cr0)); asm("movq %%cr2, %0": "=r" (cr2)); asm("movq %%cr3, %0": "=r" (cr3)); asm("movq %%cr4, %0": "=r" (cr4)); printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", fs,fsindex,gs,gsindex,shadowgs); printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); } void show_regs(struct pt_regs * regs) { __show_regs(regs); show_trace(®s->rsp); } /* * No need to lock the MM as we are the last user */ void release_segments(struct mm_struct *mm) { void * ldt = mm->context.segments; /* * free the LDT */ if (ldt) { mm->context.segments = NULL; clear_LDT(); vfree(ldt); } } /* * Free current thread data structures etc.. */ void exit_thread(void) { struct task_struct *me = current; if (me->thread.io_bitmap_ptr) { (init_tss + smp_processor_id())->io_map_base = INVALID_IO_BITMAP_OFFSET; kfree(me->thread.io_bitmap_ptr); me->thread.io_bitmap_ptr = NULL; } } void flush_thread(void) { struct task_struct *tsk = current; memset(tsk->thread.debugreg, 0, sizeof(unsigned long)*8); /* * Forget coprocessor state.. */ clear_fpu(tsk); tsk->used_math = 0; } void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { void * ldt = dead_task->mm->context.segments; // temporary debugging check if (ldt) { printk("WARNING: dead process %8s still has LDT? <%p>\n", dead_task->comm, ldt); BUG(); } } } /* * we do not have to muck with descriptors here, that is * done in switch_mm() as needed. */ void copy_segments(struct task_struct *p, struct mm_struct *new_mm) { struct mm_struct * old_mm; void *old_ldt, *ldt; ldt = NULL; old_mm = current->mm; if (old_mm && (old_ldt = old_mm->context.segments) != NULL) { /* * Completely new LDT, we initialize it from the parent: */ ldt = vmalloc(LDT_ENTRIES*LDT_ENTRY_SIZE); if (!ldt) printk(KERN_WARNING "ldt allocation failed\n"); else memcpy(ldt, old_ldt, LDT_ENTRIES*LDT_ENTRY_SIZE); } new_mm->context.segments = ldt; new_mm->context.cpuvalid = 0UL; return; } int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, unsigned long unused, struct task_struct * p, struct pt_regs * regs) { struct pt_regs * childregs; struct task_struct *me = current; childregs = ((struct pt_regs *) (THREAD_SIZE + (unsigned long) p)) - 1; *childregs = *regs; childregs->rax = 0; childregs->rsp = rsp; if (rsp == ~0) { childregs->rsp = (unsigned long)childregs; } p->thread.rsp = (unsigned long) childregs; p->thread.rsp0 = (unsigned long) (childregs+1); p->thread.userrsp = current->thread.userrsp; p->thread.rip = (unsigned long) ret_from_fork; p->thread.fs = me->thread.fs; p->thread.gs = me->thread.gs; asm("mov %%gs,%0" : "=m" (p->thread.gsindex)); asm("mov %%fs,%0" : "=m" (p->thread.fsindex)); asm("mov %%es,%0" : "=m" (p->thread.es)); asm("mov %%ds,%0" : "=m" (p->thread.ds)); unlazy_fpu(current); p->thread.i387 = current->thread.i387; if (unlikely(me->thread.io_bitmap_ptr != NULL)) { p->thread.io_bitmap_ptr = kmalloc((IO_BITMAP_SIZE+1)*4, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) return -ENOMEM; memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, (IO_BITMAP_SIZE+1)*4); } return 0; } /* * This special macro can be used to load a debugging register */ #define loaddebug(thread,register) \ set_debug(thread->debugreg[register], register) /* * switch_to(x,y) should switch tasks from x to y. * * This could still be optimized: * - fold all the options into a flag word and test it with a single test. * - could test fs/gs bitsliced */ struct task_struct *__switch_to(struct task_struct *prev_p, struct task_struct *next_p) { struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; struct tss_struct *tss = init_tss + smp_processor_id(); /* * Reload rsp0, LDT and the page table pointer: */ tss->rsp0 = next->rsp0; /* * Switch DS and ES. */ asm volatile("mov %%es,%0" : "=m" (prev->es)); if (unlikely(next->es | prev->es)) loadsegment(es, next->es); asm volatile ("mov %%ds,%0" : "=m" (prev->ds)); if (unlikely(next->ds | prev->ds)) loadsegment(ds, next->ds); /* * Must be after DS reload for AMD workaround. */ unlazy_fpu(prev_p); /* * Switch FS and GS. */ { unsigned fsindex; asm volatile("movl %%fs,%0" : "=r" (fsindex)); /* segment register != 0 always requires a reload. also reload when it has changed. when prev process used 64bit base always reload to avoid an information leak. */ if (unlikely((fsindex | next->fsindex) || prev->fs)) { loadsegment(fs, next->fsindex); /* check if the user use a selector != 0 * if yes clear 64bit base, since overloaded base * is allways mapped to the Null selector */ if (fsindex) prev->fs = 0; } /* when next process has a 64bit base use it */ if (next->fs) wrmsrl(MSR_FS_BASE, next->fs); prev->fsindex = fsindex; } { unsigned gsindex; asm volatile("movl %%gs,%0" : "=r" (gsindex)); if (unlikely((gsindex | next->gsindex) || prev->gs)) { load_gs_index(next->gsindex); if (gsindex) prev->gs = 0; } if (next->gs) wrmsrl(MSR_KERNEL_GS_BASE, next->gs); prev->gsindex = gsindex; } /* * Switch the PDA context. */ prev->userrsp = read_pda(oldrsp); write_pda(oldrsp, next->userrsp); write_pda(pcurrent, next_p); write_pda(kernelstack, (unsigned long)next_p + THREAD_SIZE - PDA_STACKOFFSET); /* * Now maybe reload the debug registers */ if (unlikely(next->debugreg[7])) { loaddebug(next, 0); loaddebug(next, 1); loaddebug(next, 2); loaddebug(next, 3); /* no 4 and 5 */ loaddebug(next, 6); loaddebug(next, 7); } /* * Handle the IO bitmap */ if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { if (next->io_bitmap_ptr) { /* * 4 cachelines copy ... not good, but not that * bad either. Anyone got something better? * This only affects processes which use ioperm(). * [Putting the TSSs into 4k-tlb mapped regions * and playing VM tricks to switch the IO bitmap * is not really acceptable.] */ memcpy(tss->io_bitmap, next->io_bitmap_ptr, IO_BITMAP_SIZE*sizeof(u32)); tss->io_map_base = IO_BITMAP_OFFSET; } else { /* * a bitmap offset pointing outside of the TSS limit * causes a nicely controllable SIGSEGV if a process * tries to use a port IO instruction. The first * sys_ioperm() call sets up the bitmap properly. */ tss->io_map_base = INVALID_IO_BITMAP_OFFSET; } } return prev_p; } /* * sys_execve() executes a new program. */ asmlinkage long sys_execve(char *name, char **argv,char **envp, struct pt_regs regs) { long error; char * filename; filename = getname(name); error = PTR_ERR(filename); if (IS_ERR(filename)) return error; error = do_execve(filename, argv, envp, ®s); if (error == 0) current->ptrace &= ~PT_DTRACE; putname(filename); return error; } void set_personality_64bit(void) { /* inherit personality from parent */ /* Make sure to be in 64bit mode */ current->thread.flags = 0; } asmlinkage long sys_fork(struct pt_regs regs) { return do_fork(SIGCHLD, regs.rsp, ®s, 0); } asmlinkage long sys_clone(unsigned long clone_flags, unsigned long newsp, struct pt_regs regs) { if (!newsp) newsp = regs.rsp; return do_fork(clone_flags, newsp, ®s, 0); } /* * This is trivial, and on the face of it looks like it * could equally well be done in user mode. * * Not so, for quite unobvious reasons - register pressure. * In user mode vfork() cannot have a stack frame, and if * done by calling the "clone()" system call directly, you * do not have enough call-clobbered registers to hold all * the information you need. */ asmlinkage long sys_vfork(struct pt_regs regs) { return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs.rsp, ®s, 0); } /* * These bracket the sleeping functions.. */ extern void scheduling_functions_start_here(void); extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) unsigned long get_wchan(struct task_struct *p) { u64 fp,rip; int count = 0; if (!p || p == current || p->state==TASK_RUNNING) return 0; if (p->thread.rsp < (u64)p || p->thread.rsp > (u64)p + THREAD_SIZE) return 0; fp = *(u64 *)(p->thread.rsp); do { if (fp < (unsigned long)p || fp > (unsigned long)p+THREAD_SIZE) return 0; rip = *(u64 *)(fp+8); if (rip < first_sched || rip >= last_sched) return rip; fp = *(u64 *)fp; } while (count++ < 16); return 0; } #undef last_sched #undef first_sched asmlinkage long sys_arch_prctl(int code, unsigned long addr) { int ret = 0; unsigned long tmp; switch (code) { case ARCH_SET_GS: if (addr >= TASK_SIZE) return -EPERM; asm volatile("movl %0,%%gs" :: "r" (0)); current->thread.gsindex = 0; current->thread.gs = addr; ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); break; case ARCH_SET_FS: /* Not strictly needed for fs, but do it for symmetry with gs. */ if (addr >= TASK_SIZE) return -EPERM; asm volatile("movl %0,%%fs" :: "r" (0)); current->thread.fsindex = 0; current->thread.fs = addr; ret = checking_wrmsrl(MSR_FS_BASE, addr); break; /* Returned value may not be correct when the user changed fs/gs */ case ARCH_GET_FS: rdmsrl(MSR_FS_BASE, tmp); ret = put_user(tmp, (unsigned long *)addr); break; case ARCH_GET_GS: rdmsrl(MSR_KERNEL_GS_BASE, tmp); ret = put_user(tmp, (unsigned long *)addr); break; default: ret = -EINVAL; break; } return ret; }