1 use core::{ 2 arch::asm, 3 intrinsics::unlikely, 4 mem::ManuallyDrop, 5 sync::atomic::{compiler_fence, Ordering}, 6 }; 7 8 use alloc::sync::{Arc, Weak}; 9 10 use kdepends::memoffset::offset_of; 11 use log::{error, warn}; 12 use system_error::SystemError; 13 use x86::{controlregs::Cr4, segmentation::SegmentSelector}; 14 15 use crate::{ 16 arch::process::table::TSSManager, 17 exception::InterruptArch, 18 libs::spinlock::SpinLockGuard, 19 mm::VirtAddr, 20 process::{ 21 fork::{CloneFlags, KernelCloneArgs}, 22 KernelStack, ProcessControlBlock, ProcessFlags, ProcessManager, PROCESS_SWITCH_RESULT, 23 }, 24 syscall::Syscall, 25 }; 26 27 use self::{ 28 kthread::kernel_thread_bootstrap_stage1, 29 syscall::ARCH_SET_FS, 30 table::{switch_fs_and_gs, KERNEL_DS, USER_DS}, 31 }; 32 33 use super::{fpu::FpState, interrupt::TrapFrame, syscall::X86_64GSData, CurrentIrqArch}; 34 35 pub mod idle; 36 pub mod kthread; 37 pub mod syscall; 38 pub mod table; 39 40 extern "C" { 41 /// 从中断返回 42 fn ret_from_intr(); 43 } 44 45 #[allow(dead_code)] 46 #[repr(align(32768))] 47 union InitProcUnion { 48 /// 用于存放idle进程的内核栈 49 idle_stack: [u8; 32768], 50 } 51 52 #[link_section = ".data.init_proc_union"] 53 #[no_mangle] 54 static BSP_IDLE_STACK_SPACE: InitProcUnion = InitProcUnion { 55 idle_stack: [0; 32768], 56 }; 57 58 /// PCB中与架构相关的信息 59 #[derive(Debug)] 60 #[allow(dead_code)] 61 pub struct ArchPCBInfo { 62 rflags: usize, 63 rbx: usize, 64 r12: usize, 65 r13: usize, 66 r14: usize, 67 r15: usize, 68 rbp: usize, 69 rsp: usize, 70 rip: usize, 71 cr2: usize, 72 fsbase: usize, 73 gsbase: usize, 74 fs: SegmentSelector, 75 gs: SegmentSelector, 76 /// 存储PCB系统调用栈以及在syscall过程中暂存用户态rsp的结构体 77 gsdata: X86_64GSData, 78 /// 浮点寄存器的状态 79 fp_state: Option<FpState>, 80 } 81 82 #[allow(dead_code)] 83 impl ArchPCBInfo { 84 /// 创建一个新的ArchPCBInfo 85 /// 86 /// ## 参数 87 /// 88 /// - `kstack`:内核栈的引用,如果为None,则不会设置rsp和rbp。如果为Some,则会设置rsp和rbp为内核栈的最高地址。 89 /// 90 /// ## 返回值 91 /// 92 /// 返回一个新的ArchPCBInfo 93 #[inline(never)] 94 pub fn new(kstack: &KernelStack) -> Self { 95 let mut r = Self { 96 rflags: 0, 97 rbx: 0, 98 r12: 0, 99 r13: 0, 100 r14: 0, 101 r15: 0, 102 rbp: 0, 103 rsp: 0, 104 rip: 0, 105 cr2: 0, 106 fsbase: 0, 107 gsbase: 0, 108 gsdata: X86_64GSData { 109 kaddr: VirtAddr::new(0), 110 uaddr: VirtAddr::new(0), 111 }, 112 fs: KERNEL_DS, 113 gs: KERNEL_DS, 114 fp_state: None, 115 }; 116 117 r.rsp = kstack.stack_max_address().data() - 8; 118 r.rbp = kstack.stack_max_address().data(); 119 120 return r; 121 } 122 123 pub fn set_stack(&mut self, stack: VirtAddr) { 124 self.rsp = stack.data(); 125 } 126 127 pub fn set_stack_base(&mut self, stack_base: VirtAddr) { 128 self.rbp = stack_base.data(); 129 } 130 131 pub fn rbp(&self) -> usize { 132 self.rbp 133 } 134 135 pub unsafe fn push_to_stack(&mut self, value: usize) { 136 self.rsp -= core::mem::size_of::<usize>(); 137 *(self.rsp as *mut usize) = value; 138 } 139 140 pub unsafe fn pop_from_stack(&mut self) -> usize { 141 let value = *(self.rsp as *const usize); 142 self.rsp += core::mem::size_of::<usize>(); 143 value 144 } 145 146 pub fn save_fp_state(&mut self) { 147 if self.fp_state.is_none() { 148 self.fp_state = Some(FpState::new()); 149 } 150 151 self.fp_state.as_mut().unwrap().save(); 152 } 153 154 pub fn restore_fp_state(&mut self) { 155 if unlikely(self.fp_state.is_none()) { 156 return; 157 } 158 159 self.fp_state.as_mut().unwrap().restore(); 160 } 161 162 /// 返回浮点寄存器结构体的副本 163 pub fn fp_state(&self) -> &Option<FpState> { 164 &self.fp_state 165 } 166 167 // 清空浮点寄存器 168 pub fn clear_fp_state(&mut self) { 169 if unlikely(self.fp_state.is_none()) { 170 warn!("fp_state is none"); 171 return; 172 } 173 174 self.fp_state.as_mut().unwrap().clear(); 175 } 176 pub unsafe fn save_fsbase(&mut self) { 177 if x86::controlregs::cr4().contains(Cr4::CR4_ENABLE_FSGSBASE) { 178 self.fsbase = x86::current::segmentation::rdfsbase() as usize; 179 } else { 180 self.fsbase = x86::msr::rdmsr(x86::msr::IA32_FS_BASE) as usize; 181 } 182 } 183 184 pub unsafe fn save_gsbase(&mut self) { 185 if x86::controlregs::cr4().contains(Cr4::CR4_ENABLE_FSGSBASE) { 186 self.gsbase = x86::current::segmentation::rdgsbase() as usize; 187 } else { 188 self.gsbase = x86::msr::rdmsr(x86::msr::IA32_GS_BASE) as usize; 189 } 190 } 191 192 pub unsafe fn restore_fsbase(&mut self) { 193 if x86::controlregs::cr4().contains(Cr4::CR4_ENABLE_FSGSBASE) { 194 x86::current::segmentation::wrfsbase(self.fsbase as u64); 195 } else { 196 x86::msr::wrmsr(x86::msr::IA32_FS_BASE, self.fsbase as u64); 197 } 198 } 199 200 pub unsafe fn restore_gsbase(&mut self) { 201 if x86::controlregs::cr4().contains(Cr4::CR4_ENABLE_FSGSBASE) { 202 x86::current::segmentation::wrgsbase(self.gsbase as u64); 203 } else { 204 x86::msr::wrmsr(x86::msr::IA32_GS_BASE, self.gsbase as u64); 205 } 206 } 207 208 /// 将gsdata写入KernelGsbase寄存器 209 pub unsafe fn store_kernel_gsbase(&self) { 210 x86::msr::wrmsr( 211 x86::msr::IA32_KERNEL_GSBASE, 212 &self.gsdata as *const X86_64GSData as u64, 213 ); 214 } 215 216 /// ### 初始化系统调用栈,不得与PCB内核栈冲突(即传入的应该是一个新的栈,避免栈损坏) 217 pub fn init_syscall_stack(&mut self, stack: &KernelStack) { 218 self.gsdata.set_kstack(stack.stack_max_address() - 8); 219 } 220 221 pub fn fsbase(&self) -> usize { 222 self.fsbase 223 } 224 225 pub fn gsbase(&self) -> usize { 226 self.gsbase 227 } 228 229 pub fn cr2_mut(&mut self) -> &mut usize { 230 &mut self.cr2 231 } 232 233 pub fn fp_state_mut(&mut self) -> &mut Option<FpState> { 234 &mut self.fp_state 235 } 236 237 /// ### 克隆ArchPCBInfo,需要注意gsdata也是对应clone的 238 pub fn clone_all(&self) -> Self { 239 Self { 240 rflags: self.rflags, 241 rbx: self.rbx, 242 r12: self.r12, 243 r13: self.r13, 244 r14: self.r14, 245 r15: self.r15, 246 rbp: self.rbp, 247 rsp: self.rsp, 248 rip: self.rip, 249 cr2: self.cr2, 250 fsbase: self.fsbase, 251 gsbase: self.gsbase, 252 fs: self.fs, 253 gs: self.gs, 254 gsdata: self.gsdata.clone(), 255 fp_state: self.fp_state, 256 } 257 } 258 259 // ### 从另一个ArchPCBInfo处clone,gsdata会被保留 260 pub fn clone_from(&mut self, from: &Self) { 261 let gsdata = self.gsdata.clone(); 262 *self = from.clone_all(); 263 self.gsdata = gsdata; 264 } 265 } 266 267 impl ProcessControlBlock { 268 /// 获取当前进程的pcb 269 pub fn arch_current_pcb() -> Arc<Self> { 270 // 获取栈指针 271 let ptr = VirtAddr::new(x86::current::registers::rsp() as usize); 272 273 let stack_base = VirtAddr::new(ptr.data() & (!(KernelStack::ALIGN - 1))); 274 275 // 从内核栈的最低地址处取出pcb的地址 276 let p = stack_base.data() as *const *const ProcessControlBlock; 277 if unlikely((unsafe { *p }).is_null()) { 278 error!("p={:p}", p); 279 panic!("current_pcb is null"); 280 } 281 unsafe { 282 // 为了防止内核栈的pcb weak 指针被释放,这里需要将其包装一下 283 let weak_wrapper: ManuallyDrop<Weak<ProcessControlBlock>> = 284 ManuallyDrop::new(Weak::from_raw(*p)); 285 286 let new_arc: Arc<ProcessControlBlock> = weak_wrapper.upgrade().unwrap(); 287 return new_arc; 288 } 289 } 290 } 291 292 impl ProcessManager { 293 pub fn arch_init() { 294 // do nothing 295 } 296 /// fork的过程中复制线程 297 /// 298 /// 由于这个过程与具体的架构相关,所以放在这里 299 pub fn copy_thread( 300 current_pcb: &Arc<ProcessControlBlock>, 301 new_pcb: &Arc<ProcessControlBlock>, 302 clone_args: &KernelCloneArgs, 303 current_trapframe: &TrapFrame, 304 ) -> Result<(), SystemError> { 305 let clone_flags = clone_args.flags; 306 let mut child_trapframe = *current_trapframe; 307 308 // 子进程的返回值为0 309 child_trapframe.set_return_value(0); 310 311 // 设置子进程的栈基址(开始执行中断返回流程时的栈基址) 312 let mut new_arch_guard = unsafe { new_pcb.arch_info() }; 313 let kernel_stack_guard = new_pcb.kernel_stack(); 314 315 // 设置子进程在内核态开始执行时的rsp、rbp 316 new_arch_guard.set_stack_base(kernel_stack_guard.stack_max_address()); 317 318 let trap_frame_vaddr: VirtAddr = 319 kernel_stack_guard.stack_max_address() - core::mem::size_of::<TrapFrame>(); 320 new_arch_guard.set_stack(trap_frame_vaddr); 321 322 // 拷贝栈帧 323 unsafe { 324 let usp = clone_args.stack; 325 if usp != 0 { 326 child_trapframe.rsp = usp as u64; 327 } 328 let trap_frame_ptr = trap_frame_vaddr.data() as *mut TrapFrame; 329 *trap_frame_ptr = child_trapframe; 330 } 331 332 let current_arch_guard = current_pcb.arch_info_irqsave(); 333 new_arch_guard.fsbase = current_arch_guard.fsbase; 334 new_arch_guard.gsbase = current_arch_guard.gsbase; 335 new_arch_guard.fs = current_arch_guard.fs; 336 new_arch_guard.gs = current_arch_guard.gs; 337 new_arch_guard.fp_state = current_arch_guard.fp_state; 338 339 // 拷贝浮点寄存器的状态 340 if let Some(fp_state) = current_arch_guard.fp_state.as_ref() { 341 new_arch_guard.fp_state = Some(*fp_state); 342 } 343 drop(current_arch_guard); 344 345 // 设置返回地址(子进程开始执行的指令地址) 346 if new_pcb.flags().contains(ProcessFlags::KTHREAD) { 347 let kthread_bootstrap_stage1_func_addr = kernel_thread_bootstrap_stage1 as usize; 348 new_arch_guard.rip = kthread_bootstrap_stage1_func_addr; 349 } else { 350 new_arch_guard.rip = ret_from_intr as usize; 351 } 352 353 // 设置tls 354 if clone_flags.contains(CloneFlags::CLONE_SETTLS) { 355 drop(new_arch_guard); 356 Syscall::do_arch_prctl_64(new_pcb, ARCH_SET_FS, clone_args.tls, true)?; 357 } 358 359 return Ok(()); 360 } 361 362 /// 切换进程 363 /// 364 /// ## 参数 365 /// 366 /// - `prev`:上一个进程的pcb 367 /// - `next`:下一个进程的pcb 368 pub unsafe fn switch_process(prev: Arc<ProcessControlBlock>, next: Arc<ProcessControlBlock>) { 369 assert!(!CurrentIrqArch::is_irq_enabled()); 370 371 // 保存浮点寄存器 372 prev.arch_info_irqsave().save_fp_state(); 373 // 切换浮点寄存器 374 next.arch_info_irqsave().restore_fp_state(); 375 376 // 切换fsbase 377 prev.arch_info_irqsave().save_fsbase(); 378 next.arch_info_irqsave().restore_fsbase(); 379 380 // 切换gsbase 381 Self::switch_gsbase(&prev, &next); 382 383 // 切换地址空间 384 let next_addr_space = next.basic().user_vm().as_ref().unwrap().clone(); 385 compiler_fence(Ordering::SeqCst); 386 387 next_addr_space.read().user_mapper.utable.make_current(); 388 drop(next_addr_space); 389 compiler_fence(Ordering::SeqCst); 390 // 切换内核栈 391 392 // 获取arch info的锁,并强制泄露其守卫(切换上下文后,在switch_finish_hook中会释放锁) 393 let next_arch = SpinLockGuard::leak(next.arch_info_irqsave()) as *mut ArchPCBInfo; 394 let prev_arch = SpinLockGuard::leak(prev.arch_info_irqsave()) as *mut ArchPCBInfo; 395 396 (*prev_arch).rip = switch_back as usize; 397 398 // 恢复当前的 preempt count*2 399 ProcessManager::current_pcb().preempt_enable(); 400 ProcessManager::current_pcb().preempt_enable(); 401 402 // 切换tss 403 TSSManager::current_tss().set_rsp( 404 x86::Ring::Ring0, 405 next.kernel_stack().stack_max_address().data() as u64, 406 ); 407 PROCESS_SWITCH_RESULT.as_mut().unwrap().get_mut().prev_pcb = Some(prev); 408 PROCESS_SWITCH_RESULT.as_mut().unwrap().get_mut().next_pcb = Some(next); 409 // debug!("switch tss ok"); 410 compiler_fence(Ordering::SeqCst); 411 // 正式切换上下文 412 switch_to_inner(prev_arch, next_arch); 413 } 414 415 unsafe fn switch_gsbase(prev: &Arc<ProcessControlBlock>, next: &Arc<ProcessControlBlock>) { 416 asm!("swapgs", options(nostack, preserves_flags)); 417 prev.arch_info_irqsave().save_gsbase(); 418 next.arch_info_irqsave().restore_gsbase(); 419 // 将下一个进程的kstack写入kernel_gsbase 420 next.arch_info_irqsave().store_kernel_gsbase(); 421 asm!("swapgs", options(nostack, preserves_flags)); 422 } 423 } 424 425 /// 保存上下文,然后切换进程,接着jmp到`switch_finish_hook`钩子函数 426 #[naked] 427 unsafe extern "sysv64" fn switch_to_inner(prev: *mut ArchPCBInfo, next: *mut ArchPCBInfo) { 428 asm!( 429 // As a quick reminder for those who are unfamiliar with the System V ABI (extern "C"): 430 // 431 // - the current parameters are passed in the registers `rdi`, `rsi`, 432 // - we can modify scratch registers, e.g. rax 433 // - we cannot change callee-preserved registers arbitrarily, e.g. rbx, which is why we 434 // store them here in the first place. 435 concat!(" 436 // Save old registers, and load new ones 437 mov [rdi + {off_rbx}], rbx 438 mov rbx, [rsi + {off_rbx}] 439 440 mov [rdi + {off_r12}], r12 441 mov r12, [rsi + {off_r12}] 442 443 mov [rdi + {off_r13}], r13 444 mov r13, [rsi + {off_r13}] 445 446 mov [rdi + {off_r14}], r14 447 mov r14, [rsi + {off_r14}] 448 449 mov [rdi + {off_r15}], r15 450 mov r15, [rsi + {off_r15}] 451 452 // switch segment registers (这些寄存器只能通过接下来的switch_hook的return来切换) 453 mov [rdi + {off_fs}], fs 454 mov [rdi + {off_gs}], gs 455 456 // mov fs, [rsi + {off_fs}] 457 // mov gs, [rsi + {off_gs}] 458 459 mov [rdi + {off_rbp}], rbp 460 mov rbp, [rsi + {off_rbp}] 461 462 mov [rdi + {off_rsp}], rsp 463 mov rsp, [rsi + {off_rsp}] 464 465 // // push RFLAGS (can only be modified via stack) 466 pushfq 467 // // pop RFLAGS into `self.rflags` 468 pop QWORD PTR [rdi + {off_rflags}] 469 470 // // push `next.rflags` 471 push QWORD PTR [rsi + {off_rflags}] 472 // // pop into RFLAGS 473 popfq 474 475 // push next rip to stack 476 push QWORD PTR [rsi + {off_rip}] 477 478 479 // When we return, we cannot even guarantee that the return address on the stack, points to 480 // the calling function. Thus, we have to execute this Rust hook by 481 // ourselves, which will unlock the contexts before the later switch. 482 483 // Note that switch_finish_hook will be responsible for executing `ret`. 484 jmp {switch_hook} 485 "), 486 487 off_rflags = const(offset_of!(ArchPCBInfo, rflags)), 488 489 off_rbx = const(offset_of!(ArchPCBInfo, rbx)), 490 off_r12 = const(offset_of!(ArchPCBInfo, r12)), 491 off_r13 = const(offset_of!(ArchPCBInfo, r13)), 492 off_r14 = const(offset_of!(ArchPCBInfo, r14)), 493 off_rbp = const(offset_of!(ArchPCBInfo, rbp)), 494 off_rsp = const(offset_of!(ArchPCBInfo, rsp)), 495 off_r15 = const(offset_of!(ArchPCBInfo, r15)), 496 off_rip = const(offset_of!(ArchPCBInfo, rip)), 497 off_fs = const(offset_of!(ArchPCBInfo, fs)), 498 off_gs = const(offset_of!(ArchPCBInfo, gs)), 499 500 switch_hook = sym crate::process::switch_finish_hook, 501 options(noreturn), 502 ); 503 } 504 505 #[naked] 506 unsafe extern "sysv64" fn switch_back() -> ! { 507 asm!("ret", options(noreturn)); 508 } 509 510 pub unsafe fn arch_switch_to_user(trap_frame: TrapFrame) -> ! { 511 // 以下代码不能发生中断 512 CurrentIrqArch::interrupt_disable(); 513 514 let current_pcb = ProcessManager::current_pcb(); 515 let trap_frame_vaddr = VirtAddr::new( 516 current_pcb.kernel_stack().stack_max_address().data() - core::mem::size_of::<TrapFrame>(), 517 ); 518 // debug!("trap_frame_vaddr: {:?}", trap_frame_vaddr); 519 520 assert!( 521 (x86::current::registers::rsp() as usize) < trap_frame_vaddr.data(), 522 "arch_switch_to_user(): current_rsp >= fake trap 523 frame vaddr, this may cause some illegal access to memory! 524 rsp: {:#x}, trap_frame_vaddr: {:#x}", 525 x86::current::registers::rsp() as usize, 526 trap_frame_vaddr.data() 527 ); 528 529 let new_rip = VirtAddr::new(ret_from_intr as usize); 530 let mut arch_guard = current_pcb.arch_info_irqsave(); 531 arch_guard.rsp = trap_frame_vaddr.data(); 532 533 arch_guard.fs = USER_DS; 534 arch_guard.gs = USER_DS; 535 536 // 将内核gs数据压进cpu 537 arch_guard.store_kernel_gsbase(); 538 539 switch_fs_and_gs( 540 SegmentSelector::from_bits_truncate(arch_guard.fs.bits()), 541 SegmentSelector::from_bits_truncate(arch_guard.gs.bits()), 542 ); 543 arch_guard.rip = new_rip.data(); 544 545 drop(arch_guard); 546 547 drop(current_pcb); 548 compiler_fence(Ordering::SeqCst); 549 550 // 重要!在这里之后,一定要保证上面的引用计数变量、动态申请的变量、锁的守卫都被drop了,否则可能导致内存安全问题! 551 552 compiler_fence(Ordering::SeqCst); 553 ready_to_switch_to_user(trap_frame, trap_frame_vaddr.data(), new_rip.data()); 554 } 555 556 /// 由于需要依赖ret来切换到用户态,所以不能inline 557 #[inline(never)] 558 unsafe extern "sysv64" fn ready_to_switch_to_user( 559 trap_frame: TrapFrame, 560 trapframe_vaddr: usize, 561 new_rip: usize, 562 ) -> ! { 563 *(trapframe_vaddr as *mut TrapFrame) = trap_frame; 564 compiler_fence(Ordering::SeqCst); 565 asm!( 566 "swapgs", 567 "mov rsp, {trapframe_vaddr}", 568 "push {new_rip}", 569 "ret", 570 trapframe_vaddr = in(reg) trapframe_vaddr, 571 new_rip = in(reg) new_rip 572 ); 573 unreachable!() 574 } 575 576 // bitflags! { 577 // pub struct ProcessThreadFlags: u32 { 578 // /* 579 // * thread information flags 580 // * - these are process state flags that various assembly files 581 // * may need to access 582 // */ 583 // const TIF_NOTIFY_RESUME = 1 << 1; /* callback before returning to user */ 584 // const TIF_SIGPENDING = 1 << 2; /* signal pending */ 585 // const TIF_NEED_RESCHED = 1 << 3; /* rescheduling necessary */ 586 // const TIF_SINGLESTEP = 1 << 4; /* reenable singlestep on user return*/ 587 // const TIF_SSBD = 1 << 5; /* Speculative store bypass disable */ 588 // const TIF_SPEC_IB = 1 << 9; /* Indirect branch speculation mitigation */ 589 // const TIF_SPEC_L1D_FLUSH = 1 << 10; /* Flush L1D on mm switches (processes) */ 590 // const TIF_USER_RETURN_NOTIFY = 1 << 11; /* notify kernel of userspace return */ 591 // const TIF_UPROBE = 1 << 12; /* breakpointed or singlestepping */ 592 // const TIF_PATCH_PENDING = 1 << 13; /* pending live patching update */ 593 // const TIF_NEED_FPU_LOAD = 1 << 14; /* load FPU on return to userspace */ 594 // const TIF_NOCPUID = 1 << 15; /* CPUID is not accessible in userland */ 595 // const TIF_NOTSC = 1 << 16; /* TSC is not accessible in userland */ 596 // const TIF_NOTIFY_SIGNAL = 1 << 17; /* signal notifications exist */ 597 // const TIF_MEMDIE = 1 << 20; /* is terminating due to OOM killer */ 598 // const TIF_POLLING_NRFLAG = 1 << 21; /* idle is polling for TIF_NEED_RESCHED */ 599 // const TIF_IO_BITMAP = 1 << 22; /* uses I/O bitmap */ 600 // const TIF_SPEC_FORCE_UPDATE = 1 << 23; /* Force speculation MSR update in context switch */ 601 // const TIF_FORCED_TF = 1 << 24; /* true if TF in eflags artificially */ 602 // const TIF_BLOCKSTEP = 1 << 25; /* set when we want DEBUGCTLMSR_BTF */ 603 // const TIF_LAZY_MMU_UPDATES = 1 << 27; /* task is updating the mmu lazily */ 604 // const TIF_ADDR32 = 1 << 29; /* 32-bit address space on 64 bits */ 605 // } 606 // } 607