xref: /DragonOS/kernel/src/arch/x86_64/process/mod.rs (revision b5b571e02693d91eb6918d3b7561e088c3e7ee81)
1 use core::{
2     arch::asm,
3     intrinsics::unlikely,
4     mem::ManuallyDrop,
5     sync::atomic::{compiler_fence, Ordering},
6 };
7 
8 use alloc::{
9     string::String,
10     sync::{Arc, Weak},
11     vec::Vec,
12 };
13 
14 use kdepends::memoffset::offset_of;
15 use system_error::SystemError;
16 use x86::{controlregs::Cr4, segmentation::SegmentSelector};
17 
18 use crate::{
19     arch::process::table::TSSManager,
20     exception::InterruptArch,
21     kerror, kwarn,
22     libs::spinlock::SpinLockGuard,
23     mm::{
24         percpu::{PerCpu, PerCpuVar},
25         VirtAddr,
26     },
27     process::{
28         fork::{CloneFlags, KernelCloneArgs},
29         KernelStack, ProcessControlBlock, ProcessFlags, ProcessManager, SwitchResult,
30         SWITCH_RESULT,
31     },
32     syscall::Syscall,
33 };
34 
35 use self::{
36     kthread::kernel_thread_bootstrap_stage1,
37     syscall::ARCH_SET_FS,
38     table::{switch_fs_and_gs, KERNEL_DS, USER_DS},
39 };
40 
41 use super::{fpu::FpState, interrupt::TrapFrame, syscall::X86_64GSData, CurrentIrqArch};
42 
43 pub mod idle;
44 pub mod kthread;
45 pub mod syscall;
46 pub mod table;
47 
extern "C" {
    /// Return-from-interrupt entry point, defined outside this file.
    ///
    /// This file never calls it directly; it only takes its address, which is
    /// used as the initial `rip` of non-kernel-thread children in `copy_thread`
    /// and as the jump target in `arch_switch_to_user`.
    fn ret_from_intr();
}
52 
/// 32768-byte storage block, aligned to 32768 bytes.
///
/// NOTE(review): the alignment presumably has to match `KernelStack::ALIGN`,
/// because `arch_current_pcb` finds the bottom of the stack by masking `rsp`
/// with that constant — confirm.
#[allow(dead_code)]
#[repr(align(32768))]
union InitProcUnion {
    /// Holds the kernel stack of the idle process.
    idle_stack: [u8; 32768],
}
59 
/// Zero-initialised idle-stack storage, emitted into the
/// `.data.init_proc_union` link section under an unmangled symbol name.
///
/// NOTE(review): nothing in this file reads the symbol; it is presumably
/// referenced by the linker script or early boot code for the bootstrap
/// processor — confirm.
#[link_section = ".data.init_proc_union"]
#[no_mangle]
static BSP_IDLE_STACK_SPACE: InitProcUnion = InitProcUnion {
    idle_stack: [0; 32768],
};
65 
/// Architecture-specific information stored in a PCB.
///
/// `rflags`, `rbx`, `r12`–`r15`, `rbp`, `rsp`, `rip`, `fs` and `gs` are read
/// and written by field offset (`offset_of!`) from the assembly in
/// `switch_to_inner`.
#[derive(Debug)]
#[allow(dead_code)]
pub struct ArchPCBInfo {
    // Saved flags register; stored and reloaded via pushfq/popfq in `switch_to_inner`.
    rflags: usize,
    // Saved callee-preserved general-purpose registers.
    rbx: usize,
    r12: usize,
    r13: usize,
    r14: usize,
    r15: usize,
    // Saved frame pointer and stack pointer.
    rbp: usize,
    rsp: usize,
    // Address pushed onto the stack by `switch_to_inner` so that execution
    // resumes there when this task is switched in.
    rip: usize,
    // NOTE(review): presumably the saved page-fault address (CR2); this file
    // only exposes it through `cr2_mut` — confirm against its users.
    cr2: usize,
    // FS/GS base addresses, saved and restored by the `save_*base` /
    // `restore_*base` methods.
    fsbase: usize,
    gsbase: usize,
    // FS/GS segment selectors.
    fs: SegmentSelector,
    gs: SegmentSelector,
    /// Structure holding the PCB's syscall stack and temporarily storing the
    /// user-mode rsp during a syscall.
    gsdata: X86_64GSData,
    /// State of the floating-point registers (`None` until first saved).
    fp_state: Option<FpState>,
}
89 
90 #[allow(dead_code)]
91 impl ArchPCBInfo {
92     /// 创建一个新的ArchPCBInfo
93     ///
94     /// ## 参数
95     ///
96     /// - `kstack`:内核栈的引用,如果为None,则不会设置rsp和rbp。如果为Some,则会设置rsp和rbp为内核栈的最高地址。
97     ///
98     /// ## 返回值
99     ///
100     /// 返回一个新的ArchPCBInfo
101     #[inline(never)]
102     pub fn new(kstack: &KernelStack) -> Self {
103         let mut r = Self {
104             rflags: 0,
105             rbx: 0,
106             r12: 0,
107             r13: 0,
108             r14: 0,
109             r15: 0,
110             rbp: 0,
111             rsp: 0,
112             rip: 0,
113             cr2: 0,
114             fsbase: 0,
115             gsbase: 0,
116             gsdata: X86_64GSData {
117                 kaddr: VirtAddr::new(0),
118                 uaddr: VirtAddr::new(0),
119             },
120             fs: KERNEL_DS,
121             gs: KERNEL_DS,
122             fp_state: None,
123         };
124 
125         r.rsp = kstack.stack_max_address().data() - 8;
126         r.rbp = kstack.stack_max_address().data();
127 
128         return r;
129     }
130 
131     pub fn set_stack(&mut self, stack: VirtAddr) {
132         self.rsp = stack.data();
133     }
134 
135     pub fn set_stack_base(&mut self, stack_base: VirtAddr) {
136         self.rbp = stack_base.data();
137     }
138 
139     pub fn rbp(&self) -> usize {
140         self.rbp
141     }
142 
143     pub unsafe fn push_to_stack(&mut self, value: usize) {
144         self.rsp -= core::mem::size_of::<usize>();
145         *(self.rsp as *mut usize) = value;
146     }
147 
148     pub unsafe fn pop_from_stack(&mut self) -> usize {
149         let value = *(self.rsp as *const usize);
150         self.rsp += core::mem::size_of::<usize>();
151         value
152     }
153 
154     pub fn save_fp_state(&mut self) {
155         if self.fp_state.is_none() {
156             self.fp_state = Some(FpState::new());
157         }
158 
159         self.fp_state.as_mut().unwrap().save();
160     }
161 
162     pub fn restore_fp_state(&mut self) {
163         if unlikely(self.fp_state.is_none()) {
164             return;
165         }
166 
167         self.fp_state.as_mut().unwrap().restore();
168     }
169 
170     /// 返回浮点寄存器结构体的副本
171     pub fn fp_state(&self) -> &Option<FpState> {
172         &self.fp_state
173     }
174 
175     // 清空浮点寄存器
176     pub fn clear_fp_state(&mut self) {
177         if unlikely(self.fp_state.is_none()) {
178             kwarn!("fp_state is none");
179             return;
180         }
181 
182         self.fp_state.as_mut().unwrap().clear();
183     }
184     pub unsafe fn save_fsbase(&mut self) {
185         if x86::controlregs::cr4().contains(Cr4::CR4_ENABLE_FSGSBASE) {
186             self.fsbase = x86::current::segmentation::rdfsbase() as usize;
187         } else {
188             self.fsbase = x86::msr::rdmsr(x86::msr::IA32_FS_BASE) as usize;
189         }
190     }
191 
192     pub unsafe fn save_gsbase(&mut self) {
193         if x86::controlregs::cr4().contains(Cr4::CR4_ENABLE_FSGSBASE) {
194             self.gsbase = x86::current::segmentation::rdgsbase() as usize;
195         } else {
196             self.gsbase = x86::msr::rdmsr(x86::msr::IA32_GS_BASE) as usize;
197         }
198     }
199 
200     pub unsafe fn restore_fsbase(&mut self) {
201         if x86::controlregs::cr4().contains(Cr4::CR4_ENABLE_FSGSBASE) {
202             x86::current::segmentation::wrfsbase(self.fsbase as u64);
203         } else {
204             x86::msr::wrmsr(x86::msr::IA32_FS_BASE, self.fsbase as u64);
205         }
206     }
207 
208     pub unsafe fn restore_gsbase(&mut self) {
209         if x86::controlregs::cr4().contains(Cr4::CR4_ENABLE_FSGSBASE) {
210             x86::current::segmentation::wrgsbase(self.gsbase as u64);
211         } else {
212             x86::msr::wrmsr(x86::msr::IA32_GS_BASE, self.gsbase as u64);
213         }
214     }
215 
216     /// 将gsdata写入KernelGsbase寄存器
217     pub unsafe fn store_kernel_gsbase(&self) {
218         x86::msr::wrmsr(
219             x86::msr::IA32_KERNEL_GSBASE,
220             &self.gsdata as *const X86_64GSData as u64,
221         );
222     }
223 
224     /// ### 初始化系统调用栈,不得与PCB内核栈冲突(即传入的应该是一个新的栈,避免栈损坏)
225     pub fn init_syscall_stack(&mut self, stack: &KernelStack) {
226         self.gsdata.set_kstack(stack.stack_max_address() - 8);
227     }
228 
229     pub fn fsbase(&self) -> usize {
230         self.fsbase
231     }
232 
233     pub fn gsbase(&self) -> usize {
234         self.gsbase
235     }
236 
237     pub fn cr2_mut(&mut self) -> &mut usize {
238         &mut self.cr2
239     }
240 
241     pub fn fp_state_mut(&mut self) -> &mut Option<FpState> {
242         &mut self.fp_state
243     }
244 
245     /// ### 克隆ArchPCBInfo,需要注意gsdata也是对应clone的
246     pub fn clone_all(&self) -> Self {
247         Self {
248             rflags: self.rflags,
249             rbx: self.rbx,
250             r12: self.r12,
251             r13: self.r13,
252             r14: self.r14,
253             r15: self.r15,
254             rbp: self.rbp,
255             rsp: self.rsp,
256             rip: self.rip,
257             cr2: self.cr2,
258             fsbase: self.fsbase,
259             gsbase: self.gsbase,
260             fs: self.fs,
261             gs: self.gs,
262             gsdata: self.gsdata.clone(),
263             fp_state: self.fp_state,
264         }
265     }
266 
267     // ### 从另一个ArchPCBInfo处clone,gsdata会被保留
268     pub fn clone_from(&mut self, from: &Self) {
269         let gsdata = self.gsdata.clone();
270         *self = from.clone_all();
271         self.gsdata = gsdata;
272     }
273 }
274 
275 impl ProcessControlBlock {
276     /// 获取当前进程的pcb
277     pub fn arch_current_pcb() -> Arc<Self> {
278         // 获取栈指针
279         let ptr = VirtAddr::new(x86::current::registers::rsp() as usize);
280 
281         let stack_base = VirtAddr::new(ptr.data() & (!(KernelStack::ALIGN - 1)));
282 
283         // 从内核栈的最低地址处取出pcb的地址
284         let p = stack_base.data() as *const *const ProcessControlBlock;
285         if unlikely((unsafe { *p }).is_null()) {
286             kerror!("p={:p}", p);
287             panic!("current_pcb is null");
288         }
289         unsafe {
290             // 为了防止内核栈的pcb weak 指针被释放,这里需要将其包装一下
291             let weak_wrapper: ManuallyDrop<Weak<ProcessControlBlock>> =
292                 ManuallyDrop::new(Weak::from_raw(*p));
293 
294             let new_arc: Arc<ProcessControlBlock> = weak_wrapper.upgrade().unwrap();
295             return new_arc;
296         }
297     }
298 }
299 
300 impl ProcessManager {
301     pub fn arch_init() {
302         {
303             // 初始化进程切换结果 per cpu变量
304             let mut switch_res_vec: Vec<SwitchResult> = Vec::new();
305             for _ in 0..PerCpu::MAX_CPU_NUM {
306                 switch_res_vec.push(SwitchResult::new());
307             }
308             unsafe {
309                 SWITCH_RESULT = Some(PerCpuVar::new(switch_res_vec).unwrap());
310             }
311         }
312     }
313     /// fork的过程中复制线程
314     ///
315     /// 由于这个过程与具体的架构相关,所以放在这里
316     pub fn copy_thread(
317         current_pcb: &Arc<ProcessControlBlock>,
318         new_pcb: &Arc<ProcessControlBlock>,
319         clone_args: KernelCloneArgs,
320         current_trapframe: &TrapFrame,
321     ) -> Result<(), SystemError> {
322         let clone_flags = clone_args.flags;
323         let mut child_trapframe = *current_trapframe;
324 
325         // 子进程的返回值为0
326         child_trapframe.set_return_value(0);
327 
328         // 设置子进程的栈基址(开始执行中断返回流程时的栈基址)
329         let mut new_arch_guard = unsafe { new_pcb.arch_info() };
330         let kernel_stack_guard = new_pcb.kernel_stack();
331 
332         // 设置子进程在内核态开始执行时的rsp、rbp
333         new_arch_guard.set_stack_base(kernel_stack_guard.stack_max_address());
334 
335         let trap_frame_vaddr: VirtAddr =
336             kernel_stack_guard.stack_max_address() - core::mem::size_of::<TrapFrame>();
337         new_arch_guard.set_stack(trap_frame_vaddr);
338 
339         // 拷贝栈帧
340         unsafe {
341             let usp = clone_args.stack;
342             if usp != 0 {
343                 child_trapframe.rsp = usp as u64;
344             }
345             let trap_frame_ptr = trap_frame_vaddr.data() as *mut TrapFrame;
346             *trap_frame_ptr = child_trapframe;
347         }
348 
349         let current_arch_guard = current_pcb.arch_info_irqsave();
350         new_arch_guard.fsbase = current_arch_guard.fsbase;
351         new_arch_guard.gsbase = current_arch_guard.gsbase;
352         new_arch_guard.fs = current_arch_guard.fs;
353         new_arch_guard.gs = current_arch_guard.gs;
354         new_arch_guard.fp_state = current_arch_guard.fp_state;
355 
356         // 拷贝浮点寄存器的状态
357         if let Some(fp_state) = current_arch_guard.fp_state.as_ref() {
358             new_arch_guard.fp_state = Some(*fp_state);
359         }
360         drop(current_arch_guard);
361 
362         // 设置返回地址(子进程开始执行的指令地址)
363         if new_pcb.flags().contains(ProcessFlags::KTHREAD) {
364             let kthread_bootstrap_stage1_func_addr = kernel_thread_bootstrap_stage1 as usize;
365             new_arch_guard.rip = kthread_bootstrap_stage1_func_addr;
366         } else {
367             new_arch_guard.rip = ret_from_intr as usize;
368         }
369 
370         // 设置tls
371         if clone_flags.contains(CloneFlags::CLONE_SETTLS) {
372             drop(new_arch_guard);
373             Syscall::do_arch_prctl_64(new_pcb, ARCH_SET_FS, clone_args.tls, true)?;
374         }
375 
376         return Ok(());
377     }
378 
379     /// 切换进程
380     ///
381     /// ## 参数
382     ///
383     /// - `prev`:上一个进程的pcb
384     /// - `next`:下一个进程的pcb
385     pub unsafe fn switch_process(prev: Arc<ProcessControlBlock>, next: Arc<ProcessControlBlock>) {
386         assert!(!CurrentIrqArch::is_irq_enabled());
387 
388         // 保存浮点寄存器
389         prev.arch_info_irqsave().save_fp_state();
390         // 切换浮点寄存器
391         next.arch_info_irqsave().restore_fp_state();
392 
393         // 切换fsbase
394         prev.arch_info_irqsave().save_fsbase();
395         next.arch_info_irqsave().restore_fsbase();
396 
397         // 切换gsbase
398         Self::switch_gsbase(&prev, &next);
399 
400         // 切换地址空间
401         let next_addr_space = next.basic().user_vm().as_ref().unwrap().clone();
402         compiler_fence(Ordering::SeqCst);
403 
404         next_addr_space.read().user_mapper.utable.make_current();
405         drop(next_addr_space);
406         compiler_fence(Ordering::SeqCst);
407         // 切换内核栈
408 
409         // 获取arch info的锁,并强制泄露其守卫(切换上下文后,在switch_finish_hook中会释放锁)
410         let next_arch = SpinLockGuard::leak(next.arch_info_irqsave()) as *mut ArchPCBInfo;
411         let prev_arch = SpinLockGuard::leak(prev.arch_info_irqsave()) as *mut ArchPCBInfo;
412 
413         (*prev_arch).rip = switch_back as usize;
414 
415         // 恢复当前的 preempt count*2
416         ProcessManager::current_pcb().preempt_enable();
417         ProcessManager::current_pcb().preempt_enable();
418 
419         // 切换tss
420         TSSManager::current_tss().set_rsp(
421             x86::Ring::Ring0,
422             next.kernel_stack().stack_max_address().data() as u64,
423         );
424         SWITCH_RESULT.as_mut().unwrap().get_mut().prev_pcb = Some(prev);
425         SWITCH_RESULT.as_mut().unwrap().get_mut().next_pcb = Some(next);
426         // kdebug!("switch tss ok");
427         compiler_fence(Ordering::SeqCst);
428         // 正式切换上下文
429         switch_to_inner(prev_arch, next_arch);
430     }
431 
432     unsafe fn switch_gsbase(prev: &Arc<ProcessControlBlock>, next: &Arc<ProcessControlBlock>) {
433         asm!("swapgs", options(nostack, preserves_flags));
434         prev.arch_info_irqsave().save_gsbase();
435         next.arch_info_irqsave().restore_gsbase();
436         // 将下一个进程的kstack写入kernel_gsbase
437         next.arch_info_irqsave().store_kernel_gsbase();
438         asm!("swapgs", options(nostack, preserves_flags));
439     }
440 }
441 
/// Saves the current context, switches to the next process, then jumps to the
/// `switch_finish_hook` hook function.
///
/// In order, the assembly below:
///
/// 1. stores `rbx` and `r12`–`r15` into `*prev` and loads them from `*next`;
/// 2. stores the `fs`/`gs` selectors into `*prev` (they are not reloaded here);
/// 3. pushes `rbp` and `rax` onto the outgoing stack (`switch_back` pops them
///    again in reverse order);
/// 4. stores `rbp`/`rsp` into `*prev` and loads them from `*next`, which
///    switches stacks;
/// 5. stores RFLAGS into `*prev` and loads RFLAGS from `*next`, using the new
///    stack as scratch space;
/// 6. pushes `next.rip` and jumps to the hook, whose `ret` then resumes
///    execution at that address.
///
/// `prev` arrives in `rdi` and `next` in `rsi`.
#[naked]
unsafe extern "sysv64" fn switch_to_inner(prev: *mut ArchPCBInfo, next: *mut ArchPCBInfo) {
    asm!(
        // As a quick reminder for those who are unfamiliar with the System V ABI (extern "C"):
        //
        // - the current parameters are passed in the registers `rdi`, `rsi`,
        // - we can modify scratch registers, e.g. rax
        // - we cannot change callee-preserved registers arbitrarily, e.g. rbx, which is why we
        //   store them here in the first place.
        concat!("
        // Save old registers, and load new ones
        mov [rdi + {off_rbx}], rbx
        mov rbx, [rsi + {off_rbx}]

        mov [rdi + {off_r12}], r12
        mov r12, [rsi + {off_r12}]

        mov [rdi + {off_r13}], r13
        mov r13, [rsi + {off_r13}]

        mov [rdi + {off_r14}], r14
        mov r14, [rsi + {off_r14}]

        mov [rdi + {off_r15}], r15
        mov r15, [rsi + {off_r15}]

        // switch segment registers (这些寄存器只能通过接下来的switch_hook的return来切换)
        mov [rdi + {off_fs}], fs
        mov [rdi + {off_gs}], gs

        // mov fs, [rsi + {off_fs}]
        // mov gs, [rsi + {off_gs}]

        push rbp
        push rax

        mov [rdi + {off_rbp}], rbp
        mov rbp, [rsi + {off_rbp}]

        mov [rdi + {off_rsp}], rsp
        mov rsp, [rsi + {off_rsp}]

        // // push RFLAGS (can only be modified via stack)
        pushfq
        // // pop RFLAGS into `self.rflags`
        pop QWORD PTR [rdi + {off_rflags}]

        // // push `next.rflags`
        push QWORD PTR [rsi + {off_rflags}]
        // // pop into RFLAGS
        popfq

        // push next rip to stack
        push QWORD PTR [rsi + {off_rip}]


        // When we return, we cannot even guarantee that the return address on the stack, points to
        // the calling function. Thus, we have to execute this Rust hook by
        // ourselves, which will unlock the contexts before the later switch.

        // Note that switch_finish_hook will be responsible for executing `ret`.
        jmp {switch_hook}
        "),

        // Byte offsets of the `ArchPCBInfo` fields used by the assembly above.
        off_rflags = const(offset_of!(ArchPCBInfo, rflags)),

        off_rbx = const(offset_of!(ArchPCBInfo, rbx)),
        off_r12 = const(offset_of!(ArchPCBInfo, r12)),
        off_r13 = const(offset_of!(ArchPCBInfo, r13)),
        off_r14 = const(offset_of!(ArchPCBInfo, r14)),
        off_rbp = const(offset_of!(ArchPCBInfo, rbp)),
        off_rsp = const(offset_of!(ArchPCBInfo, rsp)),
        off_r15 = const(offset_of!(ArchPCBInfo, r15)),
        off_rip = const(offset_of!(ArchPCBInfo, rip)),
        off_fs = const(offset_of!(ArchPCBInfo, fs)),
        off_gs = const(offset_of!(ArchPCBInfo, gs)),

        switch_hook = sym crate::process::switch_finish_hook,
        options(noreturn),
    );
}
524 
/// Runs after "returning" from `switch_to_inner`.
///
/// In other words, when a process is scheduled again, it resumes execution
/// here: `switch_process` stores this function's address as the saved `rip`
/// of the outgoing process.
///
/// The two pops undo, in reverse order, the `push rbp` / `push rax` that
/// `switch_to_inner` performed on this stack before switching away.
///
/// NOTE(review): this function is not `#[naked]`, so the pops are only correct
/// if the compiler emits no prologue that touches the stack before the asm
/// block — confirm for the build configuration in use.
#[inline(never)]
unsafe extern "sysv64" fn switch_back() {
    asm!(concat!(
        "
        pop rax
        pop rbp
        "
    ))
}
537 
/// Turns the current task into a user-mode task running the program at
/// `path`, and never returns.
///
/// The function reserves room for a `TrapFrame` at the top of the current
/// kernel stack, switches the task's `fs`/`gs` selectors to `USER_DS`, clears
/// the `KTHREAD` flag, lets `Syscall::do_execve` fill in a fresh trap frame
/// and finally hands over to `ready_to_switch_to_user`, which continues at
/// `ret_from_intr`.
///
/// ## Parameters
///
/// - `path`, `argv`, `envp`: forwarded unchanged to `Syscall::do_execve`.
///
/// # Panics
///
/// Panics if the current `rsp` is not below the address reserved for the trap
/// frame, or if `do_execve` fails.
///
/// # Safety
///
/// NOTE(review): presumably must only be called from a kernel thread that is
/// meant to become a user process — confirm against callers.
pub unsafe fn arch_switch_to_user(path: String, argv: Vec<String>, envp: Vec<String>) -> ! {
    // No interrupt may occur in the code below.
    CurrentIrqArch::interrupt_disable();

    let current_pcb = ProcessManager::current_pcb();
    // The fake trap frame occupies the topmost bytes of the kernel stack.
    let trap_frame_vaddr = VirtAddr::new(
        current_pcb.kernel_stack().stack_max_address().data() - core::mem::size_of::<TrapFrame>(),
    );
    // kdebug!("trap_frame_vaddr: {:?}", trap_frame_vaddr);
    let new_rip = VirtAddr::new(ret_from_intr as usize);

    // The live part of the stack must not overlap the area that is about to be
    // overwritten with the trap frame.
    assert!(
        (x86::current::registers::rsp() as usize) < trap_frame_vaddr.data(),
        "arch_switch_to_user(): current_rsp >= fake trap
        frame vaddr, this may cause some illegal access to memory!
        rsp: {:#x}, trap_frame_vaddr: {:#x}",
        x86::current::registers::rsp() as usize,
        trap_frame_vaddr.data()
    );

    let mut arch_guard = current_pcb.arch_info_irqsave();
    arch_guard.rsp = trap_frame_vaddr.data();

    arch_guard.fs = USER_DS;
    arch_guard.gs = USER_DS;

    // Load the kernel GS data into the CPU (writes IA32_KERNEL_GSBASE).
    arch_guard.store_kernel_gsbase();

    switch_fs_and_gs(
        SegmentSelector::from_bits_truncate(arch_guard.fs.bits()),
        SegmentSelector::from_bits_truncate(arch_guard.gs.bits()),
    );
    arch_guard.rip = new_rip.data();

    drop(arch_guard);

    // Remove the kthread flag.
    current_pcb.flags().remove(ProcessFlags::KTHREAD);
    current_pcb.worker_private().take();

    let mut trap_frame = TrapFrame::new();

    compiler_fence(Ordering::SeqCst);
    Syscall::do_execve(path, argv, envp, &mut trap_frame).unwrap_or_else(|e| {
        panic!(
            "arch_switch_to_user(): pid: {pid:?}, Failed to execve: , error: {e:?}",
            pid = current_pcb.pid(),
            e = e
        );
    });
    compiler_fence(Ordering::SeqCst);

    // Important! From this point on, every reference-counted value,
    // dynamically allocated value and lock guard above must already have been
    // dropped, otherwise memory-safety problems may result.

    drop(current_pcb);

    compiler_fence(Ordering::SeqCst);
    ready_to_switch_to_user(trap_frame, trap_frame_vaddr.data(), new_rip.data());
}
598 
/// Final step of entering user mode: installs the trap frame and jumps to
/// `new_rip`.
///
/// Must not be inlined, because the switch to user mode relies on `ret`.
///
/// ## Parameters
///
/// - `trap_frame`: the trap frame to install.
/// - `trapframe_vaddr`: address the trap frame is written to; it also becomes
///   the new `rsp`.
/// - `new_rip`: address that is pushed and then returned to.
///
/// # Safety
///
/// `trapframe_vaddr` must be valid for writing a `TrapFrame`, and nothing
/// still in use may live at or above it on the current stack, because `rsp`
/// is moved there.
#[inline(never)]
unsafe extern "sysv64" fn ready_to_switch_to_user(
    trap_frame: TrapFrame,
    trapframe_vaddr: usize,
    new_rip: usize,
) -> ! {
    // Place the trap frame at its final location.
    *(trapframe_vaddr as *mut TrapFrame) = trap_frame;
    // Swap GS bases, point rsp at the trap frame, then "return" to `new_rip`
    // by pushing it and executing `ret`.
    asm!(
        "swapgs",
        "mov rsp, {trapframe_vaddr}",
        "push {new_rip}",
        "ret",
        trapframe_vaddr = in(reg) trapframe_vaddr,
        new_rip = in(reg) new_rip
    );
    // NOTE(review): the asm block leaves through `ret` but is not marked
    // `options(noreturn)`; this line is presumably never reached — confirm.
    unreachable!()
}
617