xref: /DragonOS/kernel/src/arch/x86_64/process/mod.rs (revision 83ed0ebc293d5a10245089f627f52770fd5b9dd4)
1 use core::{
2     arch::asm,
3     intrinsics::unlikely,
4     mem::ManuallyDrop,
5     sync::atomic::{compiler_fence, Ordering},
6 };
7 
8 use alloc::{
9     string::String,
10     sync::{Arc, Weak},
11     vec::Vec,
12 };
13 
14 use kdepends::memoffset::offset_of;
15 use x86::{controlregs::Cr4, segmentation::SegmentSelector};
16 
17 use crate::{
18     arch::process::table::TSSManager,
19     exception::InterruptArch,
20     kerror, kwarn,
21     libs::spinlock::SpinLockGuard,
22     mm::{
23         percpu::{PerCpu, PerCpuVar},
24         VirtAddr,
25     },
26     process::{
27         fork::{CloneFlags, KernelCloneArgs},
28         KernelStack, ProcessControlBlock, ProcessFlags, ProcessManager, SwitchResult,
29         SWITCH_RESULT,
30     },
31     syscall::{Syscall, SystemError},
32 };
33 
34 use self::{
35     kthread::kernel_thread_bootstrap_stage1,
36     syscall::ARCH_SET_FS,
37     table::{switch_fs_and_gs, KERNEL_DS, USER_DS},
38 };
39 
40 use super::{fpu::FpState, interrupt::TrapFrame, syscall::X86_64GSData, CurrentIrqArch};
41 
42 mod c_adapter;
43 pub mod kthread;
44 pub mod syscall;
45 pub mod table;
46 
extern "C" {
    /// Return-from-interrupt routine, defined outside this file.
    ///
    /// In this file it is never called directly; only its address is taken and used as the
    /// initial `rip` of a forked user process (`copy_thread`) and as the jump target when
    /// entering user mode (`arch_switch_to_user`).
    fn ret_from_intr();
}
51 
/// Backing storage for the idle process's kernel stack.
///
/// The union is 32768 bytes large and aligned to 32768 bytes.
/// NOTE(review): the alignment presumably matches `KernelStack::ALIGN`, so that masking a
/// stack pointer yields the stack base — confirm against `KernelStack`.
#[allow(dead_code)]
#[repr(align(32768))]
union InitProcUnion {
    /// Holds the kernel stack of the idle process.
    idle_stack: [u8; 32768],
}
58 
/// Zero-initialised idle-stack storage, placed in the `.data.init_proc_union` section and
/// exported under its unmangled name so it can be referenced from outside Rust code.
///
/// NOTE(review): "BSP" presumably refers to the bootstrap processor — confirm.
#[link_section = ".data.init_proc_union"]
#[no_mangle]
static BSP_IDLE_STACK_SPACE: InitProcUnion = InitProcUnion {
    idle_stack: [0; 32768],
};
64 
/// Architecture-specific (x86_64) information stored in the PCB.
///
/// The register fields are accessed by offset (via `offset_of!`) from the assembly in
/// `switch_to_inner`, which saves the outgoing process's registers here and loads the
/// incoming process's registers from here.
#[derive(Debug)]
#[allow(dead_code)]
pub struct ArchPCBInfo {
    /// Saved RFLAGS.
    rflags: usize,
    /// Saved rbx.
    rbx: usize,
    /// Saved r12.
    r12: usize,
    /// Saved r13.
    r13: usize,
    /// Saved r14.
    r14: usize,
    /// Saved r15.
    r15: usize,
    /// Saved frame pointer.
    rbp: usize,
    /// Saved kernel stack pointer.
    rsp: usize,
    /// Address at which the process resumes execution when it is switched to.
    rip: usize,
    /// Saved CR2 value; only exposed through `cr2_mut` in this file.
    cr2: usize,
    /// Saved FS base address.
    fsbase: usize,
    /// Saved GS base address.
    gsbase: usize,
    /// Saved FS segment selector.
    fs: SegmentSelector,
    /// Saved GS segment selector.
    gs: SegmentSelector,
    /// Structure that stores the PCB's syscall stack and temporarily holds the user-mode rsp
    /// during a syscall.
    gsdata: X86_64GSData,
    /// State of the floating-point registers (`None` until it is first saved).
    fp_state: Option<FpState>,
}
88 
89 #[allow(dead_code)]
90 impl ArchPCBInfo {
91     /// 创建一个新的ArchPCBInfo
92     ///
93     /// ## 参数
94     ///
95     /// - `kstack`:内核栈的引用,如果为None,则不会设置rsp和rbp。如果为Some,则会设置rsp和rbp为内核栈的最高地址。
96     ///
97     /// ## 返回值
98     ///
99     /// 返回一个新的ArchPCBInfo
100     pub fn new(kstack: &KernelStack) -> Self {
101         let mut r = Self {
102             rflags: 0,
103             rbx: 0,
104             r12: 0,
105             r13: 0,
106             r14: 0,
107             r15: 0,
108             rbp: 0,
109             rsp: 0,
110             rip: 0,
111             cr2: 0,
112             fsbase: 0,
113             gsbase: 0,
114             gsdata: X86_64GSData {
115                 kaddr: VirtAddr::new(0),
116                 uaddr: VirtAddr::new(0),
117             },
118             fs: KERNEL_DS,
119             gs: KERNEL_DS,
120             fp_state: None,
121         };
122 
123         r.rsp = kstack.stack_max_address().data() - 8;
124         r.rbp = kstack.stack_max_address().data();
125 
126         return r;
127     }
128 
129     pub fn set_stack(&mut self, stack: VirtAddr) {
130         self.rsp = stack.data();
131     }
132 
133     pub fn set_stack_base(&mut self, stack_base: VirtAddr) {
134         self.rbp = stack_base.data();
135     }
136 
137     pub fn rbp(&self) -> usize {
138         self.rbp
139     }
140 
141     pub unsafe fn push_to_stack(&mut self, value: usize) {
142         self.rsp -= core::mem::size_of::<usize>();
143         *(self.rsp as *mut usize) = value;
144     }
145 
146     pub unsafe fn pop_from_stack(&mut self) -> usize {
147         let value = *(self.rsp as *const usize);
148         self.rsp += core::mem::size_of::<usize>();
149         value
150     }
151 
152     pub fn save_fp_state(&mut self) {
153         if self.fp_state.is_none() {
154             self.fp_state = Some(FpState::new());
155         }
156 
157         self.fp_state.as_mut().unwrap().save();
158     }
159 
160     pub fn restore_fp_state(&mut self) {
161         if unlikely(self.fp_state.is_none()) {
162             return;
163         }
164 
165         self.fp_state.as_mut().unwrap().restore();
166     }
167 
168     /// 返回浮点寄存器结构体的副本
169     pub fn fp_state(&self) -> &Option<FpState> {
170         &self.fp_state
171     }
172 
173     // 清空浮点寄存器
174     pub fn clear_fp_state(&mut self) {
175         if unlikely(self.fp_state.is_none()) {
176             kwarn!("fp_state is none");
177             return;
178         }
179 
180         self.fp_state.as_mut().unwrap().clear();
181     }
182     pub unsafe fn save_fsbase(&mut self) {
183         if x86::controlregs::cr4().contains(Cr4::CR4_ENABLE_FSGSBASE) {
184             self.fsbase = x86::current::segmentation::rdfsbase() as usize;
185         } else {
186             self.fsbase = x86::msr::rdmsr(x86::msr::IA32_FS_BASE) as usize;
187         }
188     }
189 
190     pub unsafe fn save_gsbase(&mut self) {
191         if x86::controlregs::cr4().contains(Cr4::CR4_ENABLE_FSGSBASE) {
192             self.gsbase = x86::current::segmentation::rdgsbase() as usize;
193         } else {
194             self.gsbase = x86::msr::rdmsr(x86::msr::IA32_GS_BASE) as usize;
195         }
196     }
197 
198     pub unsafe fn restore_fsbase(&mut self) {
199         if x86::controlregs::cr4().contains(Cr4::CR4_ENABLE_FSGSBASE) {
200             x86::current::segmentation::wrfsbase(self.fsbase as u64);
201         } else {
202             x86::msr::wrmsr(x86::msr::IA32_FS_BASE, self.fsbase as u64);
203         }
204     }
205 
206     pub unsafe fn restore_gsbase(&mut self) {
207         if x86::controlregs::cr4().contains(Cr4::CR4_ENABLE_FSGSBASE) {
208             x86::current::segmentation::wrgsbase(self.gsbase as u64);
209         } else {
210             x86::msr::wrmsr(x86::msr::IA32_GS_BASE, self.gsbase as u64);
211         }
212     }
213 
214     /// 将gsdata写入KernelGsbase寄存器
215     pub unsafe fn store_kernel_gsbase(&self) {
216         x86::msr::wrmsr(
217             x86::msr::IA32_KERNEL_GSBASE,
218             &self.gsdata as *const X86_64GSData as u64,
219         );
220     }
221 
222     /// ### 初始化系统调用栈,不得与PCB内核栈冲突(即传入的应该是一个新的栈,避免栈损坏)
223     pub fn init_syscall_stack(&mut self, stack: &KernelStack) {
224         self.gsdata.set_kstack(stack.stack_max_address() - 8);
225     }
226 
227     pub fn fsbase(&self) -> usize {
228         self.fsbase
229     }
230 
231     pub fn gsbase(&self) -> usize {
232         self.gsbase
233     }
234 
235     pub fn cr2_mut(&mut self) -> &mut usize {
236         &mut self.cr2
237     }
238 
239     pub fn fp_state_mut(&mut self) -> &mut Option<FpState> {
240         &mut self.fp_state
241     }
242 
243     /// ### 克隆ArchPCBInfo,需要注意gsdata也是对应clone的
244     pub fn clone_all(&self) -> Self {
245         Self {
246             rflags: self.rflags,
247             rbx: self.rbx,
248             r12: self.r12,
249             r13: self.r13,
250             r14: self.r14,
251             r15: self.r15,
252             rbp: self.rbp,
253             rsp: self.rsp,
254             rip: self.rip,
255             cr2: self.cr2,
256             fsbase: self.fsbase,
257             gsbase: self.gsbase,
258             fs: self.fs.clone(),
259             gs: self.gs.clone(),
260             gsdata: self.gsdata.clone(),
261             fp_state: self.fp_state,
262         }
263     }
264 
265     // ### 从另一个ArchPCBInfo处clone,gsdata会被保留
266     pub fn clone_from(&mut self, from: &Self) {
267         let gsdata = self.gsdata.clone();
268         *self = from.clone_all();
269         self.gsdata = gsdata;
270     }
271 }
272 
273 impl ProcessControlBlock {
274     /// 获取当前进程的pcb
275     pub fn arch_current_pcb() -> Arc<Self> {
276         // 获取栈指针
277         let ptr = VirtAddr::new(x86::current::registers::rsp() as usize);
278 
279         let stack_base = VirtAddr::new(ptr.data() & (!(KernelStack::ALIGN - 1)));
280 
281         // 从内核栈的最低地址处取出pcb的地址
282         let p = stack_base.data() as *const *const ProcessControlBlock;
283         if unlikely((unsafe { *p }).is_null()) {
284             kerror!("p={:p}", p);
285             panic!("current_pcb is null");
286         }
287         unsafe {
288             // 为了防止内核栈的pcb weak 指针被释放,这里需要将其包装一下
289             let weak_wrapper: ManuallyDrop<Weak<ProcessControlBlock>> =
290                 ManuallyDrop::new(Weak::from_raw(*p));
291 
292             let new_arc: Arc<ProcessControlBlock> = weak_wrapper.upgrade().unwrap();
293             return new_arc;
294         }
295     }
296 }
297 
298 impl ProcessManager {
299     pub fn arch_init() {
300         {
301             // 初始化进程切换结果 per cpu变量
302             let mut switch_res_vec: Vec<SwitchResult> = Vec::new();
303             for _ in 0..PerCpu::MAX_CPU_NUM {
304                 switch_res_vec.push(SwitchResult::new());
305             }
306             unsafe {
307                 SWITCH_RESULT = Some(PerCpuVar::new(switch_res_vec).unwrap());
308             }
309         }
310     }
311     /// fork的过程中复制线程
312     ///
313     /// 由于这个过程与具体的架构相关,所以放在这里
314     pub fn copy_thread(
315         current_pcb: &Arc<ProcessControlBlock>,
316         new_pcb: &Arc<ProcessControlBlock>,
317         clone_args: KernelCloneArgs,
318         current_trapframe: &TrapFrame,
319     ) -> Result<(), SystemError> {
320         let clone_flags = clone_args.flags;
321         let mut child_trapframe = current_trapframe.clone();
322 
323         // 子进程的返回值为0
324         child_trapframe.set_return_value(0);
325 
326         // 设置子进程的栈基址(开始执行中断返回流程时的栈基址)
327         let mut new_arch_guard = new_pcb.arch_info();
328         let kernel_stack_guard = new_pcb.kernel_stack();
329 
330         // 设置子进程在内核态开始执行时的rsp、rbp
331         new_arch_guard.set_stack_base(kernel_stack_guard.stack_max_address());
332 
333         let trap_frame_vaddr: VirtAddr =
334             kernel_stack_guard.stack_max_address() - core::mem::size_of::<TrapFrame>();
335         new_arch_guard.set_stack(trap_frame_vaddr);
336 
337         // 拷贝栈帧
338         unsafe {
339             let usp = clone_args.stack;
340             if usp != 0 {
341                 child_trapframe.rsp = usp as u64;
342             }
343             let trap_frame_ptr = trap_frame_vaddr.data() as *mut TrapFrame;
344             *trap_frame_ptr = child_trapframe;
345         }
346 
347         let current_arch_guard = current_pcb.arch_info_irqsave();
348         new_arch_guard.fsbase = current_arch_guard.fsbase;
349         new_arch_guard.gsbase = current_arch_guard.gsbase;
350         new_arch_guard.fs = current_arch_guard.fs;
351         new_arch_guard.gs = current_arch_guard.gs;
352         new_arch_guard.fp_state = current_arch_guard.fp_state.clone();
353 
354         // 拷贝浮点寄存器的状态
355         if let Some(fp_state) = current_arch_guard.fp_state.as_ref() {
356             new_arch_guard.fp_state = Some(*fp_state);
357         }
358         drop(current_arch_guard);
359 
360         // 设置返回地址(子进程开始执行的指令地址)
361         if new_pcb.flags().contains(ProcessFlags::KTHREAD) {
362             let kthread_bootstrap_stage1_func_addr = kernel_thread_bootstrap_stage1 as usize;
363             new_arch_guard.rip = kthread_bootstrap_stage1_func_addr;
364         } else {
365             new_arch_guard.rip = ret_from_intr as usize;
366         }
367 
368         // 设置tls
369         if clone_flags.contains(CloneFlags::CLONE_SETTLS) {
370             drop(new_arch_guard);
371             Syscall::do_arch_prctl_64(new_pcb, ARCH_SET_FS, clone_args.tls, true)?;
372         }
373 
374         return Ok(());
375     }
376 
377     /// 切换进程
378     ///
379     /// ## 参数
380     ///
381     /// - `prev`:上一个进程的pcb
382     /// - `next`:下一个进程的pcb
383     pub unsafe fn switch_process(prev: Arc<ProcessControlBlock>, next: Arc<ProcessControlBlock>) {
384         assert!(CurrentIrqArch::is_irq_enabled() == false);
385 
386         // 保存浮点寄存器
387         prev.arch_info().save_fp_state();
388         // 切换浮点寄存器
389         next.arch_info().restore_fp_state();
390 
391         // 切换fsbase
392         prev.arch_info().save_fsbase();
393         next.arch_info().restore_fsbase();
394 
395         // 切换gsbase
396         Self::switch_gsbase(&prev, &next);
397 
398         // 切换地址空间
399         let next_addr_space = next.basic().user_vm().as_ref().unwrap().clone();
400         compiler_fence(Ordering::SeqCst);
401 
402         next_addr_space.read().user_mapper.utable.make_current();
403         drop(next_addr_space);
404         compiler_fence(Ordering::SeqCst);
405         // 切换内核栈
406 
407         // 获取arch info的锁,并强制泄露其守卫(切换上下文后,在switch_finish_hook中会释放锁)
408         let next_arch = SpinLockGuard::leak(next.arch_info()) as *mut ArchPCBInfo;
409         let prev_arch = SpinLockGuard::leak(prev.arch_info()) as *mut ArchPCBInfo;
410 
411         (*prev_arch).rip = switch_back as usize;
412 
413         // 恢复当前的 preempt count*2
414         ProcessManager::current_pcb().preempt_enable();
415         ProcessManager::current_pcb().preempt_enable();
416 
417         // 切换tss
418         TSSManager::current_tss().set_rsp(
419             x86::Ring::Ring0,
420             next.kernel_stack().stack_max_address().data() as u64,
421         );
422         SWITCH_RESULT.as_mut().unwrap().get_mut().prev_pcb = Some(prev);
423         SWITCH_RESULT.as_mut().unwrap().get_mut().next_pcb = Some(next);
424         // kdebug!("switch tss ok");
425         compiler_fence(Ordering::SeqCst);
426         // 正式切换上下文
427         switch_to_inner(prev_arch, next_arch);
428     }
429 
430     unsafe fn switch_gsbase(prev: &Arc<ProcessControlBlock>, next: &Arc<ProcessControlBlock>) {
431         asm!("swapgs", options(nostack, preserves_flags));
432         prev.arch_info().save_gsbase();
433         next.arch_info().restore_gsbase();
434         // 将下一个进程的kstack写入kernel_gsbase
435         next.arch_info().store_kernel_gsbase();
436         asm!("swapgs", options(nostack, preserves_flags));
437     }
438 }
439 
/// Saves the current context, switches to the next process and then jumps to the
/// `switch_finish_hook` hook function.
///
/// `prev` arrives in `rdi` and `next` in `rsi`. The assembly:
///
/// - stores `rbx`, `r12`–`r15`, `fs`, `gs`, `rbp`, `rsp` and RFLAGS into `*prev`;
/// - loads `rbx`, `r12`–`r15`, `rbp`, `rsp` and RFLAGS from `*next` (the `fs`/`gs` selectors
///   of `next` are not loaded here);
/// - pushes `rbp` and `rax` onto the old stack before `rsp` is saved; `switch_back` pops
///   them again when `prev` is resumed;
/// - pushes `next.rip` onto the new stack and jumps to `switch_finish_hook`, which — per the
///   comment inside the assembly — is responsible for executing the `ret` that consumes it.
#[naked]
unsafe extern "sysv64" fn switch_to_inner(prev: *mut ArchPCBInfo, next: *mut ArchPCBInfo) {
    asm!(
        // As a quick reminder for those who are unfamiliar with the System V ABI (extern "C"):
        //
        // - the current parameters are passed in the registers `rdi`, `rsi`,
        // - we can modify scratch registers, e.g. rax
        // - we cannot change callee-preserved registers arbitrarily, e.g. rbx, which is why we
        //   store them here in the first place.
        concat!("
        // Save old registers, and load new ones
        mov [rdi + {off_rbx}], rbx
        mov rbx, [rsi + {off_rbx}]

        mov [rdi + {off_r12}], r12
        mov r12, [rsi + {off_r12}]

        mov [rdi + {off_r13}], r13
        mov r13, [rsi + {off_r13}]

        mov [rdi + {off_r14}], r14
        mov r14, [rsi + {off_r14}]

        mov [rdi + {off_r15}], r15
        mov r15, [rsi + {off_r15}]

        // switch segment registers (这些寄存器只能通过接下来的switch_hook的return来切换)
        mov [rdi + {off_fs}], fs
        mov [rdi + {off_gs}], gs

        // mov fs, [rsi + {off_fs}]
        // mov gs, [rsi + {off_gs}]

        push rbp
        push rax

        mov [rdi + {off_rbp}], rbp
        mov rbp, [rsi + {off_rbp}]

        mov [rdi + {off_rsp}], rsp
        mov rsp, [rsi + {off_rsp}]

        // // push RFLAGS (can only be modified via stack)
        pushfq
        // // pop RFLAGS into `self.rflags`
        pop QWORD PTR [rdi + {off_rflags}]

        // // push `next.rflags`
        push QWORD PTR [rsi + {off_rflags}]
        // // pop into RFLAGS
        popfq

        // push next rip to stack
        push QWORD PTR [rsi + {off_rip}]


        // When we return, we cannot even guarantee that the return address on the stack, points to
        // the calling function. Thus, we have to execute this Rust hook by
        // ourselves, which will unlock the contexts before the later switch.

        // Note that switch_finish_hook will be responsible for executing `ret`.
        jmp {switch_hook}
        "),

        off_rflags = const(offset_of!(ArchPCBInfo, rflags)),

        off_rbx = const(offset_of!(ArchPCBInfo, rbx)),
        off_r12 = const(offset_of!(ArchPCBInfo, r12)),
        off_r13 = const(offset_of!(ArchPCBInfo, r13)),
        off_r14 = const(offset_of!(ArchPCBInfo, r14)),
        off_rbp = const(offset_of!(ArchPCBInfo, rbp)),
        off_rsp = const(offset_of!(ArchPCBInfo, rsp)),
        off_r15 = const(offset_of!(ArchPCBInfo, r15)),
        off_rip = const(offset_of!(ArchPCBInfo, rip)),
        off_fs = const(offset_of!(ArchPCBInfo, fs)),
        off_gs = const(offset_of!(ArchPCBInfo, gs)),

        switch_hook = sym crate::process::switch_finish_hook,
        options(noreturn),
    );
}
522 
/// Executed after returning from `switch_to_inner`.
///
/// In other words, when a process is scheduled again, it resumes execution here:
/// `switch_process` stores this function's address as the `rip` of the process being
/// switched out. The two pops undo the `push rbp` / `push rax` that `switch_to_inner`
/// performed on that process's stack before saving its `rsp`.
#[inline(never)]
unsafe extern "sysv64" fn switch_back() {
    asm!(concat!(
        "
        pop rax
        pop rbp
        "
    ))
}
535 
/// Turns the current kernel thread into a user process by exec'ing `path` and jumping to
/// user mode; never returns.
///
/// ## Parameters
///
/// - `path`: path of the program passed to `do_execve`.
/// - `argv`: argument vector passed to `do_execve`.
/// - `envp`: environment vector passed to `do_execve`.
///
/// ## Panics
///
/// Panics if the current stack pointer is not below the address reserved for the trap frame
/// at the top of the kernel stack, or if `do_execve` fails.
///
/// # Safety
///
/// Disables interrupts, rewrites the saved arch state of the current process and finally
/// abandons the current stack frame via `ready_to_switch_to_user`, so no value that still
/// needs to be dropped may be alive at that point.
pub unsafe fn arch_switch_to_user(path: String, argv: Vec<String>, envp: Vec<String>) -> ! {
    // No interrupt may occur in the code below.
    CurrentIrqArch::interrupt_disable();

    let current_pcb = ProcessManager::current_pcb();
    // The trap frame lives at the very top of the current process's kernel stack.
    let trap_frame_vaddr = VirtAddr::new(
        current_pcb.kernel_stack().stack_max_address().data() - core::mem::size_of::<TrapFrame>(),
    );
    // kdebug!("trap_frame_vaddr: {:?}", trap_frame_vaddr);
    let new_rip = VirtAddr::new(ret_from_intr as usize);

    // The stack currently in use must lie entirely below the trap frame slot, otherwise
    // writing the trap frame would overwrite live stack data.
    assert!(
        (x86::current::registers::rsp() as usize) < trap_frame_vaddr.data(),
        "arch_switch_to_user(): current_rsp >= fake trap
        frame vaddr, this may cause some illegal access to memory!
        rsp: {:#x}, trap_frame_vaddr: {:#x}",
        x86::current::registers::rsp() as usize,
        trap_frame_vaddr.data()
    );

    let mut arch_guard = current_pcb.arch_info_irqsave();
    arch_guard.rsp = trap_frame_vaddr.data();

    arch_guard.fs = USER_DS;
    arch_guard.gs = USER_DS;

    // Load the kernel gs data into the CPU.
    arch_guard.store_kernel_gsbase();

    switch_fs_and_gs(
        SegmentSelector::from_bits_truncate(arch_guard.fs.bits()),
        SegmentSelector::from_bits_truncate(arch_guard.gs.bits()),
    );
    arch_guard.rip = new_rip.data();

    drop(arch_guard);

    // Remove the kthread flag.
    current_pcb.flags().remove(ProcessFlags::KTHREAD);
    current_pcb.worker_private().take();

    let mut trap_frame = TrapFrame::new();

    compiler_fence(Ordering::SeqCst);
    Syscall::do_execve(path, argv, envp, &mut trap_frame).unwrap_or_else(|e| {
        panic!(
            "arch_switch_to_user(): pid: {pid:?}, Failed to execve: , error: {e:?}",
            pid = current_pcb.pid(),
            e = e
        );
    });
    compiler_fence(Ordering::SeqCst);

    // Important! From here on, every reference-counted variable, dynamically allocated
    // variable and lock guard above must already have been dropped, otherwise memory-safety
    // problems may result!

    drop(current_pcb);

    compiler_fence(Ordering::SeqCst);
    ready_to_switch_to_user(trap_frame, trap_frame_vaddr.data(), new_rip.data());
}
596 
/// Writes the trap frame to its final location and jumps to `new_rip` on that stack.
///
/// Must not be inlined, because it relies on `ret` to switch to user mode.
///
/// ## Parameters
///
/// - `trap_frame`: the trap frame to place at `trapframe_vaddr`.
/// - `trapframe_vaddr`: address the trap frame is written to; it also becomes the stack
///   pointer seen by the code at `new_rip`.
/// - `new_rip`: address to jump to (the caller in this file passes `ret_from_intr`).
///
/// # Safety
///
/// `trapframe_vaddr` must be valid for writing a `TrapFrame` and must not overlap the stack
/// frame currently in use. The current stack is abandoned by the assembly below.
#[inline(never)]
unsafe extern "sysv64" fn ready_to_switch_to_user(
    trap_frame: TrapFrame,
    trapframe_vaddr: usize,
    new_rip: usize,
) -> ! {
    *(trapframe_vaddr as *mut TrapFrame) = trap_frame;
    // `swapgs`, then pivot the stack onto the trap frame; `push` + `ret` transfers control to
    // `new_rip` and leaves `rsp` equal to `trapframe_vaddr` again.
    asm!(
        "swapgs",
        "mov rsp, {trapframe_vaddr}",
        "push {new_rip}",
        "ret",
        trapframe_vaddr = in(reg) trapframe_vaddr,
        new_rip = in(reg) new_rip
    );
    unreachable!()
}
615