xref: /DragonOS/kernel/src/arch/x86_64/process/mod.rs (revision dfe53cf087ef4c7b6db63d992906b062dc63e93f)
1 use core::{
2     arch::asm,
3     intrinsics::unlikely,
4     mem::ManuallyDrop,
5     sync::atomic::{compiler_fence, Ordering},
6 };
7 
8 use alloc::{
9     string::String,
10     sync::{Arc, Weak},
11     vec::Vec,
12 };
13 
14 use kdepends::memoffset::offset_of;
15 use system_error::SystemError;
16 use x86::{controlregs::Cr4, segmentation::SegmentSelector};
17 
18 use crate::{
19     arch::process::table::TSSManager,
20     exception::InterruptArch,
21     kerror, kwarn,
22     libs::spinlock::SpinLockGuard,
23     mm::VirtAddr,
24     process::{
25         fork::{CloneFlags, KernelCloneArgs},
26         KernelStack, ProcessControlBlock, ProcessFlags, ProcessManager, PROCESS_SWITCH_RESULT,
27     },
28     syscall::Syscall,
29 };
30 
31 use self::{
32     kthread::kernel_thread_bootstrap_stage1,
33     syscall::ARCH_SET_FS,
34     table::{switch_fs_and_gs, KERNEL_DS, USER_DS},
35 };
36 
37 use super::{fpu::FpState, interrupt::TrapFrame, syscall::X86_64GSData, CurrentIrqArch};
38 
39 pub mod idle;
40 pub mod kthread;
41 pub mod syscall;
42 pub mod table;
43 
extern "C" {
    /// Return-from-interrupt entry point, defined outside of this file.
    ///
    /// Its address is only ever used as a saved `rip` here: newly forked user tasks and
    /// `arch_switch_to_user` resume execution at this symbol with `rsp` pointing at a
    /// prepared `TrapFrame`.
    fn ret_from_intr();
}
48 
/// Backing storage for an idle kernel stack: 32768 bytes, aligned to 32768 bytes.
#[allow(dead_code)]
#[repr(align(32768))]
union InitProcUnion {
    /// Holds the kernel stack of the idle process.
    idle_stack: [u8; 32768],
}
55 
56 #[link_section = ".data.init_proc_union"]
57 #[no_mangle]
58 static BSP_IDLE_STACK_SPACE: InitProcUnion = InitProcUnion {
59     idle_stack: [0; 32768],
60 };
61 
62 /// PCB中与架构相关的信息
63 #[derive(Debug)]
64 #[allow(dead_code)]
65 pub struct ArchPCBInfo {
66     rflags: usize,
67     rbx: usize,
68     r12: usize,
69     r13: usize,
70     r14: usize,
71     r15: usize,
72     rbp: usize,
73     rsp: usize,
74     rip: usize,
75     cr2: usize,
76     fsbase: usize,
77     gsbase: usize,
78     fs: SegmentSelector,
79     gs: SegmentSelector,
80     /// 存储PCB系统调用栈以及在syscall过程中暂存用户态rsp的结构体
81     gsdata: X86_64GSData,
82     /// 浮点寄存器的状态
83     fp_state: Option<FpState>,
84 }
85 
86 #[allow(dead_code)]
87 impl ArchPCBInfo {
88     /// 创建一个新的ArchPCBInfo
89     ///
90     /// ## 参数
91     ///
92     /// - `kstack`:内核栈的引用,如果为None,则不会设置rsp和rbp。如果为Some,则会设置rsp和rbp为内核栈的最高地址。
93     ///
94     /// ## 返回值
95     ///
96     /// 返回一个新的ArchPCBInfo
97     #[inline(never)]
98     pub fn new(kstack: &KernelStack) -> Self {
99         let mut r = Self {
100             rflags: 0,
101             rbx: 0,
102             r12: 0,
103             r13: 0,
104             r14: 0,
105             r15: 0,
106             rbp: 0,
107             rsp: 0,
108             rip: 0,
109             cr2: 0,
110             fsbase: 0,
111             gsbase: 0,
112             gsdata: X86_64GSData {
113                 kaddr: VirtAddr::new(0),
114                 uaddr: VirtAddr::new(0),
115             },
116             fs: KERNEL_DS,
117             gs: KERNEL_DS,
118             fp_state: None,
119         };
120 
121         r.rsp = kstack.stack_max_address().data() - 8;
122         r.rbp = kstack.stack_max_address().data();
123 
124         return r;
125     }
126 
127     pub fn set_stack(&mut self, stack: VirtAddr) {
128         self.rsp = stack.data();
129     }
130 
131     pub fn set_stack_base(&mut self, stack_base: VirtAddr) {
132         self.rbp = stack_base.data();
133     }
134 
135     pub fn rbp(&self) -> usize {
136         self.rbp
137     }
138 
139     pub unsafe fn push_to_stack(&mut self, value: usize) {
140         self.rsp -= core::mem::size_of::<usize>();
141         *(self.rsp as *mut usize) = value;
142     }
143 
144     pub unsafe fn pop_from_stack(&mut self) -> usize {
145         let value = *(self.rsp as *const usize);
146         self.rsp += core::mem::size_of::<usize>();
147         value
148     }
149 
150     pub fn save_fp_state(&mut self) {
151         if self.fp_state.is_none() {
152             self.fp_state = Some(FpState::new());
153         }
154 
155         self.fp_state.as_mut().unwrap().save();
156     }
157 
158     pub fn restore_fp_state(&mut self) {
159         if unlikely(self.fp_state.is_none()) {
160             return;
161         }
162 
163         self.fp_state.as_mut().unwrap().restore();
164     }
165 
166     /// 返回浮点寄存器结构体的副本
167     pub fn fp_state(&self) -> &Option<FpState> {
168         &self.fp_state
169     }
170 
171     // 清空浮点寄存器
172     pub fn clear_fp_state(&mut self) {
173         if unlikely(self.fp_state.is_none()) {
174             kwarn!("fp_state is none");
175             return;
176         }
177 
178         self.fp_state.as_mut().unwrap().clear();
179     }
180     pub unsafe fn save_fsbase(&mut self) {
181         if x86::controlregs::cr4().contains(Cr4::CR4_ENABLE_FSGSBASE) {
182             self.fsbase = x86::current::segmentation::rdfsbase() as usize;
183         } else {
184             self.fsbase = x86::msr::rdmsr(x86::msr::IA32_FS_BASE) as usize;
185         }
186     }
187 
188     pub unsafe fn save_gsbase(&mut self) {
189         if x86::controlregs::cr4().contains(Cr4::CR4_ENABLE_FSGSBASE) {
190             self.gsbase = x86::current::segmentation::rdgsbase() as usize;
191         } else {
192             self.gsbase = x86::msr::rdmsr(x86::msr::IA32_GS_BASE) as usize;
193         }
194     }
195 
196     pub unsafe fn restore_fsbase(&mut self) {
197         if x86::controlregs::cr4().contains(Cr4::CR4_ENABLE_FSGSBASE) {
198             x86::current::segmentation::wrfsbase(self.fsbase as u64);
199         } else {
200             x86::msr::wrmsr(x86::msr::IA32_FS_BASE, self.fsbase as u64);
201         }
202     }
203 
204     pub unsafe fn restore_gsbase(&mut self) {
205         if x86::controlregs::cr4().contains(Cr4::CR4_ENABLE_FSGSBASE) {
206             x86::current::segmentation::wrgsbase(self.gsbase as u64);
207         } else {
208             x86::msr::wrmsr(x86::msr::IA32_GS_BASE, self.gsbase as u64);
209         }
210     }
211 
212     /// 将gsdata写入KernelGsbase寄存器
213     pub unsafe fn store_kernel_gsbase(&self) {
214         x86::msr::wrmsr(
215             x86::msr::IA32_KERNEL_GSBASE,
216             &self.gsdata as *const X86_64GSData as u64,
217         );
218     }
219 
220     /// ### 初始化系统调用栈,不得与PCB内核栈冲突(即传入的应该是一个新的栈,避免栈损坏)
221     pub fn init_syscall_stack(&mut self, stack: &KernelStack) {
222         self.gsdata.set_kstack(stack.stack_max_address() - 8);
223     }
224 
225     pub fn fsbase(&self) -> usize {
226         self.fsbase
227     }
228 
229     pub fn gsbase(&self) -> usize {
230         self.gsbase
231     }
232 
233     pub fn cr2_mut(&mut self) -> &mut usize {
234         &mut self.cr2
235     }
236 
237     pub fn fp_state_mut(&mut self) -> &mut Option<FpState> {
238         &mut self.fp_state
239     }
240 
241     /// ### 克隆ArchPCBInfo,需要注意gsdata也是对应clone的
242     pub fn clone_all(&self) -> Self {
243         Self {
244             rflags: self.rflags,
245             rbx: self.rbx,
246             r12: self.r12,
247             r13: self.r13,
248             r14: self.r14,
249             r15: self.r15,
250             rbp: self.rbp,
251             rsp: self.rsp,
252             rip: self.rip,
253             cr2: self.cr2,
254             fsbase: self.fsbase,
255             gsbase: self.gsbase,
256             fs: self.fs,
257             gs: self.gs,
258             gsdata: self.gsdata.clone(),
259             fp_state: self.fp_state,
260         }
261     }
262 
263     // ### 从另一个ArchPCBInfo处clone,gsdata会被保留
264     pub fn clone_from(&mut self, from: &Self) {
265         let gsdata = self.gsdata.clone();
266         *self = from.clone_all();
267         self.gsdata = gsdata;
268     }
269 }
270 
271 impl ProcessControlBlock {
272     /// 获取当前进程的pcb
273     pub fn arch_current_pcb() -> Arc<Self> {
274         // 获取栈指针
275         let ptr = VirtAddr::new(x86::current::registers::rsp() as usize);
276 
277         let stack_base = VirtAddr::new(ptr.data() & (!(KernelStack::ALIGN - 1)));
278 
279         // 从内核栈的最低地址处取出pcb的地址
280         let p = stack_base.data() as *const *const ProcessControlBlock;
281         if unlikely((unsafe { *p }).is_null()) {
282             kerror!("p={:p}", p);
283             panic!("current_pcb is null");
284         }
285         unsafe {
286             // 为了防止内核栈的pcb weak 指针被释放,这里需要将其包装一下
287             let weak_wrapper: ManuallyDrop<Weak<ProcessControlBlock>> =
288                 ManuallyDrop::new(Weak::from_raw(*p));
289 
290             let new_arc: Arc<ProcessControlBlock> = weak_wrapper.upgrade().unwrap();
291             return new_arc;
292         }
293     }
294 }
295 
296 impl ProcessManager {
297     pub fn arch_init() {
298         // do nothing
299     }
300     /// fork的过程中复制线程
301     ///
302     /// 由于这个过程与具体的架构相关,所以放在这里
303     pub fn copy_thread(
304         current_pcb: &Arc<ProcessControlBlock>,
305         new_pcb: &Arc<ProcessControlBlock>,
306         clone_args: KernelCloneArgs,
307         current_trapframe: &TrapFrame,
308     ) -> Result<(), SystemError> {
309         let clone_flags = clone_args.flags;
310         let mut child_trapframe = *current_trapframe;
311 
312         // 子进程的返回值为0
313         child_trapframe.set_return_value(0);
314 
315         // 设置子进程的栈基址(开始执行中断返回流程时的栈基址)
316         let mut new_arch_guard = unsafe { new_pcb.arch_info() };
317         let kernel_stack_guard = new_pcb.kernel_stack();
318 
319         // 设置子进程在内核态开始执行时的rsp、rbp
320         new_arch_guard.set_stack_base(kernel_stack_guard.stack_max_address());
321 
322         let trap_frame_vaddr: VirtAddr =
323             kernel_stack_guard.stack_max_address() - core::mem::size_of::<TrapFrame>();
324         new_arch_guard.set_stack(trap_frame_vaddr);
325 
326         // 拷贝栈帧
327         unsafe {
328             let usp = clone_args.stack;
329             if usp != 0 {
330                 child_trapframe.rsp = usp as u64;
331             }
332             let trap_frame_ptr = trap_frame_vaddr.data() as *mut TrapFrame;
333             *trap_frame_ptr = child_trapframe;
334         }
335 
336         let current_arch_guard = current_pcb.arch_info_irqsave();
337         new_arch_guard.fsbase = current_arch_guard.fsbase;
338         new_arch_guard.gsbase = current_arch_guard.gsbase;
339         new_arch_guard.fs = current_arch_guard.fs;
340         new_arch_guard.gs = current_arch_guard.gs;
341         new_arch_guard.fp_state = current_arch_guard.fp_state;
342 
343         // 拷贝浮点寄存器的状态
344         if let Some(fp_state) = current_arch_guard.fp_state.as_ref() {
345             new_arch_guard.fp_state = Some(*fp_state);
346         }
347         drop(current_arch_guard);
348 
349         // 设置返回地址(子进程开始执行的指令地址)
350         if new_pcb.flags().contains(ProcessFlags::KTHREAD) {
351             let kthread_bootstrap_stage1_func_addr = kernel_thread_bootstrap_stage1 as usize;
352             new_arch_guard.rip = kthread_bootstrap_stage1_func_addr;
353         } else {
354             new_arch_guard.rip = ret_from_intr as usize;
355         }
356 
357         // 设置tls
358         if clone_flags.contains(CloneFlags::CLONE_SETTLS) {
359             drop(new_arch_guard);
360             Syscall::do_arch_prctl_64(new_pcb, ARCH_SET_FS, clone_args.tls, true)?;
361         }
362 
363         return Ok(());
364     }
365 
366     /// 切换进程
367     ///
368     /// ## 参数
369     ///
370     /// - `prev`:上一个进程的pcb
371     /// - `next`:下一个进程的pcb
372     pub unsafe fn switch_process(prev: Arc<ProcessControlBlock>, next: Arc<ProcessControlBlock>) {
373         assert!(!CurrentIrqArch::is_irq_enabled());
374 
375         // 保存浮点寄存器
376         prev.arch_info_irqsave().save_fp_state();
377         // 切换浮点寄存器
378         next.arch_info_irqsave().restore_fp_state();
379 
380         // 切换fsbase
381         prev.arch_info_irqsave().save_fsbase();
382         next.arch_info_irqsave().restore_fsbase();
383 
384         // 切换gsbase
385         Self::switch_gsbase(&prev, &next);
386 
387         // 切换地址空间
388         let next_addr_space = next.basic().user_vm().as_ref().unwrap().clone();
389         compiler_fence(Ordering::SeqCst);
390 
391         next_addr_space.read().user_mapper.utable.make_current();
392         drop(next_addr_space);
393         compiler_fence(Ordering::SeqCst);
394         // 切换内核栈
395 
396         // 获取arch info的锁,并强制泄露其守卫(切换上下文后,在switch_finish_hook中会释放锁)
397         let next_arch = SpinLockGuard::leak(next.arch_info_irqsave()) as *mut ArchPCBInfo;
398         let prev_arch = SpinLockGuard::leak(prev.arch_info_irqsave()) as *mut ArchPCBInfo;
399 
400         (*prev_arch).rip = switch_back as usize;
401 
402         // 恢复当前的 preempt count*2
403         ProcessManager::current_pcb().preempt_enable();
404         ProcessManager::current_pcb().preempt_enable();
405 
406         // 切换tss
407         TSSManager::current_tss().set_rsp(
408             x86::Ring::Ring0,
409             next.kernel_stack().stack_max_address().data() as u64,
410         );
411         PROCESS_SWITCH_RESULT.as_mut().unwrap().get_mut().prev_pcb = Some(prev);
412         PROCESS_SWITCH_RESULT.as_mut().unwrap().get_mut().next_pcb = Some(next);
413         // kdebug!("switch tss ok");
414         compiler_fence(Ordering::SeqCst);
415         // 正式切换上下文
416         switch_to_inner(prev_arch, next_arch);
417     }
418 
419     unsafe fn switch_gsbase(prev: &Arc<ProcessControlBlock>, next: &Arc<ProcessControlBlock>) {
420         asm!("swapgs", options(nostack, preserves_flags));
421         prev.arch_info_irqsave().save_gsbase();
422         next.arch_info_irqsave().restore_gsbase();
423         // 将下一个进程的kstack写入kernel_gsbase
424         next.arch_info_irqsave().store_kernel_gsbase();
425         asm!("swapgs", options(nostack, preserves_flags));
426     }
427 }
428 
429 /// 保存上下文,然后切换进程,接着jmp到`switch_finish_hook`钩子函数
430 #[naked]
431 unsafe extern "sysv64" fn switch_to_inner(prev: *mut ArchPCBInfo, next: *mut ArchPCBInfo) {
432     asm!(
433         // As a quick reminder for those who are unfamiliar with the System V ABI (extern "C"):
434         //
435         // - the current parameters are passed in the registers `rdi`, `rsi`,
436         // - we can modify scratch registers, e.g. rax
437         // - we cannot change callee-preserved registers arbitrarily, e.g. rbx, which is why we
438         //   store them here in the first place.
439         concat!("
440         // Save old registers, and load new ones
441         mov [rdi + {off_rbx}], rbx
442         mov rbx, [rsi + {off_rbx}]
443 
444         mov [rdi + {off_r12}], r12
445         mov r12, [rsi + {off_r12}]
446 
447         mov [rdi + {off_r13}], r13
448         mov r13, [rsi + {off_r13}]
449 
450         mov [rdi + {off_r14}], r14
451         mov r14, [rsi + {off_r14}]
452 
453         mov [rdi + {off_r15}], r15
454         mov r15, [rsi + {off_r15}]
455 
456         // switch segment registers (这些寄存器只能通过接下来的switch_hook的return来切换)
457         mov [rdi + {off_fs}], fs
458         mov [rdi + {off_gs}], gs
459 
460         // mov fs, [rsi + {off_fs}]
461         // mov gs, [rsi + {off_gs}]
462 
463         push rbp
464         push rax
465 
466         mov [rdi + {off_rbp}], rbp
467         mov rbp, [rsi + {off_rbp}]
468 
469         mov [rdi + {off_rsp}], rsp
470         mov rsp, [rsi + {off_rsp}]
471 
472         // // push RFLAGS (can only be modified via stack)
473         pushfq
474         // // pop RFLAGS into `self.rflags`
475         pop QWORD PTR [rdi + {off_rflags}]
476 
477         // // push `next.rflags`
478         push QWORD PTR [rsi + {off_rflags}]
479         // // pop into RFLAGS
480         popfq
481 
482         // push next rip to stack
483         push QWORD PTR [rsi + {off_rip}]
484 
485 
486         // When we return, we cannot even guarantee that the return address on the stack, points to
487         // the calling function. Thus, we have to execute this Rust hook by
488         // ourselves, which will unlock the contexts before the later switch.
489 
490         // Note that switch_finish_hook will be responsible for executing `ret`.
491         jmp {switch_hook}
492         "),
493 
494         off_rflags = const(offset_of!(ArchPCBInfo, rflags)),
495 
496         off_rbx = const(offset_of!(ArchPCBInfo, rbx)),
497         off_r12 = const(offset_of!(ArchPCBInfo, r12)),
498         off_r13 = const(offset_of!(ArchPCBInfo, r13)),
499         off_r14 = const(offset_of!(ArchPCBInfo, r14)),
500         off_rbp = const(offset_of!(ArchPCBInfo, rbp)),
501         off_rsp = const(offset_of!(ArchPCBInfo, rsp)),
502         off_r15 = const(offset_of!(ArchPCBInfo, r15)),
503         off_rip = const(offset_of!(ArchPCBInfo, rip)),
504         off_fs = const(offset_of!(ArchPCBInfo, fs)),
505         off_gs = const(offset_of!(ArchPCBInfo, gs)),
506 
507         switch_hook = sym crate::process::switch_finish_hook,
508         options(noreturn),
509     );
510 }
511 
512 /// 从`switch_to_inner`返回后,执行这个函数
513 ///
514 /// 也就是说,当进程再次被调度时,会从这里开始执行
515 #[inline(never)]
516 unsafe extern "sysv64" fn switch_back() {
517     asm!(concat!(
518         "
519         pop rax
520         pop rbp
521         "
522     ))
523 }
524 
525 pub unsafe fn arch_switch_to_user(path: String, argv: Vec<String>, envp: Vec<String>) -> ! {
526     // 以下代码不能发生中断
527     CurrentIrqArch::interrupt_disable();
528 
529     let current_pcb = ProcessManager::current_pcb();
530     let trap_frame_vaddr = VirtAddr::new(
531         current_pcb.kernel_stack().stack_max_address().data() - core::mem::size_of::<TrapFrame>(),
532     );
533     // kdebug!("trap_frame_vaddr: {:?}", trap_frame_vaddr);
534     let new_rip = VirtAddr::new(ret_from_intr as usize);
535 
536     assert!(
537         (x86::current::registers::rsp() as usize) < trap_frame_vaddr.data(),
538         "arch_switch_to_user(): current_rsp >= fake trap
539         frame vaddr, this may cause some illegal access to memory!
540         rsp: {:#x}, trap_frame_vaddr: {:#x}",
541         x86::current::registers::rsp() as usize,
542         trap_frame_vaddr.data()
543     );
544 
545     let mut arch_guard = current_pcb.arch_info_irqsave();
546     arch_guard.rsp = trap_frame_vaddr.data();
547 
548     arch_guard.fs = USER_DS;
549     arch_guard.gs = USER_DS;
550 
551     // 将内核gs数据压进cpu
552     arch_guard.store_kernel_gsbase();
553 
554     switch_fs_and_gs(
555         SegmentSelector::from_bits_truncate(arch_guard.fs.bits()),
556         SegmentSelector::from_bits_truncate(arch_guard.gs.bits()),
557     );
558     arch_guard.rip = new_rip.data();
559 
560     drop(arch_guard);
561 
562     // 删除kthread的标志
563     current_pcb.flags().remove(ProcessFlags::KTHREAD);
564     current_pcb.worker_private().take();
565 
566     let mut trap_frame = TrapFrame::new();
567 
568     compiler_fence(Ordering::SeqCst);
569     Syscall::do_execve(path, argv, envp, &mut trap_frame).unwrap_or_else(|e| {
570         panic!(
571             "arch_switch_to_user(): pid: {pid:?}, Failed to execve: , error: {e:?}",
572             pid = current_pcb.pid(),
573             e = e
574         );
575     });
576     compiler_fence(Ordering::SeqCst);
577 
578     // 重要!在这里之后,一定要保证上面的引用计数变量、动态申请的变量、锁的守卫都被drop了,否则可能导致内存安全问题!
579 
580     drop(current_pcb);
581 
582     compiler_fence(Ordering::SeqCst);
583     ready_to_switch_to_user(trap_frame, trap_frame_vaddr.data(), new_rip.data());
584 }
585 
586 /// 由于需要依赖ret来切换到用户态,所以不能inline
587 #[inline(never)]
588 unsafe extern "sysv64" fn ready_to_switch_to_user(
589     trap_frame: TrapFrame,
590     trapframe_vaddr: usize,
591     new_rip: usize,
592 ) -> ! {
593     *(trapframe_vaddr as *mut TrapFrame) = trap_frame;
594     asm!(
595         "swapgs",
596         "mov rsp, {trapframe_vaddr}",
597         "push {new_rip}",
598         "ret",
599         trapframe_vaddr = in(reg) trapframe_vaddr,
600         new_rip = in(reg) new_rip
601     );
602     unreachable!()
603 }
604