xref: /DragonOS/kernel/src/arch/x86_64/process/mod.rs (revision 1ea2daad8121b77ed704e6d7c3a09f478147441d)
1 use core::{
2     arch::asm,
3     intrinsics::unlikely,
4     mem::ManuallyDrop,
5     sync::atomic::{compiler_fence, Ordering},
6 };
7 
8 use alloc::sync::{Arc, Weak};
9 
10 use kdepends::memoffset::offset_of;
11 use log::{error, warn};
12 use system_error::SystemError;
13 use x86::{controlregs::Cr4, segmentation::SegmentSelector};
14 
15 use crate::{
16     arch::process::table::TSSManager,
17     exception::InterruptArch,
18     libs::spinlock::SpinLockGuard,
19     mm::VirtAddr,
20     process::{
21         fork::{CloneFlags, KernelCloneArgs},
22         KernelStack, ProcessControlBlock, ProcessFlags, ProcessManager, PROCESS_SWITCH_RESULT,
23     },
24     syscall::Syscall,
25 };
26 
27 use self::{
28     kthread::kernel_thread_bootstrap_stage1,
29     syscall::ARCH_SET_FS,
30     table::{switch_fs_and_gs, KERNEL_DS, USER_DS},
31 };
32 
33 use super::{fpu::FpState, interrupt::TrapFrame, syscall::X86_64GSData, CurrentIrqArch};
34 
35 pub mod idle;
36 pub mod kthread;
37 pub mod syscall;
38 pub mod table;
39 
extern "C" {
    /// Return-from-interrupt entry point, defined outside this file.
    ///
    /// This file only uses its address: it is stored as the saved `rip` of a task that
    /// should resume by consuming a `TrapFrame` placed on its kernel stack
    /// (see `copy_thread` and `arch_switch_to_user`).
    fn ret_from_intr();
}
44 
/// Storage type for the idle process's kernel stack: 32768 bytes, aligned to 32768 bytes.
// NOTE(review): the size/alignment presumably has to match `KernelStack::ALIGN` so that
// masking `rsp` finds the stack base — confirm against `KernelStack`.
#[allow(dead_code)]
#[repr(align(32768))]
union InitProcUnion {
    /// Holds the kernel stack of the idle process.
    idle_stack: [u8; 32768],
}
51 
/// Zero-initialized idle-stack storage, placed in the `.data.init_proc_union` section and
/// exported under its unmangled name.
// NOTE(review): judging by the name this is the idle stack of the bootstrap processor and is
// referenced from outside Rust (linker script / early boot code) — confirm.
#[link_section = ".data.init_proc_union"]
#[no_mangle]
static BSP_IDLE_STACK_SPACE: InitProcUnion = InitProcUnion {
    idle_stack: [0; 32768],
};
57 
/// Architecture-specific part of a process control block (PCB).
///
/// The register slots are read and written by `switch_to_inner` through `offset_of!`
/// constants, so the assembly does not depend on the declaration order of the fields.
#[derive(Debug)]
#[allow(dead_code)]
pub struct ArchPCBInfo {
    /// Saved RFLAGS; stored and reloaded by `switch_to_inner`.
    rflags: usize,
    /// Saved callee-preserved register `rbx`.
    rbx: usize,
    /// Saved callee-preserved register `r12`.
    r12: usize,
    /// Saved callee-preserved register `r13`.
    r13: usize,
    /// Saved callee-preserved register `r14`.
    r14: usize,
    /// Saved callee-preserved register `r15`.
    r15: usize,
    /// Saved frame pointer (stack base).
    rbp: usize,
    /// Saved kernel stack pointer.
    rsp: usize,
    /// Address at which the task resumes; `switch_to_inner` pushes it as a return address.
    rip: usize,
    /// Slot for a CR2 value; in this file it is only exposed through `cr2_mut`.
    cr2: usize,
    /// Saved FS base address (see `save_fsbase` / `restore_fsbase`).
    fsbase: usize,
    /// Saved GS base address (see `save_gsbase` / `restore_gsbase`).
    gsbase: usize,
    /// Saved FS segment selector.
    fs: SegmentSelector,
    /// Saved GS segment selector.
    gs: SegmentSelector,
    /// Holds the PCB's syscall stack and temporarily stores the user-mode rsp during a
    /// syscall. Its address is what `store_kernel_gsbase` writes to `IA32_KERNEL_GSBASE`.
    gsdata: X86_64GSData,
    /// State of the floating-point registers; `None` until `save_fp_state` first runs.
    fp_state: Option<FpState>,
}
81 
82 #[allow(dead_code)]
83 impl ArchPCBInfo {
84     /// 创建一个新的ArchPCBInfo
85     ///
86     /// ## 参数
87     ///
88     /// - `kstack`:内核栈的引用,如果为None,则不会设置rsp和rbp。如果为Some,则会设置rsp和rbp为内核栈的最高地址。
89     ///
90     /// ## 返回值
91     ///
92     /// 返回一个新的ArchPCBInfo
93     #[inline(never)]
94     pub fn new(kstack: &KernelStack) -> Self {
95         let mut r = Self {
96             rflags: 0,
97             rbx: 0,
98             r12: 0,
99             r13: 0,
100             r14: 0,
101             r15: 0,
102             rbp: 0,
103             rsp: 0,
104             rip: 0,
105             cr2: 0,
106             fsbase: 0,
107             gsbase: 0,
108             gsdata: X86_64GSData {
109                 kaddr: VirtAddr::new(0),
110                 uaddr: VirtAddr::new(0),
111             },
112             fs: KERNEL_DS,
113             gs: KERNEL_DS,
114             fp_state: None,
115         };
116 
117         r.rsp = kstack.stack_max_address().data() - 8;
118         r.rbp = kstack.stack_max_address().data();
119 
120         return r;
121     }
122 
123     pub fn set_stack(&mut self, stack: VirtAddr) {
124         self.rsp = stack.data();
125     }
126 
127     pub fn set_stack_base(&mut self, stack_base: VirtAddr) {
128         self.rbp = stack_base.data();
129     }
130 
131     pub fn rbp(&self) -> usize {
132         self.rbp
133     }
134 
135     pub unsafe fn push_to_stack(&mut self, value: usize) {
136         self.rsp -= core::mem::size_of::<usize>();
137         *(self.rsp as *mut usize) = value;
138     }
139 
140     pub unsafe fn pop_from_stack(&mut self) -> usize {
141         let value = *(self.rsp as *const usize);
142         self.rsp += core::mem::size_of::<usize>();
143         value
144     }
145 
146     pub fn save_fp_state(&mut self) {
147         if self.fp_state.is_none() {
148             self.fp_state = Some(FpState::new());
149         }
150 
151         self.fp_state.as_mut().unwrap().save();
152     }
153 
154     pub fn restore_fp_state(&mut self) {
155         if unlikely(self.fp_state.is_none()) {
156             return;
157         }
158 
159         self.fp_state.as_mut().unwrap().restore();
160     }
161 
162     /// 返回浮点寄存器结构体的副本
163     pub fn fp_state(&self) -> &Option<FpState> {
164         &self.fp_state
165     }
166 
167     // 清空浮点寄存器
168     pub fn clear_fp_state(&mut self) {
169         if unlikely(self.fp_state.is_none()) {
170             warn!("fp_state is none");
171             return;
172         }
173 
174         self.fp_state.as_mut().unwrap().clear();
175     }
176     pub unsafe fn save_fsbase(&mut self) {
177         if x86::controlregs::cr4().contains(Cr4::CR4_ENABLE_FSGSBASE) {
178             self.fsbase = x86::current::segmentation::rdfsbase() as usize;
179         } else {
180             self.fsbase = x86::msr::rdmsr(x86::msr::IA32_FS_BASE) as usize;
181         }
182     }
183 
184     pub unsafe fn save_gsbase(&mut self) {
185         if x86::controlregs::cr4().contains(Cr4::CR4_ENABLE_FSGSBASE) {
186             self.gsbase = x86::current::segmentation::rdgsbase() as usize;
187         } else {
188             self.gsbase = x86::msr::rdmsr(x86::msr::IA32_GS_BASE) as usize;
189         }
190     }
191 
192     pub unsafe fn restore_fsbase(&mut self) {
193         if x86::controlregs::cr4().contains(Cr4::CR4_ENABLE_FSGSBASE) {
194             x86::current::segmentation::wrfsbase(self.fsbase as u64);
195         } else {
196             x86::msr::wrmsr(x86::msr::IA32_FS_BASE, self.fsbase as u64);
197         }
198     }
199 
200     pub unsafe fn restore_gsbase(&mut self) {
201         if x86::controlregs::cr4().contains(Cr4::CR4_ENABLE_FSGSBASE) {
202             x86::current::segmentation::wrgsbase(self.gsbase as u64);
203         } else {
204             x86::msr::wrmsr(x86::msr::IA32_GS_BASE, self.gsbase as u64);
205         }
206     }
207 
208     /// 将gsdata写入KernelGsbase寄存器
209     pub unsafe fn store_kernel_gsbase(&self) {
210         x86::msr::wrmsr(
211             x86::msr::IA32_KERNEL_GSBASE,
212             &self.gsdata as *const X86_64GSData as u64,
213         );
214     }
215 
216     /// ### 初始化系统调用栈,不得与PCB内核栈冲突(即传入的应该是一个新的栈,避免栈损坏)
217     pub fn init_syscall_stack(&mut self, stack: &KernelStack) {
218         self.gsdata.set_kstack(stack.stack_max_address() - 8);
219     }
220 
221     pub fn fsbase(&self) -> usize {
222         self.fsbase
223     }
224 
225     pub fn gsbase(&self) -> usize {
226         self.gsbase
227     }
228 
229     pub fn cr2_mut(&mut self) -> &mut usize {
230         &mut self.cr2
231     }
232 
233     pub fn fp_state_mut(&mut self) -> &mut Option<FpState> {
234         &mut self.fp_state
235     }
236 
237     /// ### 克隆ArchPCBInfo,需要注意gsdata也是对应clone的
238     pub fn clone_all(&self) -> Self {
239         Self {
240             rflags: self.rflags,
241             rbx: self.rbx,
242             r12: self.r12,
243             r13: self.r13,
244             r14: self.r14,
245             r15: self.r15,
246             rbp: self.rbp,
247             rsp: self.rsp,
248             rip: self.rip,
249             cr2: self.cr2,
250             fsbase: self.fsbase,
251             gsbase: self.gsbase,
252             fs: self.fs,
253             gs: self.gs,
254             gsdata: self.gsdata.clone(),
255             fp_state: self.fp_state,
256         }
257     }
258 
259     // ### 从另一个ArchPCBInfo处clone,gsdata会被保留
260     pub fn clone_from(&mut self, from: &Self) {
261         let gsdata = self.gsdata.clone();
262         *self = from.clone_all();
263         self.gsdata = gsdata;
264     }
265 }
266 
267 impl ProcessControlBlock {
268     /// 获取当前进程的pcb
269     pub fn arch_current_pcb() -> Arc<Self> {
270         // 获取栈指针
271         let ptr = VirtAddr::new(x86::current::registers::rsp() as usize);
272 
273         let stack_base = VirtAddr::new(ptr.data() & (!(KernelStack::ALIGN - 1)));
274 
275         // 从内核栈的最低地址处取出pcb的地址
276         let p = stack_base.data() as *const *const ProcessControlBlock;
277         if unlikely((unsafe { *p }).is_null()) {
278             error!("p={:p}", p);
279             panic!("current_pcb is null");
280         }
281         unsafe {
282             // 为了防止内核栈的pcb weak 指针被释放,这里需要将其包装一下
283             let weak_wrapper: ManuallyDrop<Weak<ProcessControlBlock>> =
284                 ManuallyDrop::new(Weak::from_raw(*p));
285 
286             let new_arc: Arc<ProcessControlBlock> = weak_wrapper.upgrade().unwrap();
287             return new_arc;
288         }
289     }
290 }
291 
292 impl ProcessManager {
293     pub fn arch_init() {
294         // do nothing
295     }
296     /// fork的过程中复制线程
297     ///
298     /// 由于这个过程与具体的架构相关,所以放在这里
299     pub fn copy_thread(
300         current_pcb: &Arc<ProcessControlBlock>,
301         new_pcb: &Arc<ProcessControlBlock>,
302         clone_args: KernelCloneArgs,
303         current_trapframe: &TrapFrame,
304     ) -> Result<(), SystemError> {
305         let clone_flags = clone_args.flags;
306         let mut child_trapframe = *current_trapframe;
307 
308         // 子进程的返回值为0
309         child_trapframe.set_return_value(0);
310 
311         // 设置子进程的栈基址(开始执行中断返回流程时的栈基址)
312         let mut new_arch_guard = unsafe { new_pcb.arch_info() };
313         let kernel_stack_guard = new_pcb.kernel_stack();
314 
315         // 设置子进程在内核态开始执行时的rsp、rbp
316         new_arch_guard.set_stack_base(kernel_stack_guard.stack_max_address());
317 
318         let trap_frame_vaddr: VirtAddr =
319             kernel_stack_guard.stack_max_address() - core::mem::size_of::<TrapFrame>();
320         new_arch_guard.set_stack(trap_frame_vaddr);
321 
322         // 拷贝栈帧
323         unsafe {
324             let usp = clone_args.stack;
325             if usp != 0 {
326                 child_trapframe.rsp = usp as u64;
327             }
328             let trap_frame_ptr = trap_frame_vaddr.data() as *mut TrapFrame;
329             *trap_frame_ptr = child_trapframe;
330         }
331 
332         let current_arch_guard = current_pcb.arch_info_irqsave();
333         new_arch_guard.fsbase = current_arch_guard.fsbase;
334         new_arch_guard.gsbase = current_arch_guard.gsbase;
335         new_arch_guard.fs = current_arch_guard.fs;
336         new_arch_guard.gs = current_arch_guard.gs;
337         new_arch_guard.fp_state = current_arch_guard.fp_state;
338 
339         // 拷贝浮点寄存器的状态
340         if let Some(fp_state) = current_arch_guard.fp_state.as_ref() {
341             new_arch_guard.fp_state = Some(*fp_state);
342         }
343         drop(current_arch_guard);
344 
345         // 设置返回地址(子进程开始执行的指令地址)
346         if new_pcb.flags().contains(ProcessFlags::KTHREAD) {
347             let kthread_bootstrap_stage1_func_addr = kernel_thread_bootstrap_stage1 as usize;
348             new_arch_guard.rip = kthread_bootstrap_stage1_func_addr;
349         } else {
350             new_arch_guard.rip = ret_from_intr as usize;
351         }
352 
353         // 设置tls
354         if clone_flags.contains(CloneFlags::CLONE_SETTLS) {
355             drop(new_arch_guard);
356             Syscall::do_arch_prctl_64(new_pcb, ARCH_SET_FS, clone_args.tls, true)?;
357         }
358 
359         return Ok(());
360     }
361 
362     /// 切换进程
363     ///
364     /// ## 参数
365     ///
366     /// - `prev`:上一个进程的pcb
367     /// - `next`:下一个进程的pcb
368     pub unsafe fn switch_process(prev: Arc<ProcessControlBlock>, next: Arc<ProcessControlBlock>) {
369         assert!(!CurrentIrqArch::is_irq_enabled());
370 
371         // 保存浮点寄存器
372         prev.arch_info_irqsave().save_fp_state();
373         // 切换浮点寄存器
374         next.arch_info_irqsave().restore_fp_state();
375 
376         // 切换fsbase
377         prev.arch_info_irqsave().save_fsbase();
378         next.arch_info_irqsave().restore_fsbase();
379 
380         // 切换gsbase
381         Self::switch_gsbase(&prev, &next);
382 
383         // 切换地址空间
384         let next_addr_space = next.basic().user_vm().as_ref().unwrap().clone();
385         compiler_fence(Ordering::SeqCst);
386 
387         next_addr_space.read().user_mapper.utable.make_current();
388         drop(next_addr_space);
389         compiler_fence(Ordering::SeqCst);
390         // 切换内核栈
391 
392         // 获取arch info的锁,并强制泄露其守卫(切换上下文后,在switch_finish_hook中会释放锁)
393         let next_arch = SpinLockGuard::leak(next.arch_info_irqsave()) as *mut ArchPCBInfo;
394         let prev_arch = SpinLockGuard::leak(prev.arch_info_irqsave()) as *mut ArchPCBInfo;
395 
396         (*prev_arch).rip = switch_back as usize;
397 
398         // 恢复当前的 preempt count*2
399         ProcessManager::current_pcb().preempt_enable();
400         ProcessManager::current_pcb().preempt_enable();
401 
402         // 切换tss
403         TSSManager::current_tss().set_rsp(
404             x86::Ring::Ring0,
405             next.kernel_stack().stack_max_address().data() as u64,
406         );
407         PROCESS_SWITCH_RESULT.as_mut().unwrap().get_mut().prev_pcb = Some(prev);
408         PROCESS_SWITCH_RESULT.as_mut().unwrap().get_mut().next_pcb = Some(next);
409         // debug!("switch tss ok");
410         compiler_fence(Ordering::SeqCst);
411         // 正式切换上下文
412         switch_to_inner(prev_arch, next_arch);
413     }
414 
415     unsafe fn switch_gsbase(prev: &Arc<ProcessControlBlock>, next: &Arc<ProcessControlBlock>) {
416         asm!("swapgs", options(nostack, preserves_flags));
417         prev.arch_info_irqsave().save_gsbase();
418         next.arch_info_irqsave().restore_gsbase();
419         // 将下一个进程的kstack写入kernel_gsbase
420         next.arch_info_irqsave().store_kernel_gsbase();
421         asm!("swapgs", options(nostack, preserves_flags));
422     }
423 }
424 
/// Saves the context of `prev`, loads the context of `next`, then jumps to the
/// `switch_finish_hook` hook function.
///
/// In order, the assembly:
/// 1. stores `rbx`, `r12`-`r15` into `prev` and loads them from `next`;
/// 2. stores the `fs`/`gs` selectors into `prev` (loading them is commented out);
/// 3. stores `rbp`/`rsp` into `prev` and loads them from `next`, which switches stacks;
/// 4. stores RFLAGS into `prev` and loads RFLAGS from `next`, using the new stack;
/// 5. pushes `next.rip` so that the `ret` executed by `switch_finish_hook` jumps to it.
///
/// `prev` arrives in `rdi` and `next` in `rsi`. Field offsets come from `offset_of!`.
#[naked]
unsafe extern "sysv64" fn switch_to_inner(prev: *mut ArchPCBInfo, next: *mut ArchPCBInfo) {
    asm!(
        // As a quick reminder for those who are unfamiliar with the System V ABI
        // (this function is declared `extern "sysv64"`):
        //
        // - the current parameters are passed in the registers `rdi`, `rsi`,
        // - we can modify scratch registers, e.g. rax
        // - we cannot change callee-preserved registers arbitrarily, e.g. rbx, which is why we
        //   store them here in the first place.
        concat!("
        // Save old registers, and load new ones
        mov [rdi + {off_rbx}], rbx
        mov rbx, [rsi + {off_rbx}]

        mov [rdi + {off_r12}], r12
        mov r12, [rsi + {off_r12}]

        mov [rdi + {off_r13}], r13
        mov r13, [rsi + {off_r13}]

        mov [rdi + {off_r14}], r14
        mov r14, [rsi + {off_r14}]

        mov [rdi + {off_r15}], r15
        mov r15, [rsi + {off_r15}]

        // switch segment registers (这些寄存器只能通过接下来的switch_hook的return来切换)
        mov [rdi + {off_fs}], fs
        mov [rdi + {off_gs}], gs

        // mov fs, [rsi + {off_fs}]
        // mov gs, [rsi + {off_gs}]

        mov [rdi + {off_rbp}], rbp
        mov rbp, [rsi + {off_rbp}]

        mov [rdi + {off_rsp}], rsp
        mov rsp, [rsi + {off_rsp}]

        // // push RFLAGS (can only be modified via stack)
        pushfq
        // // pop RFLAGS into `self.rflags`
        pop QWORD PTR [rdi + {off_rflags}]

        // // push `next.rflags`
        push QWORD PTR [rsi + {off_rflags}]
        // // pop into RFLAGS
        popfq

        // push next rip to stack
        push QWORD PTR [rsi + {off_rip}]


        // When we return, we cannot even guarantee that the return address on the stack, points to
        // the calling function. Thus, we have to execute this Rust hook by
        // ourselves, which will unlock the contexts before the later switch.

        // Note that switch_finish_hook will be responsible for executing `ret`.
        jmp {switch_hook}
        "),

        // Byte offsets of the register slots inside `ArchPCBInfo`.
        off_rflags = const(offset_of!(ArchPCBInfo, rflags)),

        off_rbx = const(offset_of!(ArchPCBInfo, rbx)),
        off_r12 = const(offset_of!(ArchPCBInfo, r12)),
        off_r13 = const(offset_of!(ArchPCBInfo, r13)),
        off_r14 = const(offset_of!(ArchPCBInfo, r14)),
        off_rbp = const(offset_of!(ArchPCBInfo, rbp)),
        off_rsp = const(offset_of!(ArchPCBInfo, rsp)),
        off_r15 = const(offset_of!(ArchPCBInfo, r15)),
        off_rip = const(offset_of!(ArchPCBInfo, rip)),
        off_fs = const(offset_of!(ArchPCBInfo, fs)),
        off_gs = const(offset_of!(ArchPCBInfo, gs)),

        // The hook is reached with `jmp`, so its `ret` consumes the `rip` pushed above.
        switch_hook = sym crate::process::switch_finish_hook,
        options(noreturn),
    );
}
504 
/// Resume point of a process that was switched out by `switch_process`, which stores this
/// function's address as the saved `rip` of `prev`.
///
/// The body is a single `ret`, which pops the next return address from the restored stack.
// NOTE(review): that address is presumably the return address of the original call to
// `switch_to_inner`, so execution continues in `switch_process`'s caller chain — confirm.
#[naked]
unsafe extern "sysv64" fn switch_back() -> ! {
    asm!("ret", options(noreturn));
}
509 
/// Switches the current process to user mode using `trap_frame`; never returns.
///
/// The trap frame is placed at the top of the current kernel stack (by
/// `ready_to_switch_to_user`) and execution continues at `ret_from_intr`.
///
/// # Panics
///
/// Panics if the current `rsp` is not below the address where the trap frame will be
/// written, since writing the frame could then touch memory that is still in use.
///
/// # Safety
///
/// Because this function never returns, every reference-counted value, heap allocation and
/// lock guard must be dropped before the final call; nothing after it will be cleaned up.
pub unsafe fn arch_switch_to_user(trap_frame: TrapFrame) -> ! {
    // Interrupts must not occur in the code below.
    CurrentIrqArch::interrupt_disable();

    let current_pcb = ProcessManager::current_pcb();
    // The fake trap frame lives at the very top of the current kernel stack.
    let trap_frame_vaddr = VirtAddr::new(
        current_pcb.kernel_stack().stack_max_address().data() - core::mem::size_of::<TrapFrame>(),
    );
    // debug!("trap_frame_vaddr: {:?}", trap_frame_vaddr);

    assert!(
        (x86::current::registers::rsp() as usize) < trap_frame_vaddr.data(),
        "arch_switch_to_user(): current_rsp >= fake trap
        frame vaddr, this may cause some illegal access to memory!
        rsp: {:#x}, trap_frame_vaddr: {:#x}",
        x86::current::registers::rsp() as usize,
        trap_frame_vaddr.data()
    );

    let new_rip = VirtAddr::new(ret_from_intr as usize);
    let mut arch_guard = current_pcb.arch_info_irqsave();
    arch_guard.rsp = trap_frame_vaddr.data();

    // From now on the process uses the user data segment for fs and gs.
    arch_guard.fs = USER_DS;
    arch_guard.gs = USER_DS;

    // Load the kernel GS data into the CPU (writes IA32_KERNEL_GSBASE).
    arch_guard.store_kernel_gsbase();

    switch_fs_and_gs(
        SegmentSelector::from_bits_truncate(arch_guard.fs.bits()),
        SegmentSelector::from_bits_truncate(arch_guard.gs.bits()),
    );
    arch_guard.rip = new_rip.data();

    drop(arch_guard);

    drop(current_pcb);
    compiler_fence(Ordering::SeqCst);

    // Important: past this point, all reference-counted values, dynamically allocated values
    // and lock guards above must already have been dropped, otherwise memory-safety
    // problems may result!

    compiler_fence(Ordering::SeqCst);
    ready_to_switch_to_user(trap_frame, trap_frame_vaddr.data(), new_rip.data());
}
555 
/// Writes `trap_frame` to `trapframe_vaddr`, points `rsp` at it and "returns" to `new_rip`.
///
/// Must not be inlined, because it relies on `ret` to perform the switch to user mode.
///
/// ## Parameters
///
/// - `trap_frame`: the frame to place on the kernel stack
/// - `trapframe_vaddr`: address the frame is written to; also becomes the new `rsp`
/// - `new_rip`: address pushed on the stack and jumped to by `ret`
// NOTE(review): the asm block ends in `ret` and never falls through, yet it is not marked
// `options(noreturn)`; the trailing `unreachable!()` only satisfies the `!` return type —
// consider whether `noreturn` should be added.
#[inline(never)]
unsafe extern "sysv64" fn ready_to_switch_to_user(
    trap_frame: TrapFrame,
    trapframe_vaddr: usize,
    new_rip: usize,
) -> ! {
    // Place the trap frame where the code at `new_rip` is expected to find it.
    *(trapframe_vaddr as *mut TrapFrame) = trap_frame;
    compiler_fence(Ordering::SeqCst);
    asm!(
        // Exchange the active GS base with IA32_KERNEL_GSBASE.
        "swapgs",
        // Switch to the trap frame, then jump to `new_rip` via push + ret.
        "mov rsp, {trapframe_vaddr}",
        "push {new_rip}",
        "ret",
        trapframe_vaddr = in(reg) trapframe_vaddr,
        new_rip = in(reg) new_rip
    );
    unreachable!()
}
575 
576 // bitflags! {
577 //     pub struct ProcessThreadFlags: u32 {
578 //     /*
579 //     * thread information flags
580 //     * - these are process state flags that various assembly files
581 //     *   may need to access
582 //     */
583 //     const TIF_NOTIFY_RESUME	= 1 << 1;	/* callback before returning to user */
584 //     const TIF_SIGPENDING	=	1 << 2;	/* signal pending */
585 //     const TIF_NEED_RESCHED	= 1 << 3;	/* rescheduling necessary */
586 //     const TIF_SINGLESTEP	=	1 << 4;	/* reenable singlestep on user return*/
587 //     const TIF_SSBD		= 1 << 5;	/* Speculative store bypass disable */
588 //     const TIF_SPEC_IB		= 1 << 9;	/* Indirect branch speculation mitigation */
589 //     const TIF_SPEC_L1D_FLUSH	= 1 << 10;	/* Flush L1D on mm switches (processes) */
590 //     const TIF_USER_RETURN_NOTIFY	= 1 << 11;	/* notify kernel of userspace return */
591 //     const TIF_UPROBE		= 1 << 12;	/* breakpointed or singlestepping */
592 //     const TIF_PATCH_PENDING	= 1 << 13;	/* pending live patching update */
593 //     const TIF_NEED_FPU_LOAD	= 1 << 14;	/* load FPU on return to userspace */
594 //     const TIF_NOCPUID		= 1 << 15;	/* CPUID is not accessible in userland */
595 //     const TIF_NOTSC		= 1 << 16;	/* TSC is not accessible in userland */
596 //     const TIF_NOTIFY_SIGNAL	= 1 << 17;	/* signal notifications exist */
597 //     const TIF_MEMDIE		= 1 << 20;	/* is terminating due to OOM killer */
598 //     const TIF_POLLING_NRFLAG	= 1 << 21;	/* idle is polling for TIF_NEED_RESCHED */
599 //     const TIF_IO_BITMAP		= 1 << 22;	/* uses I/O bitmap */
600 //     const TIF_SPEC_FORCE_UPDATE	= 1 << 23;	/* Force speculation MSR update in context switch */
601 //     const TIF_FORCED_TF		= 1 << 24;	/* true if TF in eflags artificially */
602 //     const TIF_BLOCKSTEP		= 1 << 25;	/* set when we want DEBUGCTLMSR_BTF */
603 //     const TIF_LAZY_MMU_UPDATES	= 1 << 27;	/* task is updating the mmu lazily */
604 //     const TIF_ADDR32		= 1 << 29;	/* 32-bit address space on 64 bits */
605 //     }
606 // }
607