xref: /DragonOS/kernel/src/arch/x86_64/kvm/vmx/mmu.rs (revision 91e9d4ab55ef960f57a1b6287bc523ca4341f67a)
140314b30SXiaoye Zheng use crate::{
240314b30SXiaoye Zheng     arch::kvm::vmx::ept::EptMapper,
340314b30SXiaoye Zheng     kdebug,
440314b30SXiaoye Zheng     libs::mutex::Mutex,
540314b30SXiaoye Zheng     mm::{page::PageFlags, syscall::ProtFlags},
640314b30SXiaoye Zheng     virt::kvm::host_mem::{__gfn_to_pfn, kvm_vcpu_gfn_to_memslot, PAGE_MASK, PAGE_SHIFT},
740314b30SXiaoye Zheng };
840314b30SXiaoye Zheng use bitfield_struct::bitfield;
9*91e9d4abSLoGin use system_error::SystemError;
1040314b30SXiaoye Zheng 
1140314b30SXiaoye Zheng use super::{
1240314b30SXiaoye Zheng     ept::check_ept_features,
1340314b30SXiaoye Zheng     vcpu::VmxVcpu,
1440314b30SXiaoye Zheng     vmcs::VmcsFields,
1540314b30SXiaoye Zheng     vmx_asm_wrapper::{vmx_vmread, vmx_vmwrite},
1640314b30SXiaoye Zheng };
1740314b30SXiaoye Zheng use crate::arch::kvm::vmx::mmu::VmcsFields::CTRL_EPTP_PTR;
1840314b30SXiaoye Zheng 
1940314b30SXiaoye Zheng // pub const PT64_ROOT_LEVEL: u32 = 4;
2040314b30SXiaoye Zheng // pub const PT32_ROOT_LEVEL: u32 = 2;
2140314b30SXiaoye Zheng // pub const PT32E_ROOT_LEVEL: u32 = 3;
2240314b30SXiaoye Zheng 
2340314b30SXiaoye Zheng // pub struct KvmMmuPage{
2440314b30SXiaoye Zheng //     gfn: u64, // 管理地址范围的起始地址对应的 gfn
2540314b30SXiaoye Zheng //     role: KvmMmuPageRole, // 基本信息,包括硬件特性和所属层级等
2640314b30SXiaoye Zheng //     // spt: *mut u64, // spt: shadow page table,指向 struct page 的地址,其包含了所有页表项 (pte)。同时 page->private 会指向该 kvm_mmu_page
2740314b30SXiaoye Zheng // }
2840314b30SXiaoye Zheng 
2940314b30SXiaoye Zheng #[bitfield(u32)]
3040314b30SXiaoye Zheng pub struct KvmMmuPageRole {
3140314b30SXiaoye Zheng     #[bits(4)]
3240314b30SXiaoye Zheng     level: usize, // 页所处的层级
3340314b30SXiaoye Zheng     cr4_pae: bool, // cr4.pae,1 表示使用 64bit gpte
3440314b30SXiaoye Zheng     #[bits(2)]
3540314b30SXiaoye Zheng     quadrant: usize, // 如果 cr4.pae=0,则 gpte 为 32bit,但 spte 为 64bit,因此需要用多个 spte 来表示一个 gpte,该字段指示是 gpte 的第几块
3640314b30SXiaoye Zheng     direct: bool,
3740314b30SXiaoye Zheng     #[bits(3)]
3840314b30SXiaoye Zheng     access: usize, // 访问权限
3940314b30SXiaoye Zheng     invalid: bool,        // 失效,一旦 unpin 就会被销毁
4040314b30SXiaoye Zheng     nxe: bool,            // efer.nxe,不可执行
4140314b30SXiaoye Zheng     cr0_wp: bool,         // cr0.wp, 写保护
4240314b30SXiaoye Zheng     smep_andnot_wp: bool, // smep && !cr0.wp,SMEP启用,用户模式代码将无法执行位于内核地址空间中的指令。
4340314b30SXiaoye Zheng     smap_andnot_wp: bool, // smap && !cr0.wp
4440314b30SXiaoye Zheng     #[bits(8)]
4540314b30SXiaoye Zheng     unused: usize,
4640314b30SXiaoye Zheng     #[bits(8)]
4740314b30SXiaoye Zheng     smm: usize, // 1 表示处于 system management mode, 0 表示非 SMM
4840314b30SXiaoye Zheng }
4940314b30SXiaoye Zheng 
5040314b30SXiaoye Zheng //  We don't want allocation failures within the mmu code, so we preallocate
5140314b30SXiaoye Zheng // enough memory for a single page fault in a cache.
5240314b30SXiaoye Zheng // pub struct KvmMmuMemoryCache {
5340314b30SXiaoye Zheng //     num_objs: u32,
5440314b30SXiaoye Zheng //     objs: [*mut u8; KVM_NR_MEM_OBJS as usize],
5540314b30SXiaoye Zheng // }
5640314b30SXiaoye Zheng 
5740314b30SXiaoye Zheng #[derive(Default)]
5840314b30SXiaoye Zheng pub struct KvmMmu {
5940314b30SXiaoye Zheng     pub root_hpa: u64,
6040314b30SXiaoye Zheng     pub root_level: u32,
6140314b30SXiaoye Zheng     pub base_role: KvmMmuPageRole,
6240314b30SXiaoye Zheng     // ...还有一些变量不知道用来做什么
6340314b30SXiaoye Zheng     pub get_cr3: Option<fn(&VmxVcpu) -> u64>,
6440314b30SXiaoye Zheng     pub set_eptp: Option<fn(u64) -> Result<(), SystemError>>,
6540314b30SXiaoye Zheng     pub page_fault: Option<
6640314b30SXiaoye Zheng         fn(
6740314b30SXiaoye Zheng             vcpu: &mut VmxVcpu,
6840314b30SXiaoye Zheng             gpa: u64,
6940314b30SXiaoye Zheng             error_code: u32,
7040314b30SXiaoye Zheng             prefault: bool,
7140314b30SXiaoye Zheng         ) -> Result<(), SystemError>,
7240314b30SXiaoye Zheng     >,
7340314b30SXiaoye Zheng     // get_pdptr: Option<fn(& VmxVcpu, index:u32) -> u64>, // Page Directory Pointer Table Register?暂时不知道和CR3的区别是什么
7440314b30SXiaoye Zheng     // inject_page_fault: Option<fn(&mut VmxVcpu, fault: &X86Exception)>,
7540314b30SXiaoye Zheng     // gva_to_gpa: Option<fn(&mut VmxVcpu, gva: u64, access: u32, exception: &X86Exception) -> u64>,
7640314b30SXiaoye Zheng     // translate_gpa: Option<fn(&mut VmxVcpu, gpa: u64, access: u32, exception: &X86Exception) -> u64>,
7740314b30SXiaoye Zheng     // sync_page: Option<fn(&mut VmxVcpu, &mut KvmMmuPage)>,
7840314b30SXiaoye Zheng     // invlpg: Option<fn(&mut VmxVcpu, gva: u64)>, // invalid entry
7940314b30SXiaoye Zheng     // update_pte: Option<fn(&mut VmxVcpu, sp: &KvmMmuPage, spte: u64, pte: u64)>,
8040314b30SXiaoye Zheng }
8140314b30SXiaoye Zheng 
8240314b30SXiaoye Zheng impl core::fmt::Debug for KvmMmu {
8340314b30SXiaoye Zheng     fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
8440314b30SXiaoye Zheng         f.debug_struct("KvmMmu")
8540314b30SXiaoye Zheng             .field("root_hpa", &self.root_hpa)
8640314b30SXiaoye Zheng             .field("root_level", &self.root_level)
8740314b30SXiaoye Zheng             .field("base_role", &self.base_role)
8840314b30SXiaoye Zheng             .finish()
8940314b30SXiaoye Zheng     }
9040314b30SXiaoye Zheng }
9140314b30SXiaoye Zheng 
9240314b30SXiaoye Zheng fn tdp_get_cr3(_vcpu: &VmxVcpu) -> u64 {
9340314b30SXiaoye Zheng     let guest_cr3 = vmx_vmread(VmcsFields::GUEST_CR3 as u32).expect("Failed to read eptp");
9440314b30SXiaoye Zheng     return guest_cr3;
9540314b30SXiaoye Zheng }
9640314b30SXiaoye Zheng 
9740314b30SXiaoye Zheng fn tdp_set_eptp(root_hpa: u64) -> Result<(), SystemError> {
9840314b30SXiaoye Zheng     // 设置权限位,目前是写死的,可读可写可执行
9940314b30SXiaoye Zheng     //  EPT paging-structure memory type: Uncacheable
10040314b30SXiaoye Zheng     let mut eptp = 0x0 as u64;
10140314b30SXiaoye Zheng     // This value is 1 less than the EPT page-walk length.  3 means 4-level paging.
10240314b30SXiaoye Zheng     eptp |= 0x3 << 3;
10340314b30SXiaoye Zheng     eptp |= root_hpa & (PAGE_MASK as u64);
10440314b30SXiaoye Zheng     vmx_vmwrite(CTRL_EPTP_PTR as u32, eptp)?;
10540314b30SXiaoye Zheng     Ok(())
10640314b30SXiaoye Zheng }
10740314b30SXiaoye Zheng 
10840314b30SXiaoye Zheng fn tdp_page_fault(
10940314b30SXiaoye Zheng     vcpu: &mut VmxVcpu,
11040314b30SXiaoye Zheng     gpa: u64,
11140314b30SXiaoye Zheng     error_code: u32,
11240314b30SXiaoye Zheng     prefault: bool,
11340314b30SXiaoye Zheng ) -> Result<(), SystemError> {
11440314b30SXiaoye Zheng     kdebug!("tdp_page_fault");
11540314b30SXiaoye Zheng     let gfn = gpa >> PAGE_SHIFT; // 物理地址右移12位得到物理页框号(相对于虚拟机而言)
11640314b30SXiaoye Zheng                                  // 分配缓存池,为了避免在运行时分配空间失败,这里提前分配/填充足额的空间
11740314b30SXiaoye Zheng     mmu_topup_memory_caches(vcpu)?;
11840314b30SXiaoye Zheng     // TODO:获取gfn使用的level,处理hugepage的问题
11940314b30SXiaoye Zheng     let level = 1; // 4KB page
12040314b30SXiaoye Zheng                    // TODO: 快速处理由读写操作引起violation,即present同时有写权限的非mmio page fault
12140314b30SXiaoye Zheng                    // fast_page_fault(vcpu, gpa, level, error_code)
12240314b30SXiaoye Zheng                    // gfn->pfn
12340314b30SXiaoye Zheng     let mut map_writable = false;
12440314b30SXiaoye Zheng     let write = error_code & ((1 as u32) << 1);
12540314b30SXiaoye Zheng     let pfn = mmu_gfn_to_pfn_fast(vcpu, gpa, prefault, gfn, write == 0, &mut map_writable)?;
12640314b30SXiaoye Zheng     // direct map就是映射ept页表的过程
12740314b30SXiaoye Zheng     __direct_map(vcpu, gpa, write, map_writable, level, gfn, pfn, prefault)?;
12840314b30SXiaoye Zheng     Ok(())
12940314b30SXiaoye Zheng }
13040314b30SXiaoye Zheng 
/*
 * Calculate the number of MMU pages needed for KVM.
 */
13440314b30SXiaoye Zheng // pub fn kvm_mmu_calculate_mmu_pages() -> u32 {
13540314b30SXiaoye Zheng // 	let mut nr_mmu_pages:u32;
13640314b30SXiaoye Zheng //     let mut nr_pages = 0;
13740314b30SXiaoye Zheng 
13840314b30SXiaoye Zheng //     let kvm = vm(0).unwrap();
13940314b30SXiaoye Zheng //     for as_id in 0..KVM_ADDRESS_SPACE_NUM {
14040314b30SXiaoye Zheng //         let slots = kvm.memslots[as_id];
14140314b30SXiaoye Zheng //         for i in 0..KVM_MEM_SLOTS_NUM {
14240314b30SXiaoye Zheng //             let memslot = slots.memslots[i as usize];
14340314b30SXiaoye Zheng //             nr_pages += memslot.npages;
14440314b30SXiaoye Zheng //         }
14540314b30SXiaoye Zheng //     }
14640314b30SXiaoye Zheng 
14740314b30SXiaoye Zheng // 	nr_mmu_pages = (nr_pages as u32)* KVM_PERMILLE_MMU_PAGES / 1000;
14840314b30SXiaoye Zheng // 	nr_mmu_pages = nr_mmu_pages.max(KVM_MIN_ALLOC_MMU_PAGES);
14940314b30SXiaoye Zheng // 	return nr_mmu_pages;
15040314b30SXiaoye Zheng // }
15140314b30SXiaoye Zheng 
15240314b30SXiaoye Zheng // pub fn kvm_mmu_change_mmu_pages(mut goal_nr_mmu_pages: u32){
15340314b30SXiaoye Zheng //     let kvm = KVM();
15440314b30SXiaoye Zheng //     // 释放多余的mmu page
15540314b30SXiaoye Zheng //     if kvm.lock().arch.n_used_mmu_pages > goal_nr_mmu_pages {
15640314b30SXiaoye Zheng //         while kvm.lock().arch.n_used_mmu_pages > goal_nr_mmu_pages {
15740314b30SXiaoye Zheng //             if !prepare_zap_oldest_mmu_page() {
15840314b30SXiaoye Zheng //                 break;
15940314b30SXiaoye Zheng //             }
16040314b30SXiaoye Zheng //         }
16140314b30SXiaoye Zheng //         // kvm_mmu_commit_zap_page();
16240314b30SXiaoye Zheng //         goal_nr_mmu_pages = kvm.lock().arch.n_used_mmu_pages;
16340314b30SXiaoye Zheng 
16440314b30SXiaoye Zheng //     }
16540314b30SXiaoye Zheng //     kvm.lock().arch.n_max_mmu_pages = goal_nr_mmu_pages;
16640314b30SXiaoye Zheng // }
16740314b30SXiaoye Zheng 
16840314b30SXiaoye Zheng // pub fn prepare_zap_oldest_mmu_page() -> bool {
16940314b30SXiaoye Zheng //     return false;
17040314b30SXiaoye Zheng // }
17140314b30SXiaoye Zheng 
17240314b30SXiaoye Zheng pub fn kvm_mmu_setup(vcpu: &Mutex<VmxVcpu>) {
17340314b30SXiaoye Zheng     // TODO: init_kvm_softmmu(vcpu), init_kvm_nested_mmu(vcpu)
17440314b30SXiaoye Zheng     init_kvm_tdp_mmu(vcpu);
17540314b30SXiaoye Zheng }
17640314b30SXiaoye Zheng 
17740314b30SXiaoye Zheng pub fn kvm_vcpu_mtrr_init(_vcpu: &Mutex<VmxVcpu>) -> Result<(), SystemError> {
17840314b30SXiaoye Zheng     check_ept_features()?;
17940314b30SXiaoye Zheng     Ok(())
18040314b30SXiaoye Zheng }
18140314b30SXiaoye Zheng 
18240314b30SXiaoye Zheng pub fn init_kvm_tdp_mmu(vcpu: &Mutex<VmxVcpu>) {
18340314b30SXiaoye Zheng     let context = &mut vcpu.lock().mmu;
18440314b30SXiaoye Zheng     context.page_fault = Some(tdp_page_fault);
18540314b30SXiaoye Zheng     context.get_cr3 = Some(tdp_get_cr3);
18640314b30SXiaoye Zheng     context.set_eptp = Some(tdp_set_eptp);
18740314b30SXiaoye Zheng     // context.inject_page_fault = kvm_inject_page_fault; TODO: inject_page_fault
18840314b30SXiaoye Zheng     // context.invlpg = nonpaging_invlpg;
18940314b30SXiaoye Zheng     // context.sync_page = nonpaging_sync_page;
19040314b30SXiaoye Zheng     // context.update_pte = nonpaging_update_pte;
19140314b30SXiaoye Zheng 
19240314b30SXiaoye Zheng     // TODO: gva to gpa in kvm
19340314b30SXiaoye Zheng     // if !is_paging(vcpu) { // vcpu不分页
19440314b30SXiaoye Zheng     //     context.gva_to_gpa = nonpaging_gva_to_gpa;
19540314b30SXiaoye Zheng     // 	context.root_level = 0;
19640314b30SXiaoye Zheng     // } else if (is_long_mode(vcpu)) {
19740314b30SXiaoye Zheng     // 	context.gva_to_gpa = paging64_gva_to_gpa;
19840314b30SXiaoye Zheng     // 	context.root_level = PT64_ROOT_LEVEL;
19940314b30SXiaoye Zheng     // TODO:: different paging strategy
20040314b30SXiaoye Zheng     // } else if (is_pae(vcpu)) {
20140314b30SXiaoye Zheng     //     context.gva_to_gpa = paging64_gva_to_gpa;
20240314b30SXiaoye Zheng     //     context.root_level = PT32E_ROOT_LEVEL;
20340314b30SXiaoye Zheng     // } else {
20440314b30SXiaoye Zheng     //     context.gva_to_gpa = paging32_gva_to_gpa;
20540314b30SXiaoye Zheng     //     context.root_level = PT32_ROOT_LEVEL;
20640314b30SXiaoye Zheng     // }
20740314b30SXiaoye Zheng }
20840314b30SXiaoye Zheng 
20940314b30SXiaoye Zheng pub fn __direct_map(
21040314b30SXiaoye Zheng     vcpu: &mut VmxVcpu,
21140314b30SXiaoye Zheng     gpa: u64,
21240314b30SXiaoye Zheng     _write: u32,
21340314b30SXiaoye Zheng     _map_writable: bool,
21440314b30SXiaoye Zheng     _level: i32,
21540314b30SXiaoye Zheng     _gfn: u64,
21640314b30SXiaoye Zheng     pfn: u64,
21740314b30SXiaoye Zheng     _prefault: bool,
21840314b30SXiaoye Zheng ) -> Result<u32, SystemError> {
21940314b30SXiaoye Zheng     kdebug!("gpa={}, pfn={}, root_hpa={:x}", gpa, pfn, vcpu.mmu.root_hpa);
22040314b30SXiaoye Zheng     // 判断vcpu.mmu.root_hpa是否有效
22140314b30SXiaoye Zheng     if vcpu.mmu.root_hpa == 0 {
22240314b30SXiaoye Zheng         return Err(SystemError::KVM_HVA_ERR_BAD);
22340314b30SXiaoye Zheng     }
22440314b30SXiaoye Zheng     // 把gpa映射到hpa
22540314b30SXiaoye Zheng     let mut ept_mapper = EptMapper::lock();
22640314b30SXiaoye Zheng     let page_flags = PageFlags::from_prot_flags(ProtFlags::from_bits_truncate(0x7 as u64), false);
22740314b30SXiaoye Zheng     unsafe {
22840314b30SXiaoye Zheng         assert!(ept_mapper.walk(gpa, pfn << PAGE_SHIFT, page_flags).is_ok());
22940314b30SXiaoye Zheng     }
23040314b30SXiaoye Zheng     drop(ept_mapper);
23140314b30SXiaoye Zheng     return Ok(0);
23240314b30SXiaoye Zheng }
23340314b30SXiaoye Zheng 
23440314b30SXiaoye Zheng pub fn mmu_gfn_to_pfn_fast(
23540314b30SXiaoye Zheng     vcpu: &mut VmxVcpu,
23640314b30SXiaoye Zheng     _gpa: u64,
23740314b30SXiaoye Zheng     _prefault: bool,
23840314b30SXiaoye Zheng     gfn: u64,
23940314b30SXiaoye Zheng     write: bool,
24040314b30SXiaoye Zheng     writable: &mut bool,
24140314b30SXiaoye Zheng ) -> Result<u64, SystemError> {
24240314b30SXiaoye Zheng     let slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
24340314b30SXiaoye Zheng     let pfn = __gfn_to_pfn(slot, gfn, false, write, writable)?;
24440314b30SXiaoye Zheng     Ok(pfn)
24540314b30SXiaoye Zheng }
24640314b30SXiaoye Zheng 
24740314b30SXiaoye Zheng // TODO: 添加cache
24840314b30SXiaoye Zheng pub fn mmu_topup_memory_caches(_vcpu: &mut VmxVcpu) -> Result<(), SystemError> {
24940314b30SXiaoye Zheng     // 如果 vcpu->arch.mmu_page_header_cache 不足,从 mmu_page_header_cache 中分配
25040314b30SXiaoye Zheng     // pte_list_desc_cache 和 mmu_page_header_cache 两块全局 slab cache 在 kvm_mmu_module_init 中被创建
25140314b30SXiaoye Zheng     // mmu_topup_memory_cache(vcpu.mmu_page_header_cache,
25240314b30SXiaoye Zheng     //     mmu_page_header_cache, 4);
25340314b30SXiaoye Zheng     Ok(())
25440314b30SXiaoye Zheng }
255