140314b30SXiaoye Zheng use crate::{ 240314b30SXiaoye Zheng arch::kvm::vmx::ept::EptMapper, 340314b30SXiaoye Zheng kdebug, 440314b30SXiaoye Zheng libs::mutex::Mutex, 540314b30SXiaoye Zheng mm::{page::PageFlags, syscall::ProtFlags}, 640314b30SXiaoye Zheng virt::kvm::host_mem::{__gfn_to_pfn, kvm_vcpu_gfn_to_memslot, PAGE_MASK, PAGE_SHIFT}, 740314b30SXiaoye Zheng }; 840314b30SXiaoye Zheng use bitfield_struct::bitfield; 9*91e9d4abSLoGin use system_error::SystemError; 1040314b30SXiaoye Zheng 1140314b30SXiaoye Zheng use super::{ 1240314b30SXiaoye Zheng ept::check_ept_features, 1340314b30SXiaoye Zheng vcpu::VmxVcpu, 1440314b30SXiaoye Zheng vmcs::VmcsFields, 1540314b30SXiaoye Zheng vmx_asm_wrapper::{vmx_vmread, vmx_vmwrite}, 1640314b30SXiaoye Zheng }; 1740314b30SXiaoye Zheng use crate::arch::kvm::vmx::mmu::VmcsFields::CTRL_EPTP_PTR; 1840314b30SXiaoye Zheng 1940314b30SXiaoye Zheng // pub const PT64_ROOT_LEVEL: u32 = 4; 2040314b30SXiaoye Zheng // pub const PT32_ROOT_LEVEL: u32 = 2; 2140314b30SXiaoye Zheng // pub const PT32E_ROOT_LEVEL: u32 = 3; 2240314b30SXiaoye Zheng 2340314b30SXiaoye Zheng // pub struct KvmMmuPage{ 2440314b30SXiaoye Zheng // gfn: u64, // 管理地址范围的起始地址对应的 gfn 2540314b30SXiaoye Zheng // role: KvmMmuPageRole, // 基本信息,包括硬件特性和所属层级等 2640314b30SXiaoye Zheng // // spt: *mut u64, // spt: shadow page table,指向 struct page 的地址,其包含了所有页表项 (pte)。同时 page->private 会指向该 kvm_mmu_page 2740314b30SXiaoye Zheng // } 2840314b30SXiaoye Zheng 2940314b30SXiaoye Zheng #[bitfield(u32)] 3040314b30SXiaoye Zheng pub struct KvmMmuPageRole { 3140314b30SXiaoye Zheng #[bits(4)] 3240314b30SXiaoye Zheng level: usize, // 页所处的层级 3340314b30SXiaoye Zheng cr4_pae: bool, // cr4.pae,1 表示使用 64bit gpte 3440314b30SXiaoye Zheng #[bits(2)] 3540314b30SXiaoye Zheng quadrant: usize, // 如果 cr4.pae=0,则 gpte 为 32bit,但 spte 为 64bit,因此需要用多个 spte 来表示一个 gpte,该字段指示是 gpte 的第几块 3640314b30SXiaoye Zheng direct: bool, 3740314b30SXiaoye Zheng #[bits(3)] 3840314b30SXiaoye Zheng access: usize, // 访问权限 3940314b30SXiaoye Zheng invalid: bool, // 失效,一旦 unpin 就会被销毁 4040314b30SXiaoye Zheng nxe: bool, // efer.nxe,不可执行 4140314b30SXiaoye Zheng cr0_wp: bool, // cr0.wp, 写保护 4240314b30SXiaoye Zheng smep_andnot_wp: bool, // smep && !cr0.wp,SMEP启用,用户模式代码将无法执行位于内核地址空间中的指令。 4340314b30SXiaoye Zheng smap_andnot_wp: bool, // smap && !cr0.wp 4440314b30SXiaoye Zheng #[bits(8)] 4540314b30SXiaoye Zheng unused: usize, 4640314b30SXiaoye Zheng #[bits(8)] 4740314b30SXiaoye Zheng smm: usize, // 1 表示处于 system management mode, 0 表示非 SMM 4840314b30SXiaoye Zheng } 4940314b30SXiaoye Zheng 5040314b30SXiaoye Zheng // We don't want allocation failures within the mmu code, so we preallocate 5140314b30SXiaoye Zheng // enough memory for a single page fault in a cache. 5240314b30SXiaoye Zheng // pub struct KvmMmuMemoryCache { 5340314b30SXiaoye Zheng // num_objs: u32, 5440314b30SXiaoye Zheng // objs: [*mut u8; KVM_NR_MEM_OBJS as usize], 5540314b30SXiaoye Zheng // } 5640314b30SXiaoye Zheng 5740314b30SXiaoye Zheng #[derive(Default)] 5840314b30SXiaoye Zheng pub struct KvmMmu { 5940314b30SXiaoye Zheng pub root_hpa: u64, 6040314b30SXiaoye Zheng pub root_level: u32, 6140314b30SXiaoye Zheng pub base_role: KvmMmuPageRole, 6240314b30SXiaoye Zheng // ...还有一些变量不知道用来做什么 6340314b30SXiaoye Zheng pub get_cr3: Option<fn(&VmxVcpu) -> u64>, 6440314b30SXiaoye Zheng pub set_eptp: Option<fn(u64) -> Result<(), SystemError>>, 6540314b30SXiaoye Zheng pub page_fault: Option< 6640314b30SXiaoye Zheng fn( 6740314b30SXiaoye Zheng vcpu: &mut VmxVcpu, 6840314b30SXiaoye Zheng gpa: u64, 6940314b30SXiaoye Zheng error_code: u32, 7040314b30SXiaoye Zheng prefault: bool, 7140314b30SXiaoye Zheng ) -> Result<(), SystemError>, 7240314b30SXiaoye Zheng >, 7340314b30SXiaoye Zheng // get_pdptr: Option<fn(& VmxVcpu, index:u32) -> u64>, // Page Directory Pointer Table Register?暂时不知道和CR3的区别是什么 7440314b30SXiaoye Zheng // inject_page_fault: Option<fn(&mut VmxVcpu, fault: &X86Exception)>, 7540314b30SXiaoye Zheng // gva_to_gpa: Option<fn(&mut VmxVcpu, gva: u64, access: u32, exception: &X86Exception) -> u64>, 7640314b30SXiaoye Zheng // translate_gpa: Option<fn(&mut VmxVcpu, gpa: u64, access: u32, exception: &X86Exception) -> u64>, 7740314b30SXiaoye Zheng // sync_page: Option<fn(&mut VmxVcpu, &mut KvmMmuPage)>, 7840314b30SXiaoye Zheng // invlpg: Option<fn(&mut VmxVcpu, gva: u64)>, // invalid entry 7940314b30SXiaoye Zheng // update_pte: Option<fn(&mut VmxVcpu, sp: &KvmMmuPage, spte: u64, pte: u64)>, 8040314b30SXiaoye Zheng } 8140314b30SXiaoye Zheng 8240314b30SXiaoye Zheng impl core::fmt::Debug for KvmMmu { 8340314b30SXiaoye Zheng fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { 8440314b30SXiaoye Zheng f.debug_struct("KvmMmu") 8540314b30SXiaoye Zheng .field("root_hpa", &self.root_hpa) 8640314b30SXiaoye Zheng .field("root_level", &self.root_level) 8740314b30SXiaoye Zheng .field("base_role", &self.base_role) 8840314b30SXiaoye Zheng .finish() 8940314b30SXiaoye Zheng } 9040314b30SXiaoye Zheng } 9140314b30SXiaoye Zheng 9240314b30SXiaoye Zheng fn tdp_get_cr3(_vcpu: &VmxVcpu) -> u64 { 9340314b30SXiaoye Zheng let guest_cr3 = vmx_vmread(VmcsFields::GUEST_CR3 as u32).expect("Failed to read eptp"); 9440314b30SXiaoye Zheng return guest_cr3; 9540314b30SXiaoye Zheng } 9640314b30SXiaoye Zheng 9740314b30SXiaoye Zheng fn tdp_set_eptp(root_hpa: u64) -> Result<(), SystemError> { 9840314b30SXiaoye Zheng // 设置权限位,目前是写死的,可读可写可执行 9940314b30SXiaoye Zheng // EPT paging-structure memory type: Uncacheable 10040314b30SXiaoye Zheng let mut eptp = 0x0 as u64; 10140314b30SXiaoye Zheng // This value is 1 less than the EPT page-walk length. 3 means 4-level paging. 10240314b30SXiaoye Zheng eptp |= 0x3 << 3; 10340314b30SXiaoye Zheng eptp |= root_hpa & (PAGE_MASK as u64); 10440314b30SXiaoye Zheng vmx_vmwrite(CTRL_EPTP_PTR as u32, eptp)?; 10540314b30SXiaoye Zheng Ok(()) 10640314b30SXiaoye Zheng } 10740314b30SXiaoye Zheng 10840314b30SXiaoye Zheng fn tdp_page_fault( 10940314b30SXiaoye Zheng vcpu: &mut VmxVcpu, 11040314b30SXiaoye Zheng gpa: u64, 11140314b30SXiaoye Zheng error_code: u32, 11240314b30SXiaoye Zheng prefault: bool, 11340314b30SXiaoye Zheng ) -> Result<(), SystemError> { 11440314b30SXiaoye Zheng kdebug!("tdp_page_fault"); 11540314b30SXiaoye Zheng let gfn = gpa >> PAGE_SHIFT; // 物理地址右移12位得到物理页框号(相对于虚拟机而言) 11640314b30SXiaoye Zheng // 分配缓存池,为了避免在运行时分配空间失败,这里提前分配/填充足额的空间 11740314b30SXiaoye Zheng mmu_topup_memory_caches(vcpu)?; 11840314b30SXiaoye Zheng // TODO:获取gfn使用的level,处理hugepage的问题 11940314b30SXiaoye Zheng let level = 1; // 4KB page 12040314b30SXiaoye Zheng // TODO: 快速处理由读写操作引起violation,即present同时有写权限的非mmio page fault 12140314b30SXiaoye Zheng // fast_page_fault(vcpu, gpa, level, error_code) 12240314b30SXiaoye Zheng // gfn->pfn 12340314b30SXiaoye Zheng let mut map_writable = false; 12440314b30SXiaoye Zheng let write = error_code & ((1 as u32) << 1); 12540314b30SXiaoye Zheng let pfn = mmu_gfn_to_pfn_fast(vcpu, gpa, prefault, gfn, write == 0, &mut map_writable)?; 12640314b30SXiaoye Zheng // direct map就是映射ept页表的过程 12740314b30SXiaoye Zheng __direct_map(vcpu, gpa, write, map_writable, level, gfn, pfn, prefault)?; 12840314b30SXiaoye Zheng Ok(()) 12940314b30SXiaoye Zheng } 13040314b30SXiaoye Zheng 13140314b30SXiaoye Zheng /* 13240314b30SXiaoye Zheng * Caculate mmu pages needed for kvm. 13340314b30SXiaoye Zheng */ 13440314b30SXiaoye Zheng // pub fn kvm_mmu_calculate_mmu_pages() -> u32 { 13540314b30SXiaoye Zheng // let mut nr_mmu_pages:u32; 13640314b30SXiaoye Zheng // let mut nr_pages = 0; 13740314b30SXiaoye Zheng 13840314b30SXiaoye Zheng // let kvm = vm(0).unwrap(); 13940314b30SXiaoye Zheng // for as_id in 0..KVM_ADDRESS_SPACE_NUM { 14040314b30SXiaoye Zheng // let slots = kvm.memslots[as_id]; 14140314b30SXiaoye Zheng // for i in 0..KVM_MEM_SLOTS_NUM { 14240314b30SXiaoye Zheng // let memslot = slots.memslots[i as usize]; 14340314b30SXiaoye Zheng // nr_pages += memslot.npages; 14440314b30SXiaoye Zheng // } 14540314b30SXiaoye Zheng // } 14640314b30SXiaoye Zheng 14740314b30SXiaoye Zheng // nr_mmu_pages = (nr_pages as u32)* KVM_PERMILLE_MMU_PAGES / 1000; 14840314b30SXiaoye Zheng // nr_mmu_pages = nr_mmu_pages.max(KVM_MIN_ALLOC_MMU_PAGES); 14940314b30SXiaoye Zheng // return nr_mmu_pages; 15040314b30SXiaoye Zheng // } 15140314b30SXiaoye Zheng 15240314b30SXiaoye Zheng // pub fn kvm_mmu_change_mmu_pages(mut goal_nr_mmu_pages: u32){ 15340314b30SXiaoye Zheng // let kvm = KVM(); 15440314b30SXiaoye Zheng // // 释放多余的mmu page 15540314b30SXiaoye Zheng // if kvm.lock().arch.n_used_mmu_pages > goal_nr_mmu_pages { 15640314b30SXiaoye Zheng // while kvm.lock().arch.n_used_mmu_pages > goal_nr_mmu_pages { 15740314b30SXiaoye Zheng // if !prepare_zap_oldest_mmu_page() { 15840314b30SXiaoye Zheng // break; 15940314b30SXiaoye Zheng // } 16040314b30SXiaoye Zheng // } 16140314b30SXiaoye Zheng // // kvm_mmu_commit_zap_page(); 16240314b30SXiaoye Zheng // goal_nr_mmu_pages = kvm.lock().arch.n_used_mmu_pages; 16340314b30SXiaoye Zheng 16440314b30SXiaoye Zheng // } 16540314b30SXiaoye Zheng // kvm.lock().arch.n_max_mmu_pages = goal_nr_mmu_pages; 16640314b30SXiaoye Zheng // } 16740314b30SXiaoye Zheng 16840314b30SXiaoye Zheng // pub fn prepare_zap_oldest_mmu_page() -> bool { 16940314b30SXiaoye Zheng // return false; 17040314b30SXiaoye Zheng // } 17140314b30SXiaoye Zheng 17240314b30SXiaoye Zheng pub fn kvm_mmu_setup(vcpu: &Mutex<VmxVcpu>) { 17340314b30SXiaoye Zheng // TODO: init_kvm_softmmu(vcpu), init_kvm_nested_mmu(vcpu) 17440314b30SXiaoye Zheng init_kvm_tdp_mmu(vcpu); 17540314b30SXiaoye Zheng } 17640314b30SXiaoye Zheng 17740314b30SXiaoye Zheng pub fn kvm_vcpu_mtrr_init(_vcpu: &Mutex<VmxVcpu>) -> Result<(), SystemError> { 17840314b30SXiaoye Zheng check_ept_features()?; 17940314b30SXiaoye Zheng Ok(()) 18040314b30SXiaoye Zheng } 18140314b30SXiaoye Zheng 18240314b30SXiaoye Zheng pub fn init_kvm_tdp_mmu(vcpu: &Mutex<VmxVcpu>) { 18340314b30SXiaoye Zheng let context = &mut vcpu.lock().mmu; 18440314b30SXiaoye Zheng context.page_fault = Some(tdp_page_fault); 18540314b30SXiaoye Zheng context.get_cr3 = Some(tdp_get_cr3); 18640314b30SXiaoye Zheng context.set_eptp = Some(tdp_set_eptp); 18740314b30SXiaoye Zheng // context.inject_page_fault = kvm_inject_page_fault; TODO: inject_page_fault 18840314b30SXiaoye Zheng // context.invlpg = nonpaging_invlpg; 18940314b30SXiaoye Zheng // context.sync_page = nonpaging_sync_page; 19040314b30SXiaoye Zheng // context.update_pte = nonpaging_update_pte; 19140314b30SXiaoye Zheng 19240314b30SXiaoye Zheng // TODO: gva to gpa in kvm 19340314b30SXiaoye Zheng // if !is_paging(vcpu) { // vcpu不分页 19440314b30SXiaoye Zheng // context.gva_to_gpa = nonpaging_gva_to_gpa; 19540314b30SXiaoye Zheng // context.root_level = 0; 19640314b30SXiaoye Zheng // } else if (is_long_mode(vcpu)) { 19740314b30SXiaoye Zheng // context.gva_to_gpa = paging64_gva_to_gpa; 19840314b30SXiaoye Zheng // context.root_level = PT64_ROOT_LEVEL; 19940314b30SXiaoye Zheng // TODO:: different paging strategy 20040314b30SXiaoye Zheng // } else if (is_pae(vcpu)) { 20140314b30SXiaoye Zheng // context.gva_to_gpa = paging64_gva_to_gpa; 20240314b30SXiaoye Zheng // context.root_level = PT32E_ROOT_LEVEL; 20340314b30SXiaoye Zheng // } else { 20440314b30SXiaoye Zheng // context.gva_to_gpa = paging32_gva_to_gpa; 20540314b30SXiaoye Zheng // context.root_level = PT32_ROOT_LEVEL; 20640314b30SXiaoye Zheng // } 20740314b30SXiaoye Zheng } 20840314b30SXiaoye Zheng 20940314b30SXiaoye Zheng pub fn __direct_map( 21040314b30SXiaoye Zheng vcpu: &mut VmxVcpu, 21140314b30SXiaoye Zheng gpa: u64, 21240314b30SXiaoye Zheng _write: u32, 21340314b30SXiaoye Zheng _map_writable: bool, 21440314b30SXiaoye Zheng _level: i32, 21540314b30SXiaoye Zheng _gfn: u64, 21640314b30SXiaoye Zheng pfn: u64, 21740314b30SXiaoye Zheng _prefault: bool, 21840314b30SXiaoye Zheng ) -> Result<u32, SystemError> { 21940314b30SXiaoye Zheng kdebug!("gpa={}, pfn={}, root_hpa={:x}", gpa, pfn, vcpu.mmu.root_hpa); 22040314b30SXiaoye Zheng // 判断vcpu.mmu.root_hpa是否有效 22140314b30SXiaoye Zheng if vcpu.mmu.root_hpa == 0 { 22240314b30SXiaoye Zheng return Err(SystemError::KVM_HVA_ERR_BAD); 22340314b30SXiaoye Zheng } 22440314b30SXiaoye Zheng // 把gpa映射到hpa 22540314b30SXiaoye Zheng let mut ept_mapper = EptMapper::lock(); 22640314b30SXiaoye Zheng let page_flags = PageFlags::from_prot_flags(ProtFlags::from_bits_truncate(0x7 as u64), false); 22740314b30SXiaoye Zheng unsafe { 22840314b30SXiaoye Zheng assert!(ept_mapper.walk(gpa, pfn << PAGE_SHIFT, page_flags).is_ok()); 22940314b30SXiaoye Zheng } 23040314b30SXiaoye Zheng drop(ept_mapper); 23140314b30SXiaoye Zheng return Ok(0); 23240314b30SXiaoye Zheng } 23340314b30SXiaoye Zheng 23440314b30SXiaoye Zheng pub fn mmu_gfn_to_pfn_fast( 23540314b30SXiaoye Zheng vcpu: &mut VmxVcpu, 23640314b30SXiaoye Zheng _gpa: u64, 23740314b30SXiaoye Zheng _prefault: bool, 23840314b30SXiaoye Zheng gfn: u64, 23940314b30SXiaoye Zheng write: bool, 24040314b30SXiaoye Zheng writable: &mut bool, 24140314b30SXiaoye Zheng ) -> Result<u64, SystemError> { 24240314b30SXiaoye Zheng let slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); 24340314b30SXiaoye Zheng let pfn = __gfn_to_pfn(slot, gfn, false, write, writable)?; 24440314b30SXiaoye Zheng Ok(pfn) 24540314b30SXiaoye Zheng } 24640314b30SXiaoye Zheng 24740314b30SXiaoye Zheng // TODO: 添加cache 24840314b30SXiaoye Zheng pub fn mmu_topup_memory_caches(_vcpu: &mut VmxVcpu) -> Result<(), SystemError> { 24940314b30SXiaoye Zheng // 如果 vcpu->arch.mmu_page_header_cache 不足,从 mmu_page_header_cache 中分配 25040314b30SXiaoye Zheng // pte_list_desc_cache 和 mmu_page_header_cache 两块全局 slab cache 在 kvm_mmu_module_init 中被创建 25140314b30SXiaoye Zheng // mmu_topup_memory_cache(vcpu.mmu_page_header_cache, 25240314b30SXiaoye Zheng // mmu_page_header_cache, 4); 25340314b30SXiaoye Zheng Ok(()) 25440314b30SXiaoye Zheng } 255